# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

## Split Data into X-matrix and y-vector

#### Load the sample data

In [2]:
# Read the sample dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("./Data Files/Social_Network_Ads.csv")
# Last expression of the cell, so the first five rows are rendered as the output.
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


#### Show the column names to easily copy/paste in the next step

In [22]:
# Display the column labels of `data` (handy for copy/pasting into the selection below).
data.columns

Index(['User ID', 'Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')

#### Split the data into X and y

In [23]:
# Feature matrix: the three predictor columns, selected by label.
X = data[["Gender", "Age", "EstimatedSalary"]]
# Target: a list selector keeps `y` as a one-column DataFrame rather than a Series.
y = data.loc[:, ["Purchased"]]

## Partition Data into Training Set and Test Set

In [24]:
# Reload the dataset so this section can be run on its own.
data = pd.read_csv("./Data Files/Social_Network_Ads.csv")
# Predictors as a DataFrame; target as a one-column DataFrame (list selector).
X = data.loc[:, ["Gender", "Age", "EstimatedSalary"]]
y = data.loc[:, ["Purchased"]]

#### Split into train and test sets

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the partition) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1111,
)

## Center and Scale Variables

In [26]:
# Load the scaling example. Note that only `X` is rebound here: this file is
# used without a target, so no new `y` is created in this section.
X = pd.read_csv("./Data Files/Scale_Test_File.csv")
from sklearn.model_selection import train_test_split

# Same 75/25 partition and seed as the previous section, applied to X alone.
X_train, X_test = train_test_split(
    X,
    test_size=0.25,
    random_state=1111,
)

#### Scale X

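The loop below scales only `float64` and `int64` columns. Dummy variables created with `pd.get_dummies` are skipped because their dtype is `uint8` (`bool` in pandas 2.0 and later), but 0/1 columns read from a CSV file are typically `int64` and would be scaled, so check `X_train.dtypes` first.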

In [27]:
# Standardize the numeric predictors: subtract the training mean and divide by
# the training standard deviation. The test set is transformed with the
# *training* statistics so that no information from the test set leaks in.
# Work on copies so the unscaled frames remain available.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
for col in X_train.columns:
    # Only float64/int64 columns are scaled; every other dtype is left untouched.
    if X_train[col].dtype in ["float64", "int64"]:
        mean_X_train = X_train[col].mean()
        # pandas' .std() is the sample standard deviation (ddof=1).
        std_X_train = X_train[col].std()
        # A constant column has std 0 (and a one-row training set gives NaN);
        # dividing by that would fill the scaled column with NaN/inf. Fall back
        # to a divisor of 1 so such a column is centred but not rescaled.
        if pd.isna(std_X_train) or std_X_train == 0:
            std_X_train = 1.0
        X_train_scaled[col] = (X_train[col] - mean_X_train) / std_X_train
        X_test_scaled[col] = (X_test[col] - mean_X_train) / std_X_train

#### Scale y (if necessary; note: y does not exist in this example)

In [28]:
# NOTE(review): `y_train` / `y_test` are not created in this section. On a
# top-to-bottom run they are whatever the earlier train/test split left in the
# kernel, so verify they belong with the X being scaled before using the result.
# The training mean and standard deviation are applied to both partitions.
mean_y_train = y_train.mean()
std_y_train = y_train.std()
y_train_scaled = y_train.sub(mean_y_train).div(std_y_train)
y_test_scaled = y_test.sub(mean_y_train).div(std_y_train)

## Encode Dummy Variables

#### Load the sample data

In [10]:
# Load the dummy-variable example and separate predictors from the target.
data = pd.read_csv("./Data Files/Dummy_Vars_Test_File.csv")
# Predictors: a list selector returns a DataFrame.
X = data.loc[:, ["Wins", "Country", "Gender", "Location"]]
# Target: a single label returns a Series.
y = data.loc[:, "Outcome"]

#### Create the dummy variables for X

In [8]:
# One-hot encode the categorical predictors and swap them in for the originals.
# drop_first=True omits one level per variable so the remaining dummies are not
# perfectly collinear.
categorical_cols = ["Country", "Gender", "Location"]
# Passing `columns=` forces every listed column to be encoded. Without it,
# get_dummies only converts object/string/category columns and passes anything
# else through unchanged, which would then be removed together with the
# original by the drop below.
X_dummies = pd.get_dummies(X[categorical_cols], columns=categorical_cols, drop_first=True)
# Drop the raw categorical columns first, then append the dummies, so the
# remaining predictors keep their position ahead of the new columns.
# This cell rebinds `X`; re-running it without re-running the load cell raises a
# KeyError because `X` no longer contains the categorical columns.
X = pd.concat([X.drop(columns=categorical_cols), X_dummies], axis=1)

#### Create the dummy variable for y (if necessary)

In [5]:
# Turn the categorical target into an indicator column. With drop_first=True the
# first level is omitted from the dummy frame.
y_dummies = pd.get_dummies(
    y,
    drop_first=True,
)
# Keep the first remaining dummy column as the new target Series. If the target
# has more than two levels, the other dummy columns are not carried into `y`.
y = y_dummies.iloc[:, 0]

## Missing Data

#### Load the sample dataset

In [32]:
# Read the missing-values example; the path is relative to the notebook's working directory.
data = pd.read_csv("./Data Files/Missing_Values_Test_File.csv")
# Last expression of the cell, so the first five rows are rendered as the output.
data.head()

Unnamed: 0,YearsExperience,Salary,Gender
0,1.1,39343.0,Female
1,,46205.0,Male
2,1.5,37731.0,
3,2.0,,Female
4,3.7,39891.0,Male


#### Count the missing values in each column

In [33]:
# Per-column count of missing entries (isna is the same check as isnull).
data.isna().sum()

YearsExperience    3
Salary             2
Gender             4
dtype: int64

#### Show rows with missing data from specified column

In [34]:
# Boolean-mask row selection: keep only the rows whose Gender value is missing.
data.loc[data["Gender"].isna()]

Unnamed: 0,YearsExperience,Salary,Gender
2,1.5,37731.0,
9,3.7,57189.0,
16,5.1,66029.0,
24,8.7,109431.0,


#### Remove all rows with any missing values

In [35]:
# Drop every row that has at least one missing value (the defaults, spelled out).
# dropna returns a new frame; `data` itself is left unchanged.
data.dropna(axis=0, how="any").head()

Unnamed: 0,YearsExperience,Salary,Gender
0,1.1,39343.0,Female
4,3.7,39891.0,Male
5,2.2,56642.0,Male
6,3.0,60150.0,Female
7,3.2,54445.0,Female


#### Remove all columns with any missing values

In [36]:
# Drop every column that has at least one missing value. In this sample all
# three columns contain missing values, so only the row index is left.
data.dropna(axis="columns").head()

0
1
2
3
4


#### Impute missing numeric values (using mean or median)

In [37]:
# Replace missing YearsExperience values with the column median (swap in .mean()
# to impute with the mean instead). fillna returns a new Series; the result is
# only displayed here, so assign it back if the imputed values should be kept.
data["YearsExperience"].fillna(data["YearsExperience"].median())

0      1.1
1      4.9
2      1.5
3      2.0
4      3.7
5      2.2
6      3.0
7      3.2
8      3.2
9      3.7
10     3.9
11     4.0
12     4.0
13     4.1
14     4.9
15     4.9
16     5.1
17     5.3
18     5.9
19     6.0
20     4.9
21     7.1
22     7.9
23     8.2
24     8.7
25     9.0
26     9.5
27     9.6
28    10.3
29    10.5
Name: YearsExperience, dtype: float64

#### Impute missing categorical values (using most common category)

In [38]:
# value_counts() orders categories from most to least frequent, so its first
# index label is the most common category; use it to fill the missing entries.
# fillna returns a new Series; assign it back if the imputed values should be kept.
data["Gender"].fillna(data["Gender"].value_counts().index[0])

0     Female
1       Male
2     Female
3     Female
4       Male
5       Male
6     Female
7     Female
8       Male
9     Female
10    Female
11    Female
12    Female
13      Male
14    Female
15      Male
16    Female
17    Female
18      Male
19    Female
20    Female
21      Male
22    Female
23      Male
24    Female
25      Male
26    Female
27      Male
28    Female
29    Female
Name: Gender, dtype: object