# Data Preprocessing Tools

## Importing the libraries

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Important classes
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

## Importing the dataset

In [4]:
!wget https://raw.githubusercontent.com/coyoacan/Practice-in-Colab/refs/heads/main/source-files/Social_Network_Ads_NaNs.csv
dataset = pd.read_csv('Social_Network_Ads_NaNs.csv')
dataset.info() # We can also use data.describe() to visualize statistics from numerical variables.

--2024-10-18 00:09:19--  https://raw.githubusercontent.com/coyoacan/Practice-in-Colab/refs/heads/main/source-files/Social_Network_Ads_NaNs.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10677 (10K) [text/plain]
Saving to: ‘Social_Network_Ads_NaNs.csv’


2024-10-18 00:09:19 (101 MB/s) - ‘Social_Network_Ads_NaNs.csv’ saved [10677/10677]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   User ID          400 non-null    int64  
 1   Gender           400 non-null    object 
 2   Age              375 non-null    float64
 3   EstimatedSalary  361 non-null    float64
 4   Purchased        400 non-null    int64  

## Identifying numerical and categorical data

In [5]:
cols=dataset.columns
num_cols = dataset._get_numeric_data().columns
print("these columns are numerical ---> ", num_cols)
print("these columns are not numerical ---> ", list(set(cols) - set(num_cols)))

these columns are numerical --->  Index(['User ID', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')
these columns are not numerical --->  ['Gender']


## Discovering missing data

In [6]:
print(dataset.isnull().sum()) # count of empty items

User ID             0
Gender              0
Age                25
EstimatedSalary    39
Purchased           0
dtype: int64


Take a moment to identify the order of the columns. Then, find the Target column

In [7]:
# If we print the first 10 registries of the dataset we find five columns
# Visualy inspecting the dataset will help to identify the Target variable
dataset.head(10)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,,0
4,15804002,Male,19.0,76000.0,0
5,15728773,Male,27.0,58000.0,0
6,15598044,Female,27.0,84000.0,0
7,15694829,Female,32.0,150000.0,1
8,15600575,Male,25.0,33000.0,0
9,15727311,Female,35.0,,0


## Train and Test split with simple imputer

In [8]:
# Imputer strategy is: Mean value substitutes missing data
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

X_train, X_test, y_train, y_test = train_test_split(
    dataset.drop("Purchased", axis=1),
    dataset["Purchased"],
    test_size=0.2,
    random_state=45,
)

# Identify numerical features
numeric_vars = X_train.select_dtypes(
    exclude="O").columns.to_list()
formatted_string = f"The list numerical variables is: {numeric_vars}! \n"
print(formatted_string)

# Use a column transformer. Only consider numeric variables.
ct = ColumnTransformer(
    [("imputer",imputer, numeric_vars)],
    remainder="passthrough"
    ) .set_output(transform="pandas")
# Tell the column transformer you will be using pandas


ct.fit(X_train)
print("This is X_train BEFORE applying the simple imputer \n")
print(X_train.isnull().sum()) # count of empty items
formatted_string = \
f"\nMean values that replace empty slots: {ct.named_transformers_.imputer.statistics_} \n"
print(formatted_string)

X_train = ct.transform(X_train)
print("This is X_train AFTER applying the simple imputer \n")
print(X_train.isnull().sum()) # count of empty items

# Repeat the same process for X_test
ct.fit(X_test)
X_test = ct.transform(X_test)
print("\nThis is X_test AFTER applying the simple imputer \n")
print(X_test.isnull().sum()) # count of empty items


The list numerical variables is: ['User ID', 'Age', 'EstimatedSalary']! 

This is X_train BEFORE applying the simple imputer 

User ID             0
Gender              0
Age                17
EstimatedSalary    32
dtype: int64

Mean values that replace empty slots: [1.56900905e+07 3.76864686e+01 7.04236111e+04] 

This is X_train AFTER applying the simple imputer 

imputer__User ID            0
imputer__Age                0
imputer__EstimatedSalary    0
remainder__Gender           0
dtype: int64

This is X_test AFTER applying the simple imputer 

imputer__User ID            0
imputer__Age                0
imputer__EstimatedSalary    0
remainder__Gender           0
dtype: int64


In [9]:
formatted_string = f" See this MOFO Imputer changed my columns order -> {X_train.columns}"

##Encoding: Using One Hot Encoder




In [10]:
# Identify numerical features with new names
numeric_vars = X_train.select_dtypes(
    exclude="O").columns.to_list()
formatted_string = f"The list numerical variables has changed names ---> {numeric_vars}! \n"
print(formatted_string)

# drop off USER ID, we don't need it for the next part of the practice.
X_train = X_train.drop("imputer__User ID", axis=1)
X_test = X_test.drop("imputer__User ID", axis=1)

# transform everything to numpy
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

The list numerical variables has changed names ---> ['imputer__User ID', 'imputer__Age', 'imputer__EstimatedSalary']! 



By looking at the numpy array, we see there is a string variable at the end.
We use OneHotEncoder to transform this string variable to a numerical variable


In [11]:
X_train[0:5]

array([[21.0, 68000.0, 'Female'],
       [36.0, 126000.0, 'Female'],
       [37.68646864686469, 39000.0, 'Female'],
       [25.0, 22000.0, 'Male'],
       [20.0, 70423.61111111111, 'Male']], dtype=object)

In [12]:
# We use a column transformer callend Gender to encode the categorical variable
# After transforming the variable we can get the categories from the named transformer
ct = ColumnTransformer([("Gender", OneHotEncoder(), [2])], remainder = 'passthrough')
X_train = ct.fit_transform(X_train)
print(ct.named_transformers_['Gender'].categories_)
print('----------')
# The one hot encoder puts the encoded columns at the beginning of the array
print(X_train)

[array(['Female', 'Male'], dtype=object)]
----------
[[1.0 0.0 21.0 68000.0]
 [1.0 0.0 36.0 126000.0]
 [1.0 0.0 37.68646864686469 39000.0]
 ...
 [1.0 0.0 21.0 16000.0]
 [0.0 1.0 42.0 64000.0]
 [0.0 1.0 37.68646864686469 31000.0]]


In [13]:
# We repeat the latter for X_test
X_test = ct.fit_transform(X_test)
print(X_test[0:5])

[[1.0 0.0 52.0 90000.0]
 [0.0 1.0 53.0 70054.79452054795]
 [1.0 0.0 40.0 72000.0]
 [1.0 0.0 57.0 33000.0]
 [0.0 1.0 40.0 61000.0]]


In [14]:
print('no need to label the target')
print(y_test)

no need to label the target
[1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0
 1 0 1 0 1 1 1 0 1 1 1 0 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0
 1 1 0 0 1 1]


## Standard Scaler

In [15]:
# Make a standard scaler
# Fit the scaler with the information from X_train
# Only use the transform option with X_test, consider the same behaviour between...
# X_train and X_test
sc = StandardScaler()
X_train[:,2:] = sc.fit_transform(X_train[:,2:])
X_test[:,2:] = sc.transform(X_test[:,2:])

In [16]:
print(X_test)

[[1.0 0.0 1.4266526016806729 0.607906032262457]
 [0.0 1.0 1.526324203781029 -0.011452869652028044]
 [1.0 0.0 0.2305933764764003 0.04895163863908236]
 [1.0 0.0 1.9250106121824533 -1.1621162142115626]
 [0.0 1.0 0.2305933764764003 -0.29263160190853543]
 [1.0 0.0 0.06032105622162513 -1.5347524766271456]
 [1.0 0.0 -0.5667794403264481 0.4836939447905959]
 [1.0 0.0 -0.26776463402538 0.04895163863908236]
 [0.0 1.0 0.06032105622162513 0.5147469666585612]
 [1.0 0.0 2.1243538163831652 -1.2863283016834235]
 [0.0 1.0 -0.8657942466275163 0.14211070424297811]
 [0.0 1.0 0.7289513869781805 0.26632279171483914]
 [1.0 0.0 -1.3641522571292966 -1.1931692360795279]
 [0.0 1.0 1.8253390100820972 -0.3236846237765007]
 [0.0 1.0 0.13092177437604424 -0.8826390173998753]
 [0.0 1.0 -1.8625102676310767 0.17316372611094338]
 [1.0 0.0 -1.1648090529285844 0.04895163863908236]
 [0.0 1.0 0.9282945911788927 -1.5658054984951109]
 [0.0 1.0 -1.9621818697314328 -0.5721087987202227]
 [1.0 0.0 -1.7628386655307209 -1.47264643289