### AIDI 1002 Assignment 3
### Author: Ahmad Sayeb - 200534271

In [38]:
# -----------------Warnings------------------
# Silence warnings (added to hide CUDA warnings when running on GPU).
# NOTE: 'ignore' with no category suppresses every Python warning,
# including pandas / sklearn deprecation warnings.
import warnings
warnings.filterwarnings('ignore')
# ---------------- Libraries-----------------
import pandas as pd
import numpy as np
# ---------------- Sklearn libraries---------
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
# ---------------- Keras---------------------
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [39]:
# Load the training data into a dataframe.
# NOTE(review): load_csv is defined in a later cell (In [43]); on a fresh
# kernel this cell raises NameError unless that cell has been run first.
df = load_csv('archive/train.csv')

In [40]:
# Preview the first 10 rows of the loaded data
df.head(10)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A
5,461319,Male,Yes,56,No,Artist,0.0,Average,2.0,Cat_6,C
6,460156,Male,No,32,Yes,Healthcare,1.0,Low,3.0,Cat_6,C
7,464347,Female,No,33,Yes,Healthcare,1.0,Low,3.0,Cat_6,D
8,465015,Female,Yes,61,Yes,Engineer,0.0,Low,3.0,Cat_7,D
9,465176,Female,Yes,55,Yes,Artist,1.0,Average,4.0,Cat_6,C


In [41]:
# Column dtypes: shows which columns are numeric and which are object
df.dtypes

ID                   int64
Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Var_1               object
Segmentation        object
dtype: object

In [42]:
# Number of missing values per column
df.isnull().sum()

ID                   0
Gender               0
Ever_Married       140
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1               76
Segmentation         0
dtype: int64

In [43]:
def load_csv(path: str) -> 'dataframe':
    '''
    Reads a csv file from disk and returns its contents
    path: location of the csv file
    returns: pandas dataframe parsed from the file
    '''
    return pd.read_csv(path)


def num_col_nan(df: 'dataframe'):
    '''
    Replaces nan values in numerical columns with the column mode.
    The dataframe is modified in place; nothing is returned.
    When a column has several modes the smallest one is used.
    df: dataframe
    '''

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    for col in df.select_dtypes(include=numerics).columns:
        modes = df[col].mode()
        # mode() ignores nan, so an all-nan column has no mode at all;
        # leave such a column untouched instead of failing on modes[0]
        if modes.empty:
            continue
        # assign the filled column back to the frame: an inplace method
        # called on df[col] may operate on a temporary copy and leave
        # df itself unchanged (always the case under Copy-on-Write)
        df[col] = df[col].fillna(modes.iloc[0])

        
def cat_col_nan(df: 'dataframe'):
    '''
    Replaces nan values in categorical (object dtype) columns with
    the string 'None', so missing values become their own category.
    The dataframe is modified in place; nothing is returned.
    df: dataframe
    '''

    categorical = ['object']

    for col in df.select_dtypes(include=categorical).columns:
        # assign the filled column back to the frame: an inplace method
        # called on df[col] may operate on a temporary copy and leave
        # df itself unchanged (always the case under Copy-on-Write)
        df[col] = df[col].fillna('None')


def label_encoder(df: 'dataframe'):
    '''
    Label encodes every categorical (object dtype) column in place.
    Each column gets its own LabelEncoder, and the fitted encoders are
    returned so the integer codes can be mapped back to the original
    labels (inverse_transform) or applied to other data (transform).
    df: dataframe
    returns: dict mapping column name -> fitted LabelEncoder
    '''
    categorical = ['object']
    encoders = {}

    for col in df.select_dtypes(include=categorical).columns:
        # one encoder per column: refitting a single shared encoder
        # would overwrite the classes learned for the previous column
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        encoders[col] = encoder

    return encoders
    

def min_max_scaler(df: 'dataframe'):
    '''
    Normalizes numerical columns in place with min max scaling.
    The fitted scaler is returned so the same scaling can be applied
    to other data (scaler.transform) or undone (inverse_transform).
    df: dataframe
    returns: fitted MinMaxScaler, or None when df has no numerical column
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    col_num = df.select_dtypes(include=numerics).columns
    # NOTE(review): every numerical column is scaled, including
    # identifier-like columns such as ID - confirm this is intended
    print(col_num)

    # nothing to scale: skip fitting, which would fail on zero columns
    if col_num.empty:
        return None

    scaler = MinMaxScaler()
    df[col_num] = scaler.fit_transform(df[col_num])
    return scaler
    
def pre_processing(df: 'dataframe'):
    '''
    Runs the whole preprocessing pipeline on df, in place:
    nan replacement, min max normalization, label encoding.
    Progress is printed before each step.
    df: dataframe
    returns: True when every step succeeded, False when any step
             raised (the error is printed, not re-raised)
    '''
    try:
        # (progress message, step) pairs, executed in this order
        pipeline = (
            ('replacing numerical nans with mode...', num_col_nan),
            ('replacing categorical nans with None string...', cat_col_nan),
            ('normalizing numerical data with min max...', min_max_scaler),
            ('label encoding categorical data...', label_encoder),
        )
        for message, step in pipeline:
            print(message)
            step(df)
        print('\033[1m' + 'SUCCESSFULLY PERFORMED PREPROCESSING' + '\033[0m')
        return True

    except Exception as err:
        print('error occurred in pre-processing')
        print(err)
        return False

def train_validation_split(val_size: float, df: 'dataframe',
                           target: str = 'Segmentation',
                           random_state=None) -> tuple:
    '''
    Splits dataframe into train and validation features / labels.
    Rows are SHUFFLED and the split is STRATIFIED on the target column,
    so both parts keep the same class proportions.
    val_size: size of the validation set (passed as test_size)
    df: dataframe containing the feature columns and the target column
    target: name of the label column
    random_state: optional seed to make the shuffle reproducible
    returns: X_train, X_valid, y_train, y_valid
    '''
    # separate features from labels: train_test_split returns two
    # outputs per input array, so four outputs need two inputs
    X = df.drop(columns=[target])
    y = df[target]

    # stratify expects the label values themselves, not a column name
    (X_train,
     X_valid,
     y_train,
     y_valid) = train_test_split(X,
                                 y,
                                 test_size=val_size,
                                 shuffle=True,
                                 stratify=y,
                                 random_state=random_state)

    return X_train, X_valid, y_train, y_valid
    


In [44]:
# Run the preprocessing pipeline; df is modified in place and the
# returned True / False reports whether every step succeeded
pre_processing(df)

replacing numerical nans with mode...
replacing categorical nans with None string...
normalizing numerical data with min max...
Index(['ID', 'Age', 'Work_Experience', 'Family_Size'], dtype='object')
label encoding categorical data...
[1mSUCCESSFULLY PERFORMED PREPROCESSING[0m


True

In [45]:
# Preview the first 10 rows again, now that pre_processing has modified df in place
df.head(10)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,0.425601,1,0,0.056338,0,5,0.071429,2,0.375,3,3
1,0.40714,0,2,0.28169,2,2,0.071429,0,0.25,3,0
2,0.815503,0,2,0.690141,2,2,0.071429,2,0.0,5,1
3,0.306161,1,2,0.690141,2,7,0.0,1,0.125,5,1
4,0.410031,0,2,0.309859,2,3,0.071429,1,0.625,5,0
5,0.259898,1,2,0.535211,0,0,0.0,0,0.125,5,2
6,0.13056,1,0,0.197183,2,5,0.071429,2,0.25,5,2
7,0.596641,0,0,0.211268,2,5,0.071429,2,0.25,5,3
8,0.67093,0,2,0.605634,2,2,0.0,2,0.25,6,3
9,0.688835,0,2,0.521127,2,0,0.071429,0,0.375,5,2
