### AIDI 1002 Assignment 3
### Author: Ahmad Sayeb - 200534271

In [3]:
# -----------------Warnings------------------
# silences ALL Python warnings (added to hide the cuda warnings shown when running on gpu)
import warnings
warnings.filterwarnings('ignore')
# ---------------- Libraries-----------------
import pandas as pd
import numpy as np
# ---------------- Sklearn libraries---------
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
# ---------------- Keras---------------------
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [46]:
def load_csv(path: str) -> 'dataframe':
    '''
    Reads a csv file and hands it back as a dataframe
    path: location of the csv file
    '''
    return pd.read_csv(path)


def num_col_nan(df: 'dataframe'):
    '''
    replaces nan values in numerical columns with the column mode
    df: dataframe (modified in place, nothing is returned)

    A column that holds only nan has no mode and is left untouched.
    '''

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    for col in df.select_dtypes(include=numerics).columns:
        modes = df[col].mode()
        if modes.empty:
            # all-nan column: mode() is empty, so there is no fill value
            continue
        # Assign the filled column back to the frame instead of calling an
        # inplace method on df[col]; the chained inplace form does not
        # write through to df when pandas copy-on-write is active.
        df[col] = df[col].fillna(modes.iloc[0])

        
def cat_col_nan(df: 'dataframe'):
    '''
    replaces nan values in categorical (object dtype) columns with the string 'None'
    df: dataframe (modified in place, nothing is returned)
    '''

    categorical = ['object']

    for col in df.select_dtypes(include=categorical).columns:
        # Assign the filled column back to the frame instead of calling an
        # inplace method on df[col]; the chained inplace form does not
        # write through to df when pandas copy-on-write is active.
        df[col] = df[col].fillna('None')


def label_encoder(df: 'dataframe'):
    '''
    Turns every object dtype column into integer labels, in place
    df: dataframe
    '''
    encoder = LabelEncoder()
    # the list of columns is taken once, before any of them is overwritten
    object_columns = df.select_dtypes(include=['object']).columns

    for name in object_columns:
        # the same encoder instance is re-fitted on each column
        df[name] = encoder.fit_transform(df[name])


def replacing_classes(row: 'object'):
    '''
    Keeps 'Cat_4' and 'Cat_6' as they are, maps any other value to 'Other'
    row: a single value (pre_processing applies this to the Var_1 column)
    '''
    if row == 'Cat_4' or row == 'Cat_6':
        return row
    return 'Other'

def drop_cols(cols: list, df: 'dataframe'):
    '''
    Removes the given columns from the dataframe in place
    cols: list of column names to remove
    df: dataframe
    '''
    message = 'dropping {}...'.format(cols)
    print(message)
    df.drop(labels=cols, axis=1, inplace=True)
    
    
def pre_processing(df: 'dataframe'):
    '''
    Runs the whole cleaning pipeline on the dataframe, in place:
    drops ID and Segmentation, fills nan values, collapses Var_1 to
    Cat_4 / Cat_6 / Other and label encodes the categorical columns
    df: dataframe
    returns: True when every step ran, False when any step raised
    '''
    def _collapse_var_1():
        df['Var_1'] = df['Var_1'].apply(replacing_classes)

    # (progress message, action) pairs, executed strictly in this order
    steps = (
        ('dropping columns...',
         lambda: drop_cols(['ID', 'Segmentation'], df)),
        ('replacing numerical nans with mode...',
         lambda: num_col_nan(df)),
        ('replacing categorical nans with None string...',
         lambda: cat_col_nan(df)),
        ('changing anything otehr than cat_6 and cat_4 to other...',
         _collapse_var_1),
        ('label encoding categorical data...',
         lambda: label_encoder(df)),
    )

    try:
        for message, action in steps:
            print(message)
            action()
        # ANSI escape codes make the success line bold in the terminal
        print('\033[1m' + 'SUCCESSFULLY PERFORMED PREPROCESSING' + '\033[0m')
    except Exception as err:
        # failures are reported on stdout and signalled through the return value
        print('error occurred in pre-processing')
        print(err)
        return False

    return True

    
def train_validation_split(val_size: float, df: 'dataframe',
                           target: str = 'Var_1', random_state=None):
    '''
    splits dataframe into train and validation and SHUFFLES,
    stratified on the target column
    val_size: size of the validation
    df: dataframe
    target: name of the dependent (label) column, defaults to 'Var_1'
    random_state: seed forwarded to train_test_split so the split can be
                  reproduced; None (the default) keeps the unseeded behaviour
    returns: X_train, X_valid, y_train, y_valid
    '''
    # dependent variable is kept as a one-column dataframe
    df_dep = df[[target]]
    # every other column is an independent variable
    df_indep = df.loc[:, df.columns != target]
    X_train, X_valid, y_train, y_valid = train_test_split(
        df_indep,
        df_dep,
        test_size=val_size,
        shuffle=True,
        stratify=df_dep[target],
        random_state=random_state,
    )

    return X_train, X_valid, y_train, y_valid


def min_max_scaler(df: 'dataframe', scaler=None):
    '''
    normalize numerical data (Age, Work_Experience, Family_Size) in place
    df: dataframe
    scaler: optional, an already fitted scaler to reuse. When given, the
            columns are only transformed with it, so another dataframe can
            be scaled with the same statistics. When None (the default) a
            new MinMaxScaler is fitted on df.
    returns: the scaler that was used
    '''
    col_num = ['Age', 'Work_Experience', 'Family_Size']
    if scaler is None:
        scaler = MinMaxScaler()
        df[col_num] = scaler.fit_transform(df[col_num])
    else:
        # reuse the existing fit instead of re-fitting on this dataframe
        df[col_num] = scaler.transform(df[col_num])
    return scaler

    
def build_neural_network(input_size):
    '''
    Builds and compiles the dense network: two relu hidden layers
    (16 and 12 units) followed by a 3 unit softmax output
    input_size: size of the input array
    returns: the compiled model
    '''
    layers = [
        Dense(16, input_dim=input_size, activation='relu'),
        Dense(12, activation='relu'),
        Dense(3, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

def 

In [47]:
df = load_csv('archive/train.csv')
# pre_processing reports failure through its return value instead of raising;
# stop here rather than splitting a partially processed dataframe
if not pre_processing(df):
    raise RuntimeError('pre-processing failed, see the error printed above')
# 33% of the rows are held out for validation
X_train, X_valid, y_train, y_valid = train_validation_split(0.33, df)
# scales Age, Work_Experience and Family_Size of the training features in place
min_max_scaler(X_train)

dropping columns...
dropping ['ID', 'Segmentation']...
replacing numerical nans with mode...
replacing categorical nans with None string...
changing anything otehr than cat_6 and cat_4 to other...
label encoding categorical data...
[1mSUCCESSFULLY PERFORMED PREPROCESSING[0m


In [49]:
X_train.head(10)

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size
4842,0,0,0.197183,0,5,0.071429,2,0.375
2254,0,2,0.774648,2,0,0.0,0,0.25
4889,0,2,0.323944,2,8,0.642857,2,0.25
3646,1,2,0.112676,0,1,0.285714,0,0.25
8062,1,2,0.323944,2,0,0.0,1,0.5
2817,1,0,0.028169,0,3,0.357143,2,0.125
1274,0,2,0.549296,2,1,0.0,2,0.75
4922,1,2,0.28169,0,0,0.571429,1,0.5
1036,1,0,0.098592,0,8,0.0,2,0.125
1886,0,0,0.112676,0,2,0.285714,2,0.0
