### AIDI 1002 Assignment 3
### Author: Ahmad Sayeb - 200534271

In [275]:
# -----------------Warnings------------------
#removing cuda warnings for gpu
import warnings
warnings.filterwarnings('ignore')
# ---------------- Libraries-----------------
import pandas as pd
import numpy as np
# ---------------- Sklearn libraries---------
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
# ---------------- Keras---------------------
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
# ----------------Tensor-flow support--------
import tensorflow_addons as tfa

In [353]:
def load_csv(path: str) -> 'dataframe':
    '''
    Reads the csv file found at the given location
    path: location of the csv file
    returns: dataframe holding the content of the file
    '''
    return pd.read_csv(path)


def num_col_nan(df: 'dataframe'):
    '''
    replaces nan value in numerical columns with mode
    the dataframe is modified in place, nothing is returned
    df: dataframe
    '''

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    col_num = df.select_dtypes(include=numerics)

    for col in col_num:
        modes = df[col].mode()
        # mode() skips nan, so a column that holds nothing but nan has no
        # mode to fill with; such a column is left untouched
        if modes.empty:
            continue
        # the result is assigned back to the column: calling
        # replace(inplace=True) on df[col] acts on a temporary selection and
        # is not guaranteed to reach df
        df[col] = df[col].replace([np.nan], modes.iloc[0])

        
def cat_col_nan(df: 'dataframe'):
    '''
    replace nan value in categorical column with None string
    the dataframe is modified in place, nothing is returned
    df: dataframe
    '''

    categorical = ['object']
    cat_columns = df.select_dtypes(include=categorical)

    for col in cat_columns:
        # the result is assigned back to the column: calling
        # replace(inplace=True) on df[col] acts on a temporary selection and
        # is not guaranteed to reach df
        df[col] = df[col].replace([np.nan], 'None')


def encoder(df: 'dataframe') -> 'dataframe':
    '''
    label encoding categorical data that are indep variable
    and one hot encode target variables
    df: dataframe, has to hold Var_1 as an object typed column
    returns: dataframe made of the columns of df (without Var_1) followed
             by one Var_1_<class> indicator column per class
    note: df itself is changed as well, its categorical columns are label
          encoded in place and Var_1 is dropped from it
    '''
    categorical = ['object']
    cat_col = df.select_dtypes(include=categorical)

    # one hot encode the target; the dtype is pinned so the indicator
    # columns hold 0/1 integers whatever default the installed pandas uses
    target = pd.get_dummies(cat_col[['Var_1']], dtype=np.uint8)

    for col in cat_col.columns:
        if col == 'Var_1':
            # the target is one hot encoded above, not label encoded
            continue
        # fit_transform refits the encoder, so every column gets its own
        # independent mapping
        df[col] = LabelEncoder().fit_transform(df[col])

    df.drop(columns=['Var_1'], inplace=True)
    return pd.concat([df, target], axis=1)


def replacing_classes(row: 'object'):
    '''
    keeps the Cat_4 and Cat_6 classes and maps any other value to Other
    row: single value to map
    returns: row itself for Cat_4 / Cat_6, the string Other otherwise
    '''
    kept_classes = ('Cat_4', 'Cat_6')
    return row if row in kept_classes else 'Other'

def drop_cols(cols: list, df: 'dataframe'):
    '''
    removes the listed columns from the dataframe in place
    cols: names of the columns to remove
    df: dataframe
    '''
    message = 'dropping {}...'.format(cols)
    print(message)
    df.drop(cols, axis=1, inplace=True)
    
    
def pre_processing(df: 'dataframe'):
    '''
    runs the cleaning steps one after the other: column removal, nan value
    replacement, regrouping of the Var_1 classes and encoding of the
    categorical values
    df: dataframe
    returns: the encoded dataframe, or False when one of the steps raised
    '''
    bold, reset = '\033[1m', '\033[0m'
    try:
        print('dropping columns...')
        drop_cols(['ID', 'Segmentation'], df)

        print('replacing numerical nans with mode...')
        num_col_nan(df)

        print('replacing categorical nans with None string...')
        cat_col_nan(df)

        print('changing anything otehr than cat_6 and cat_4 to other...')
        df['Var_1'] = df['Var_1'].apply(replacing_classes)

        print('label encoding categorical data...')
        encoded = encoder(df)

        print(bold + 'SUCCESSFULLY PERFORMED PREPROCESSING' + reset)
        return encoded

    except Exception as error:
        # any failure is reported on stdout and signalled with False
        print('error occurred in pre-processing')
        print(error)
        return False

    
def train_validation_split(val_size: float, df: 'dataframe', random_state=None):
    '''
    splits dataframe into train and validation and SHUFFLES
    the split is stratified on the one hot encoded target columns
    val_size: size of the validation
    df: dataframe holding the features and the three Var_1_* target columns
    random_state: optional seed handed to train_test_split so that the same
                  split can be produced again, None (default) leaves the
                  shuffling unseeded as before
    returns: X_train, X_valid, y_train, y_valid
    '''
    target_cols = ['Var_1_Cat_4', 'Var_1_Cat_6', 'Var_1_Other']
    df_dep = df[target_cols]
    df_indep = df.drop(columns=target_cols)
    # Shuffle is set to true
    # Stratify is done on the target columns
    X_train, X_valid, y_train, y_valid = train_test_split(
        df_indep,
        df_dep,
        test_size=val_size,
        shuffle=True,
        stratify=df_dep,
        random_state=random_state,
    )

    return X_train, X_valid, y_train, y_valid


def min_max_scaler(df: 'dataframe', scaler=None):
    '''
    normalize numerical data
    the Age, Work_Experience and Family_Size columns of df are rescaled
    in place
    df: dataframe
    scaler: optional scaler that was fitted before. When given it is only
            applied (transform), which allows validation data to be rescaled
            with the minimum / maximum learnt on the training data. When
            omitted a new MinMaxScaler is fitted on df itself
    returns: the scaler that was used, so it can be handed to a later call
    '''
    col_num = ['Age', 'Work_Experience', 'Family_Size']
    if scaler is None:
        scaler = MinMaxScaler()
        df[col_num] = scaler.fit_transform(df[col_num])
    else:
        # reuse the statistics of the given scaler, do not refit on df
        df[col_num] = scaler.transform(df[col_num])
    return scaler

    
def build_neural_network(input_size: int = 8):
    '''
    this function creates the deep neural network
    input_size: size of the input array (number of feature columns),
                defaults to 8
    returns: the compiled model
    '''
    model = Sequential()
    # NOTE(review): no activation is passed to this first layer, unlike the
    # other hidden layers - confirm this is intended
    model.add(Dense(512, input_shape=(input_size,)))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(8, activation='relu'))
    # one output per target column
    model.add(Dense(3, activation='softmax'))
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=[tfa.metrics.F1Score(average='micro', num_classes=3), 'accuracy'],
    )
    # summary() prints the table itself, its return value is not meant to
    # be printed
    model.summary()
    return model


def fit_model(X_train: 'dataframe',
              y_train: 'dataframe',
              X_val: 'dataframe',
              y_val: 'dataframe',
              network=None):
    '''
    This trains the model
    X_train: training data
    y_train: training targets
    X_val: validation data
    y_val: validation targets
    network: model to train. When omitted the notebook level variable
             `model` is trained, which is what this function always did
    returns: the trained model
    '''
    if network is None:
        # backward compatible fallback on the notebook level variable
        network = model

    # training stops once val_loss has not improved for 50 epochs
    stopper = EarlyStopping(monitor='val_loss', mode='min', patience=50)
    network.fit(X_train.values,
                y_train.values,
                epochs=100,
                batch_size=32,
                verbose=1,
                validation_data=(X_val.values, y_val.values),
                callbacks=[stopper])
    return network

In [354]:
df = load_csv('archive/train.csv')
df = pre_processing(df)
if df is False:
    # pre_processing signals a failure with False; stop here rather than
    # handing that value to the split
    raise RuntimeError('pre-processing failed, see the message printed above')
X_train, X_valid, y_train, y_valid = train_validation_split(0.10, df)

# the scaler is fitted on the training split only and then applied to both
# splits, so the validation data is rescaled with the training minimum and
# maximum rather than with its own
scale_cols = ['Age', 'Work_Experience', 'Family_Size']
scaler = MinMaxScaler().fit(X_train[scale_cols])
X_train[scale_cols] = scaler.transform(X_train[scale_cols])
X_valid[scale_cols] = scaler.transform(X_valid[scale_cols])

dropping columns...
dropping ['ID', 'Segmentation']...
replacing numerical nans with mode...
replacing categorical nans with None string...
changing anything otehr than cat_6 and cat_4 to other...
label encoding categorical data...
[1mSUCCESSFULLY PERFORMED PREPROCESSING[0m


In [355]:
# the network has to be assigned to `model` first: fit_model() trains the
# notebook level variable `model`, the network is not passed in as argument
model = build_neural_network()
# train on the training split, the validation split is used for monitoring
model = fit_model(X_train, y_train, X_valid, y_valid)

Model: "sequential_51"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_289 (Dense)           (None, 512)               4608      
                                                                 
 dense_290 (Dense)           (None, 256)               131328    
                                                                 
 dense_291 (Dense)           (None, 128)               32896     
                                                                 
 dense_292 (Dense)           (None, 64)                8256      
                                                                 
 dense_293 (Dense)           (None, 16)                1040      
                                                                 
 dense_294 (Dense)           (None, 8)                 136       
                                                                 
 dense_295 (Dense)           (None, 3)               

In [141]:
# run the trained network on the validation features
y_pred = model.predict(X_valid)



In [153]:
# look at the output of the network for a single validation row
y_pred[4]

array([0.0000000e+00, 9.8661275e-09, 9.9999994e-01], dtype=float32)

In [160]:
# display the training targets
y_train

Unnamed: 0,Var_1_Cat_4,Var_1_Cat_6,Var_1_Other
539,0,0,1
1459,0,1,0
6693,0,1,0
4999,1,0,0
5409,0,1,0
...,...,...,...
5724,0,1,0
7471,1,0,0
1771,0,1,0
3897,0,1,0


In [255]:
# display the validation targets
y_valid

Unnamed: 0,Var_1_Cat_4,Var_1_Cat_6,Var_1_Other
1261,0,0,1
243,0,1,0
7163,0,0,1
1880,1,0,0
2222,1,0,0
...,...,...,...
1179,0,1,0
484,0,0,1
2521,0,0,1
7179,0,0,1


In [257]:
# display the training features
X_train

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size
2726,0,0,0.056338,0,5,0.428571,2,0.250
4718,1,2,0.521127,2,0,0.142857,2,0.125
59,1,2,0.760563,2,7,0.071429,2,0.125
5483,1,2,0.521127,2,0,0.071429,0,0.500
6111,1,0,0.183099,2,5,0.000000,2,0.125
...,...,...,...,...,...,...,...,...
1438,1,2,0.436620,0,0,0.000000,2,0.125
7846,0,0,0.140845,2,0,0.214286,2,0.375
7593,1,2,0.197183,2,5,0.428571,2,0.125
506,1,2,0.549296,2,4,0.071429,1,0.250


In [261]:
# display the validation features
X_valid

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size
3398,1,2,0.760563,2,3,0.071429,0,0.125
1738,1,2,0.563380,2,7,0.000000,1,0.125
4807,0,2,0.478873,0,9,0.500000,0,0.750
5135,0,2,0.352113,2,5,0.642857,2,0.125
2964,0,0,0.070423,0,5,0.642857,2,0.375
...,...,...,...,...,...,...,...,...
1304,1,2,0.577465,2,0,0.071429,0,0.125
7650,0,2,0.098592,0,6,0.571429,0,0.125
1010,1,2,0.478873,2,3,0.071429,0,0.375
3808,0,0,0.112676,2,2,0.000000,2,0.625


In [167]:
# turn the predicted values and the one hot encoded targets into class
# indices: the position of the largest value of every row
pred = list(np.argmax(y_pred, axis=1))
y_valid_arr = y_valid.values
valid = list(np.argmax(y_valid_arr, axis=1))

In [174]:
from sklearn.metrics import accuracy_score
# accuracy_score takes the true labels first and the predictions second
a = accuracy_score(valid, pred)
print('Accuracy is:', a*100)

Accuracy is: 62.29815996995869
