### AIDI 1002 Assignment 3
### Author: Ahmad Sayeb - 200534271

In [74]:
# -----------------Warnings------------------
#removing cuda warnings for gpu
import warnings
warnings.filterwarnings('ignore')
# ---------------- Libraries-----------------
import pandas as pd
import numpy as np
# ---------------- Sklearn libraries---------
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
# ---------------- Keras---------------------
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [180]:
def load_csv(path: str) -> 'dataframe':
    '''
    Loads csv file into dataframe
    path: path to the file
    '''
    df = pd.read_csv(path)
    return df


def num_col_nan(df: 'dataframe'):
    '''
    replaces nan value in numerical columns with mode
    df: dataframe
    '''
    
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    col_num = df.select_dtypes(include=numerics)
    
    for col in col_num:
        df[col].replace([np.nan], df[col].mode()[0], inplace=True)

        
def cat_col_nan(df: 'dataframe'):
    '''
    replace nan value in categorical column with None string
    df: dataframe
    '''
    
    categorical = ['object']
    cat_columns = df.select_dtypes(include=categorical)
    
    for col in cat_columns:
        df[col].replace([np.nan], 'None', inplace=True)


def encoder(df: 'dataframe') -> 'dataframe':
    '''
    label encoding categorical data that are indep variable
    and one hot encode target variables
    df: dataframe
    '''
    label = LabelEncoder()
    hot = OneHotEncoder()
    categorical = ['object']
    cat_col = df.select_dtypes(include=categorical)
    target = cat_col[['Var_1']]
    cat_col.drop(columns=['Var_1'], inplace=True)
    for col in cat_col:
        df[col] = label.fit_transform(df[col])
    
    target = pd.get_dummies(target)
    df.drop(columns=['Var_1'], inplace=True)
    return pd.concat([df, target], axis=1)


def replacing_classes(row: 'object'):
    if row != 'Cat_4' and row != 'Cat_6':
        return 'Other'
    else:
        return row

def drop_cols(cols: list, df: 'dataframe'):
    '''
    drops specified columns
    col: list of columns
    df: dataframe
    '''
    print(f'dropping {cols}...')
    df.drop(columns=cols, inplace=True)
    
    
def pre_processing(df: 'dataframe'):
    '''
    performs nan value replacement and encoding categorical values
    '''
    try:
        print('dropping columns...')
        drop_cols(['ID', 'Segmentation'], df)
        print('replacing numerical nans with mode...')
        num_col_nan(df)
        print('replacing categorical nans with None string...')
        cat_col_nan(df)
        print('changing anything otehr than cat_6 and cat_4 to other...')
        df['Var_1'] = df['Var_1'].apply(replacing_classes)
        print('label encoding categorical data...')
        df = encoder(df)
        print('\033[1m' + 'SUCCESSFULLY PERFORMED PREPROCESSING' + '\033[0m')    
        return df
    
    except Exception as e:
        print('error occurred in pre-processing')
        print(e)
        return False

    
def train_validation_split(val_size: float, df: 'dataframe'):
    '''
    splits dataframe into train and validation and SHUFFLES
    test_size: size of the validation
    df: dataframe
    '''
    # Shuffle is set to true
    # Stratify is set to true
    df_dep = df[['Var_1_Cat_4','Var_1_Cat_6', 'Var_1_Other']]
    df_indep = df.drop(columns=['Var_1_Cat_4','Var_1_Cat_6', 'Var_1_Other'])
    X_train, X_valid, y_train, y_valid = train_test_split(
                                                        df_indep,
                                                        df_dep,
                                                        test_size=val_size, 
                                                        shuffle=True,
                                                        stratify=df_dep[['Var_1_Cat_4', 'Var_1_Cat_6', 'Var_1_Other']]
                                                        )
    
    return X_train, X_valid, y_train, y_valid


def min_max_scaler(df: 'dataframe'):
    '''
    normalize numerical data
    df: dataframe
    '''
    scaler = MinMaxScaler()
    col_num = ['Age', 'Work_Experience', 'Family_Size']
    df[col_num] = scaler.fit_transform(df[col_num])

    
def build_neural_network(input_size: int):
    '''
    this function creates the deep neural network
    input_size: size of the input array
    '''
    model = Sequential()
    model.add(Dense(256, input_shape=(8,), activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['get_f1'])
    return model


def fit_model(X_train: 'dataframe', y_train: 'dataframe'):
    '''
    This trains the model
    X_trian: training data
    '''
    X_train_array = X_train.values
    y_train_array = y_train.values
    history = model.fit(X_train_array, y_train_array, epochs=200)
    
    return model


In [181]:
df = load_csv('archive/train.csv')
df = pre_processing(df)
X_train, X_valid, y_train, y_valid = train_validation_split(0.33, df)
min_max_scaler(X_train)

dropping columns...
dropping ['ID', 'Segmentation']...
replacing numerical nans with mode...
replacing categorical nans with None string...
changing anything otehr than cat_6 and cat_4 to other...
label encoding categorical data...
[1mSUCCESSFULLY PERFORMED PREPROCESSING[0m


In [182]:
input_size = len(X_train)
print(input_size)
model = build_neural_network(input_size)

5405


In [183]:
model = fit_model(X_train, y_train)

Epoch 1/200


ValueError: in user code:

    File "/home/ahmad/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/home/ahmad/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/ahmad/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/home/ahmad/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1028, in train_step
        return self.compute_metrics(x, y, y_pred, sample_weight)
    File "/home/ahmad/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1122, in compute_metrics
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "/home/ahmad/.local/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 577, in update_state
        self.build(y_pred, y_true)
    File "/home/ahmad/.local/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 483, in build
        self._metrics = tf.__internal__.nest.map_structure_up_to(
    File "/home/ahmad/.local/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 631, in _get_metric_objects
        return [self._get_metric_object(m, y_t, y_p) for m in metrics]
    File "/home/ahmad/.local/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 631, in <listcomp>
        return [self._get_metric_object(m, y_t, y_p) for m in metrics]
    File "/home/ahmad/.local/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 650, in _get_metric_object
        metric_obj = metrics_mod.get(metric)
    File "/home/ahmad/.local/lib/python3.10/site-packages/keras/metrics/__init__.py", line 181, in get
        return deserialize(str(identifier))
    File "/home/ahmad/.local/lib/python3.10/site-packages/keras/metrics/__init__.py", line 136, in deserialize
        return deserialize_keras_object(
    File "/home/ahmad/.local/lib/python3.10/site-packages/keras/saving/legacy/serialization.py", line 557, in deserialize_keras_object
        raise ValueError(

    ValueError: Unknown metric function: 'get_f1'. Please ensure you are using a `keras.utils.custom_object_scope` and that this object is included in the scope. See https://www.tensorflow.org/guide/keras/save_and_serialize#registering_the_custom_object for details.


In [141]:
y_pred = model.predict(X_valid)



In [153]:
y_pred[4]

array([0.0000000e+00, 9.8661275e-09, 9.9999994e-01], dtype=float32)

In [160]:
y_train

Unnamed: 0,Var_1_Cat_4,Var_1_Cat_6,Var_1_Other
539,0,0,1
1459,0,1,0
6693,0,1,0
4999,1,0,0
5409,0,1,0
...,...,...,...
5724,0,1,0
7471,1,0,0
1771,0,1,0
3897,0,1,0


In [161]:
y_valid

Unnamed: 0,Var_1_Cat_4,Var_1_Cat_6,Var_1_Other
7522,0,0,1
1509,1,0,0
4988,0,0,1
3243,0,0,1
3944,0,1,0
...,...,...,...
4507,1,0,0
7526,0,1,0
5155,1,0,0
91,0,0,1


In [167]:
pred = list()
for i in range(len(y_pred)):
    pred.append(np.argmax(y_pred[i]))
valid = list()
y_valid_arr = y_valid.values
for i in range(len(y_valid_arr)):
     valid.append(np.argmax(y_valid_arr[i]))

In [174]:
from sklearn.metrics import accuracy_score
a = accuracy_score(pred,valid)
print('Accuracy is:', a*100)

Accuracy is: 62.29815996995869
