In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, label_binarize
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix

# random seed for reproducibility
seed = 123456789
np.random.seed(seed)

raw_data = pd.read_csv("International students Time management data.csv")

# Question 6: You often feel that your life is aimless, with no definite purpose
target_column = '6'
raw_data = raw_data[raw_data[target_column] != 'Neither']
raw_data = raw_data[raw_data[target_column].notna()]
raw_data[target_column] = raw_data[target_column].replace('Strong Agree', 'Agree')
raw_data[target_column] = raw_data[target_column].replace('Strong Disagree', 'Disagree')

raw_data['label'] = (raw_data[target_column] == 'Agree')

In [3]:
impute_and_one_hot = Pipeline([
    ('impute', SimpleImputer(strategy= 'most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore'))
])

feature_encoding = ColumnTransformer([
    ("impute_and_one_hot", impute_and_one_hot, ['Course', 'Academic', 'Attendance', 'English', 'Age',  
                                                '8', '12', '14', '15'])
])

pipeline = Pipeline([
    ('features', feature_encoding),
    ('sgdclassifier', SGDClassifier(loss='log'))
])

In [4]:
hyperparam_grid = {
    "sgdclassifier__alpha" : [0.01, 0.03, 0.1, 0.3],
    "sgdclassifier__penalty" : ["l2", "l1", "elasticnet"],
    "sgdclassifier__eta0": [0.01, 0.03, 0.1, 0.3]
}

grid_search = GridSearchCV(pipeline, param_grid=hyperparam_grid, cv=3)

In [5]:
X_train, X_test, y_train_raw, y_test_raw = \
  train_test_split(raw_data, raw_data[target_column], test_size=.3, random_state=seed)

y_train = np.squeeze(label_binarize(y_train_raw, classes=['Agree', 'Disagree']))
y_test = np.squeeze(label_binarize(y_test_raw, classes=['Agree', 'Disagree']))

model = grid_search.fit(X_train, y_train)

In [6]:
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

In [7]:
train_data = pd.concat([X_train.reset_index(), pd.Series(pred_train, name="pred")], axis=1)
 
train_data['TP'] = (train_data['label'] == True) & (train_data['pred'] == True)
train_data['TN'] = (train_data['label'] == False) & (train_data['pred'] == False)
train_data['FN'] = (train_data['label'] == True) & (train_data['pred'] == False)
train_data['FP'] = (train_data['label'] == False) & (train_data['pred'] == True)

In [8]:
test_data = pd.concat([X_test.reset_index(), pd.Series(pred_test, name="pred")], axis=1)

test_data['TP'] = (test_data['label'] == True) & (test_data['pred'] == True)
test_data['TN'] = (test_data['label'] == False) & (test_data['pred'] == False)
test_data['FN'] = (test_data['label'] == True) & (test_data['pred'] == False)
test_data['FP'] = (test_data['label'] == False) & (test_data['pred'] == True)

In [9]:
def FalseNeg(data):
    FNR = data["FN"].sum() / (data["FN"].sum() + data["TP"].sum())
    return FNR

def FalsePos(data):
    FPR = data["FP"].sum() / (data["FP"].sum() + data["TN"].sum())
    return FPR

In [12]:
## TRAIN DATASET
train_data_china = train_data.loc[train_data["Nationality"] == "China"]
train_data_others = train_data.loc[train_data["Nationality"] != "China"]

# CALCULATE FNR FOR TRAIN DATASET
FNR_china_train = FalseNeg(train_data_china)
FNR_others_train = FalseNeg(train_data_others)

# CALCULATE FPR FOR TRAIN DATASET
FPR_china_train = FalsePos(train_data_china)
FPR_others_train = FalsePos(train_data_others)

## TEST DATASET
test_data_china = test_data.loc[test_data["Nationality"] == "China"]
test_data_others = test_data.loc[test_data["Nationality"] != "China"]

# CALCULATE FNR FOR TEST DATASET
FNR_china_test =  FalseNeg(test_data_china)
FNR_others_test =  FalseNeg(test_data_others)

# CALCULATE FPR FOR TEST DATASER
FPR_china_test = FalsePos(test_data_china)
FPR_others_test = FalsePos(test_data_others)

print("FNR China train: ", FNR_china_train)
print("FNR Others train: ", FNR_others_train)
print("Student from China have 15% higher probability to be wrongly predicted to have bad time management.")
print("\n")
print("FNR China test: ", FNR_china_test)
print("FNR Others test: ",FNR_others_test)
print("Student from China have 8% higher probability to be wrongly predicted to have bad time management.")
print("\n")
print("FPR China train: ", FPR_china_train)
print("FPR Others train: ", FPR_others_train)
print("Student from China have a 10% higher probability to be predicted to be good at time management when they are not.")
print("\n")
print("FPR China test: ", FPR_china_test)
print("FPR Others test: ",FPR_others_test)
print("Student from other countries are 16% more likely to be predicted to be good at time management when they are not.")

FNR China train:  0.55
FNR Others train:  0.4
Student from China have 15% higher probability to be wrongly predicted to have bad time management.


FNR China test:  0.3333333333333333
FNR Others test:  0.25
Student from China have 8% higher probability to be wrongly predicted to have bad time management.


FPR China train:  0.92
FPR Others train:  0.8235294117647058
Student from China have a 10% higher probability to be predicted to be good at time management when they are not.


FPR China test:  0.8461538461538461
FPR Others test:  1.0
Student from other countries are 16% more likely to be predicted to be good at time management when they are not.


In [13]:
## TRAIN DATASET
train_data_m = train_data.loc[train_data["Gender"] == "M"]
train_data_f = train_data.loc[train_data["Gender"] == "F"]

# CALCULATE FNR FOR TRAIN DATASET
FNR_male_train = FalseNeg(train_data_m)
FNR_female_train = FalseNeg(train_data_f)

# CALCULATE FPR FOR TRAIN DATASET
FPR_male_train = FalsePos(train_data_m)
FPR_female_train = FalsePos(train_data_f)

## TEST DATASET
test_data_m = test_data.loc[test_data["Gender"] == "M"]
test_data_f = test_data.loc[test_data["Gender"] == "F"]

# CACULATE FNR FOR TEST DATASET
FNR_male_test = FalseNeg(test_data_m)
FNR_female_test = FalseNeg(test_data_f)

# CACULATE FPR FOR TEST DATASET
FPR_male_test = FalsePos(test_data_m)
FPR_female_test = FalsePos(test_data_f)


print("FNR Males train: ", FNR_male_train)
print("FNR Females train: ", FNR_female_train)
print("Female students have 21% higher probability than male student to be falsely predicted with bad time management.")
print("\n")
print("FNR Males test: ", FNR_male_test)
print("FNR Females test: ", FNR_female_test)
print("Male students have 33% higher probability than female student to be falsely predicted with bad time management.")
print("\n")
print("FPR Males train: ", FPR_male_train)
print("FPR Females train: ", FPR_female_train)
print("Female students have 4% higher probability than male student to be falsely predicted with good time management.")
print("\n")
print("FPR Males test: ", FPR_male_test)
print("FPR Females test: ", FPR_female_test)
print("Female students have 16% higher probability than male student to be falsely predicted with good time management.")

FNR Males train:  0.42857142857142855
FNR Females train:  0.6363636363636364
Female students have 21% higher probability than male student to be falsely predicted with bad time management.


FNR Males test:  0.3333333333333333
FNR Females test:  0.0
Male students have 33% higher probability than female student to be falsely predicted with bad time management.


FPR Males train:  0.8571428571428571
FPR Females train:  0.8928571428571429
Female students have 4% higher probability than male student to be falsely predicted with good time management.


FPR Males test:  0.8461538461538461
FPR Females test:  1.0
Female students have 16% higher probability than male student to be falsely predicted with good time management.
