In [123]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# read the data
raw_data = pd.read_csv("International students Time management data.csv")

# transform y - variable 
raw_data = raw_data.fillna(raw_data.mode().iloc[0])
y_column = raw_data[['7','10', '11', '12', '14', '17']]
raw_data['avg_response'] = y_column.mode(axis = 1)[0]
# raw_data = raw_data[raw_data['avg_response'] != 'Neither'] 
raw_data['label'] = (raw_data['avg_response'] == 'Agree') | (raw_data['avg_response'] == 'Strong Agree')


In [124]:
# Building blocks for pipeline
impute_and_one_hot = Pipeline([
    ('impute', SimpleImputer(strategy= 'most_frequent')),
    ('encode', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

impute_and_ordinal = Pipeline([
    ('impute', SimpleImputer(strategy= 'most_frequent')),
    ('ordinal', OrdinalEncoder())
])

categorical_prepr = ColumnTransformer([
    ("impute_and_one_hot", impute_and_one_hot, ['Course']),
    ("impute_and_ordinal", impute_and_ordinal, ['Academic', 'Attendance'])
])

# create pipeline model
pipe = Pipeline([
    ('features', categorical_prepr),
    ('classifier', LogisticRegression())
])

X_train, X_test, Y_train, Y_test = train_test_split(raw_data, raw_data['label'], random_state = 1)

# fit the pipeline to the training data
pipe.fit(X_train, Y_train)

# predict target values on training data
pred_train = pipe.predict(X_train)

# validate with X
pred_test = pipe.predict(X_test)
score = pipe.score(X_test, Y_test)
print("Train score:", pipe.score(X_train, Y_train))
print("Test score:", score)

Train score: 0.6236559139784946
Test score: 0.6875


In [125]:
train_data = pd.concat([X_train.reset_index(), pd.Series(pred_train, name="pred")], axis=1)

train_data['TP']df.plot(x=df['a'], y=df['b'], label=df['c'])  = (train_data['label'] == True) & (train_data['pred'] == True)
train_data['TN'] = (train_data['label'] == False) & (train_data['pred'] == False)
train_data['FN'] = (train_data['label'] == True) & (train_data['pred'] == False)
train_data['FP'] = (train_data['label'] == False) & (train_data['pred'] == True)

In [None]:
train_data.plot(x=df['a'], y=df['b'], label=df['c']) 

In [127]:
actual = train_data['label']
predicted = train_data['pred']

conf_mat = confusion_matrix(actual, predicted) 

print(conf_mat)

[[45  8]
 [27 13]]


In [128]:
display(train_data.shape)

(93, 29)

In [129]:
test_data = pd.concat([X_test.reset_index(), pd.Series(pred_test, name="pred")], axis=1)

test_data['TP'] = (test_data['label'] == True) & (test_data['pred'] == True)
test_data['TN'] = (test_data['label'] == False) & (test_data['pred'] == False)
test_data['FN'] = (test_data['label'] == True) & (test_data['pred'] == False)
test_data['FP'] = (test_data['label'] == False) & (test_data['pred'] == True)


In [130]:
actual = test_data['label']
predicted = test_data['pred']

conf_mat = confusion_matrix(actual, predicted) 

print(conf_mat)

[[16  4]
 [ 6  6]]


In [131]:
display(test_data.shape)

(32, 29)

In [133]:
# False Negative: FN/FN+TP
## The probability that a true positive will be missed by the test
## FN is the number of false negatives 
## TP is the number of true positives (FN+TP being the total number of positives).

# False Positive: FP/FP+TN
## FP is the number of false positives
## TN is the number of true negatives (FP+TN being the total number of negatives)
## It's the probability that a positive result will be given when the true value is negative

In [135]:
# SELECT ROWS IN TRAIN DATASET
train_data_china = train_data.loc[train_data["Nationality"] == "China"]
train_data_others = train_data.loc[train_data["Nationality"] != "China"]

# CALCULATE FNR FOR TRAIN DATASET
FNR_china_train = train_data_china["FN"].sum() / (train_data_china["FN"].sum() + train_data_china["TP"].sum())
FNR_others_train = train_data_others["FN"].sum() / (train_data_others["FN"].sum() + train_data_others["TP"].sum())

# CALCULATE FPR FOR TRAIN DATASET
FPR_china_train = train_data_china["FP"].sum() / (train_data_china["FP"].sum() + train_data_china["TN"].sum())
FPR_others_train = train_data_others["FP"].sum() / (train_data_others["FP"].sum() + train_data_others["TN"].sum())

# SELECT ROWS IN TEST DATASET
test_data_china = test_data.loc[test_data["Nationality"] == "China"]
test_data_others = test_data.loc[test_data["Nationality"] != "China"]

# CALCULATE FNR FOR TEST DATASET
FNR_china_test = test_data_china["FN"].sum() / (test_data_china["FN"].sum() + test_data_china["TP"].sum())
FNR_others_test = test_data_others["FN"].sum() / (test_data_others["FN"].sum() + test_data_others["TP"].sum())

# CALCULATE FPR FOR TEST DATASER
FPR_china_test = test_data_china["FP"].sum() / (test_data_china["FP"].sum() + test_data_china["TN"].sum())
FPR_others_test = test_data_others["FP"].sum() / (test_data_others["FP"].sum() + test_data_others["TN"].sum())

print("FNR China train: ", FNR_china_train)
print("FNR Others train: ", FNR_others_train)
print("Student from China have a 5% higher chance to be falsely predicted to have bad time management")
print("\n")
print("FNR China test: ", FNR_china_test)
print("FNR Others test: ",FNR_others_test)
print("Student from other countries have 37.5% higher probability to be wrongly predicted to have bad time management.")
print("\n")
print("FPR China train: ", FPR_china_train)
print("FPR Others train: ", FPR_others_train)
print("Student from China have a 10% higher probability to be predicted to be good at time management when they are not.")
print("\n")
print("FPR China test: ", FPR_china_test)
print("FPR Others test: ",FPR_others_test)
print("Student from other countries are 29.5% more likely to be predicted to be good at time management when they are not.")

FNR China train:  0.6923076923076923
FNR Others train:  0.6428571428571429
Student from China have a 5% higher chance to be falsely predicted to have bad time management


FNR China test:  0.375
FNR Others test:  0.75
Student from other countries have 37.5% higher probability to be wrongly predicted to have bad time management.


FPR China train:  0.1935483870967742
FPR Others train:  0.09090909090909091
Student from China have a 10% higher probability to be predicted to be good at time management when they are not.


FPR China test:  0.08333333333333333
FPR Others test:  0.375
Student from other countries are 29.5% more likely to be predicted to be good at time management when they are not.


In [136]:
# SELECT ROWS IN TRAIN DATASET
train_data_m = train_data.loc[train_data["Gender"] == "M"]
train_data_f = train_data.loc[train_data["Gender"] == "F"]

# CALCULATE FNR FOR TRAIN DATASET
FNR_male_train = train_data_m["FN"].sum() / (train_data_m["FN"].sum() + train_data_m["TP"].sum())
FNR_female_train = train_data_f["FN"].sum() / (train_data_f["FN"].sum() + train_data_f["TP"].sum())

# CALCULATE FPR FOR TRAIN DATASET
FPR_male_train = train_data_m["FP"].sum() / (train_data_m["FP"].sum() + train_data_m["TN"].sum())
FPR_female_train = train_data_f["FP"].sum() / (train_data_f["FP"].sum() + train_data_f["TN"].sum())

# SELECT ROWS IN TEST DATASET
test_data_m = test_data.loc[test_data["Gender"] == "M"]
test_data_f = test_data.loc[test_data["Gender"] == "F"]

# CACULATE FNR FOR TEST DATASET
FNR_male_test = test_data_m["FN"].sum() / (test_data_m["FN"].sum() + test_data_m["TP"].sum())
FNR_female_test = test_data_f["FN"].sum() / (test_data_f["FN"].sum() + test_data_f["TP"].sum())

# CACULATE FPR FOR TEST DATASET
FPR_male_test = test_data_m["FP"].sum() / (test_data_m["FP"].sum() + test_data_m["TN"].sum())
FPR_female_test = test_data_f["FP"].sum() / (test_data_f["FP"].sum() + test_data_f["TN"].sum())

print("FNR Males train: ", FNR_male_train)
print("FNR Females train: ", FNR_female_train)
print("Male students have 20% higher probability than female student to be falsely predicted with bad time management.")
print("\n")
print("FNR Males test: ", FNR_male_test)
print("FNR Females test: ", FNR_female_test)
print("There is no difference between male and female students.")
print("\n")
print("FPR Males train: ", FPR_male_train)
print("FPR Females train: ", FPR_female_train)
print("Male students have 38% higher probability than female student to be falsely predicted with good time management.")
print("\n")
print("FPR Males test: ", FPR_male_test)
print("FPR Females test: ", FPR_female_test)
print("Female students have 4% higher probability than male student to be falsely predicted with good time management.")

FNR Males train:  0.7727272727272727
FNR Females train:  0.5555555555555556
Male students have 20% higher probability than female student to be falsely predicted with bad time management.


FNR Males test:  1.0
FNR Females test:  0.4
There is no difference between male and female students.


FPR Males train:  0.15384615384615385
FPR Females train:  0.14814814814814814
Male students have 38% higher probability than female student to be falsely predicted with good time management.


FPR Males test:  0.25
FPR Females test:  0.125
Female students have 4% higher probability than male student to be falsely predicted with good time management.


In [138]:
# SELECT ROWS IN TRAIN DATASET
train_data_20s = train_data.loc[(train_data["Age"] == "18-20") | (train_data["Age"] == "21-25") | (train_data["Age"] == "26-30")]
train_data_other = train_data.loc[(train_data["Age"] == "<18") | (train_data["Age"] == "31-25") | (train_data["Age"] == ">36")]

# CALCULATE FNR FOR TRAIN DATASET
FNR_train_other = train_data_other["FN"].sum() / (train_data_other["FN"].sum() + train_data_other["TP"].sum())
FNR_train_20s = train_data_20s["FN"].sum() / (train_data_20s["FN"].sum() + train_data_20s["TP"].sum())

# CALCULATE FPR FOR TRAIN DATASET
FPR_train_other = train_data_other["FP"].sum() / (train_data_other["FP"].sum() + train_data_other["TN"].sum())
FPR_train_20s = train_data_20s["FP"].sum() / (train_data_20s["FP"].sum() + train_data_20s["TN"].sum())


# SELECT ROWS IN TEST DATASET
test_data_other = test_data.loc[(test_data["Age"] == "<18") | (test_data["Age"] == "31-25") | (test_data["Age"] == ">36")]
test_data_20s = test_data.loc[(test_data["Age"] == "18-20") | (test_data["Age"] == "21-25") | (test_data["Age"] == "26-30")]

# CALCULATE FNR FOR TEST DATASET
FNR_test_other = test_data_other["FN"].sum() / (test_data_other["FN"].sum() + test_data_other["TP"].sum())
FNR_test_20s = test_data_20s["FN"].sum() / (test_data_20s["FN"].sum() + test_data_20s["TP"].sum())

# CALCULATE FPR FOR TEST DATASET
FPR_test_other = test_data_other["FP"].sum() / (test_data_other["FP"].sum() + test_data_other["TN"].sum())
FPR_test_20s = test_data_20s["FP"].sum() / (test_data_20s["FP"].sum() + test_data_20s["TN"].sum())


print("FNR Old/Young train: ", FNR_train_other)
print("FNR 20s train: ", FNR_train_20s)
print("Age groups above/below 20-30, are 10% less likely to be falsely predicted to have bad time management.")
print("\n")
print("FNR Old/Young test: ", FNR_test_other)
print("FNR 20s test: ", FNR_test_20s)
print("Age groups above/below 20-30, are 50% less likely to be falsely predicted to have bad time management.")
print("\n")
print("FPR Old/Young train: ", FPR_train_other)
print("FPR 20s train: ", FPR_train_20s)
print("Age groups between 20-30, are 1% less likely to be falsely predicted to have good time management.")
print("\n")
print("FPR Old/Young test: ", FPR_test_other)
print("FPR 20s test: ", FPR_test_20s)
print("Age groups between 20-30, are 22% more likely to be falsely predicted to have good time management.")

# runtime error because null division

FNR Old/Young train:  0.5
FNR 20s train:  0.6571428571428571
Age groups above/below 20-30, are 10% less likely to be falsely predicted to have bad time management.


FNR Old/Young test:  nan
FNR 20s test:  0.5
Age groups above/below 20-30, are 50% less likely to be falsely predicted to have bad time management.


FPR Old/Young train:  0.16666666666666666
FPR 20s train:  0.15555555555555556
Age groups between 20-30, are 1% less likely to be falsely predicted to have good time management.


FPR Old/Young test:  0.0
FPR 20s test:  0.2222222222222222
Age groups between 20-30, are 22% more likely to be falsely predicted to have good time management.


  FNR_test_other = test_data_other["FN"].sum() / (test_data_other["FN"].sum() + test_data_other["TP"].sum())
