In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# read the data
raw_data = pd.read_csv("International students Time management data.csv")
pd.set_option("display.max.columns", None)

# transform y - variable 
raw_data = raw_data.fillna(raw_data.mode().iloc[0])
y_column = raw_data[['6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17']]
raw_data['avg_response'] = y_column.mode(axis = 1)[0]
raw_data['label'] = (raw_data['avg_response'] == 'Agree') | (raw_data['avg_response'] == 'Strong Agree')

# # remove rows where response is neutral 
# raw_data = raw_data[raw_data['avg_response'].str.contains("Neither")==False]

In [24]:
display(raw_data['avg_response'].value_counts())

Agree              47
Neither            38
Disagree           32
Strong Disagree     4
Strong Agree        4
Name: avg_response, dtype: int64

In [34]:
# Building blocks for pipeline
impute_and_one_hot = Pipeline([
    ('impute', SimpleImputer(strategy= 'most_frequent')),
    ('encode', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

impute_and_ordinal = Pipeline([
    ('impute', SimpleImputer(strategy= 'most_frequent')),
    ('ordinal', OrdinalEncoder())
])

categorical_prepr = ColumnTransformer([
    ("impute_and_one_hot", impute_and_one_hot, ['Course', 'Academic', 'Attendance'])
#     ("impute_and_ordinal", impute_and_ordinal, ['Academic', 'Attendance'])
])

# create pipeline model
pipe = Pipeline([
    ('features', categorical_prepr),
    ('classifier', LogisticRegression())
])

X_train, X_test, Y_train, Y_test = train_test_split(raw_data, raw_data['label'], random_state = 1)

# fit the pipeline to the training data
pipe.fit(X_train, Y_train)

# predict target values on training data
pred_train = pipe.predict(X_train)

# validate with X
pred_test = pipe.predict(X_test)
score = pipe.score(X_test, Y_test)
print("Train score:", pipe.score(X_train, Y_train))
print("Test score:", score)

Train score: 0.6021505376344086
Test score: 0.65625


In [35]:
print(raw_data['avg_response'].value_counts())

Agree              47
Neither            38
Disagree           32
Strong Disagree     4
Strong Agree        4
Name: avg_response, dtype: int64


In [36]:
pd.set_option("display.max.rows", None)
display(y_column.mode(axis=1))

Unnamed: 0,0,1,2
0,Agree,Disagree,Neither
1,Neither,,
2,Disagree,,
3,Agree,Disagree,Neither
4,Neither,,
5,Strong Agree,,
6,Neither,,
7,Agree,,
8,Disagree,,
9,Strong Disagree,,


In [37]:
display(raw_data)

Unnamed: 0,Number,Age,Gender,Nationality,Program,Course,English,Academic,Attendance,6,7,8,9,10,11,12,13,14,15,16,17,avg_response,label
0,1,31-35,M,Korea,PM,Social Sciences and Humanities,60%~70%,60%~70%,S0,Disagree,Agree,Strong Agree,Neither,Agree,Neither,Disagree,Strong Disagree,Strong Agree,Neither,Disagree,Agree,Agree,True
1,2,26-30,M,China,PM,Science and engineering,60%~70%,50%~59%,S3,Strong Agree,Agree,Neither,Disagree,Agree,Neither,Disagree,Strong Disagree,Neither,Agree,Neither,Disagree,Neither,False
2,3,26-30,M,Kenya,PM,Business,60%~70%,60%~70%,S0,Disagree,Strong Agree,Agree,Disagree,Agree,Agree,Disagree,Strong Disagree,Disagree,Strong Agree,Strong Agree,Disagree,Disagree,False
3,4,21-25,M,Vietnam,PM,Law/Legal studies,60%~70%,60%~70%,S0,Disagree,Disagree,Agree,Agree,Disagree,Agree,Neither,Neither,Neither,Neither,Disagree,Agree,Agree,True
4,5,21-25,M,China,PM,Business,60%~70%,50%~59%,S1,Neither,Disagree,Neither,Neither,Disagree,Neither,Neither,Neither,Neither,Disagree,Neither,Agree,Neither,False
5,6,21-25,M,China,PM,Law/Legal studies,60%~70%,60%~70%,S0,Agree,Neither,Agree,Strong Agree,Disagree,Disagree,Strong Agree,Agree,Strong Agree,Strong Agree,Strong Disagree,Strong Agree,Strong Agree,True
6,7,21-25,M,China,PM,Art and Design,60%~70%,60%~70%,S0,Disagree,Agree,Agree,Neither,Agree,Neither,Neither,Strong Disagree,Disagree,Strong Disagree,Strong Disagree,Neither,Neither,False
7,8,21-25,M,China,PM,Business,60%~70%,60%~70%,S0,Agree,Neither,Neither,Agree,Agree,Neither,Agree,Neither,Agree,Agree,Agree,Agree,Agree,True
8,9,21-25,M,China,PM,Computing,60%~70%,40%~49%,S0,Disagree,Disagree,Strong Disagree,Agree,Disagree,Agree,Agree,Disagree,Agree,Disagree,Disagree,Disagree,Disagree,False
9,10,21-25,M,China,PM,Computing,60%~70%,>70%,S0,Strong Disagree,Agree,Agree,Neither,Disagree,Strong Disagree,Strong Disagree,Strong Disagree,Agree,Agree,Strong Disagree,Strong Disagree,Strong Disagree,False


In [38]:
train_data = pd.concat([X_train.reset_index(), pd.Series(pred_train, name="pred")], axis=1)

train_data['TP'] = (train_data['label'] == True) & (train_data['pred'] == True)
train_data['TN'] = (train_data['label'] == False) & (train_data['pred'] == False)
train_data['FN'] = (train_data['label'] == True) & (train_data['pred'] == False)
train_data['FP'] = (train_data['label'] == False) & (train_data['pred'] == True)

In [30]:
test_data = pd.concat([X_test.reset_index(), pd.Series(pred_test, name="pred")], axis=1)

test_data['TP'] = (test_data['label'] == True) & (test_data['pred'] == True)
test_data['TN'] = (test_data['label'] == False) & (test_data['pred'] == False)
test_data['FN'] = (test_data['label'] == True) & (test_data['pred'] == False)
test_data['FP'] = (test_data['label'] == False) & (test_data['pred'] == True)


In [41]:
test_data['pred'].value_counts() #-> all false
#train_data['pred'].value_counts() -> 1 true rest false


# test_data['label'].value_counts() -> 19:F, 13:T
# train_data['label'].value_counts() -> 38 true and 55 false

False    32
Name: pred, dtype: int64

In [32]:
# False Negative: FN/FN+TP
## The probability that a true positive will be missed by the test
## FN is the number of false negatives 
## TP is the number of true positives (FN+TP being the total number of positives).

# False Positive: FP/FP+TN
## FP is the number of false positives
## TN is the number of true negatives (FP+TN being the total number of negatives)
## It's the probability that a positive result will be given when the true value is negative

In [33]:
# SELECT ROWS IN TRAIN DATASET
train_data_china = train_data.loc[train_data["Nationality"] == "China"]
train_data_others = train_data.loc[train_data["Nationality"] != "China"]

# CALCULATE FNR FOR TRAIN DATASET
FNR_china_train = train_data_china["FN"].sum() / (train_data_china["FN"].sum() + train_data_china["TP"].sum())
FNR_others_train = train_data_others["FN"].sum() / (train_data_others["FN"].sum() + train_data_others["TP"].sum())

# CALCULATE FPR FOR TRAIN DATASET
FPR_china_train = train_data_china["FP"].sum() / (train_data_china["FP"].sum() + train_data_china["TN"].sum())
FPR_others_train = train_data_others["FP"].sum() / (train_data_others["FP"].sum() + train_data_others["TN"].sum())

# SELECT ROWS IN TEST DATASET
test_data_china = test_data.loc[test_data["Nationality"] == "China"]
test_data_others = test_data.loc[test_data["Nationality"] != "China"]

# CALCULATE FNR FOR TEST DATASET
FNR_china_test = test_data_china["FN"].sum() / (test_data_china["FN"].sum() + test_data_china["TP"].sum())
FNR_others_test = test_data_others["FN"].sum() / (test_data_others["FN"].sum() + test_data_others["TP"].sum())

# CALCULATE FPR FOR TEST DATASER
FPR_china_test = test_data_china["FP"].sum() / (test_data_china["FP"].sum() + test_data_china["TN"].sum())
FPR_others_test = test_data_others["FP"].sum() / (test_data_others["FP"].sum() + test_data_others["TN"].sum())

print("FNR China train: ", FNR_china_train)
print("FNR Others train: ", FNR_others_train)
print("Student from China are 10% more likely to be falsely predicted to have bad time management.")
print("\n")
print("FNR China test: ", FNR_china_test)
print("FNR Others test: ",FNR_others_test)
print("Student from other countries have 34% higher probability to be wrongly predicted to have bad time management.")
print("\n")
print("FPR China train: ", FPR_china_train)
print("FPR Others train: ", FPR_others_train)
print("Student from China have a 50% higher probability to be predicted to be good at time management when they are not.")
print("\n")
print("FPR China test: ", FPR_china_test)
print("FPR Others test: ",FPR_others_test)
print("Student from other countries are 12.5% more likely to be predicted to be good at time management when they are not.")

FNR China train:  1.0
FNR Others train:  0.9166666666666666
Student from China are 10% more likely to be falsely predicted to have bad time management.


FNR China test:  1.0
FNR Others test:  1.0
Student from other countries have 34% higher probability to be wrongly predicted to have bad time management.


FPR China train:  0.0
FPR Others train:  0.0
Student from China have a 50% higher probability to be predicted to be good at time management when they are not.


FPR China test:  0.0
FPR Others test:  0.0
Student from other countries are 12.5% more likely to be predicted to be good at time management when they are not.


In [39]:
# SELECT ROWS IN TRAIN DATASET
train_data_m = train_data.loc[train_data["Gender"] == "M"]
train_data_f = train_data.loc[train_data["Gender"] == "F"]

# CALCULATE FNR FOR TRAIN DATASET
FNR_male_train = train_data_m["FN"].sum() / (train_data_m["FN"].sum() + train_data_m["TP"].sum())
FNR_female_train = train_data_f["FN"].sum() / (train_data_f["FN"].sum() + train_data_f["TP"].sum())

# CALCULATE FPR FOR TRAIN DATASET
FPR_male_train = train_data_m["FP"].sum() / (train_data_m["FP"].sum() + train_data_m["TN"].sum())
FPR_female_train = train_data_f["FP"].sum() / (train_data_f["FP"].sum() + train_data_f["TN"].sum())

# SELECT ROWS IN TEST DATASET
test_data_m = test_data.loc[test_data["Gender"] == "M"]
test_data_f = test_data.loc[test_data["Gender"] == "F"]

# CACULATE FNR FOR TEST DATASET
FNR_male_test = test_data_m["FN"].sum() / (test_data_m["FN"].sum() + test_data_m["TP"].sum())
FNR_female_test = test_data_f["FN"].sum() / (test_data_f["FN"].sum() + test_data_f["TP"].sum())

# CACULATE FPR FOR TEST DATASET
FPR_male_test = test_data_m["FP"].sum() / (test_data_m["FP"].sum() + test_data_m["TN"].sum())
FPR_female_test = test_data_f["FP"].sum() / (test_data_f["FP"].sum() + test_data_f["TN"].sum())

print("FNR Males train: ", FNR_male_train)
print("FNR Females train: ", FNR_female_train)
print("Female students have 5% higher probability than male student to be falsely predicted with bad time management.")
print("\n")
print("FNR Males test: ", FNR_male_test)
print("Female students have 9% higher probability than male student to be falsely predicted with bad time management.")
print("FNR Females test: ", FNR_female_test)
print("\n")
print("FPR Males train: ", FPR_male_train)
print("FPR Females train: ", FPR_female_train)
print("Male students have 1% higher probability than female student to be falsely predicted with good time management.")
print("\n")
print("FPR Males test: ", FPR_male_test)
print("FPR Females test: ", FPR_female_test)
print("Female students have 8% higher probability than male student to be falsely predicted with good time management.")

FNR Males train:  0.8
FNR Females train:  0.9444444444444444
Female students have 5% higher probability than male student to be falsely predicted with bad time management.


FNR Males test:  1.0
Female students have 9% higher probability than male student to be falsely predicted with bad time management.
FNR Females test:  1.0


FPR Males train:  0.14285714285714285
FPR Females train:  0.0
Male students have 1% higher probability than female student to be falsely predicted with good time management.


FPR Males test:  0.0
FPR Females test:  0.0
Female students have 8% higher probability than male student to be falsely predicted with good time management.


In [151]:
train_data_others['label'].value_counts()

False    24
True     12
Name: label, dtype: int64

In [152]:
# SELECT ROWS IN TRAIN DATASET
train_data_other = train_data.loc[(train_data["Age"] == "<18") | (train_data["Age"] == "31-25") | (train_data["Age"] == ">36")]
train_data_20s = train_data.loc[(train_data["Age"] == "18-20") | (train_data["Age"] == "21-25") | (train_data["Age"] == "26-30")]

# CALCULATE FNR FOR TRAIN DATASET
FNR_train_other = train_data_other["FN"].sum() / (train_data_other["FN"].sum() + train_data_other["TP"].sum())
FNR_train_20s = train_data_20s["FN"].sum() / (train_data_20s["FN"].sum() + train_data_20s["TP"].sum())

# CALCULATE FPR FOR TRAIN DATASET
FPR_train_other = train_data_other["FP"].sum() / (train_data_other["FP"].sum() + train_data_other["TN"].sum())
FPR_train_20s = train_data_20s["FP"].sum() / (train_data_20s["FP"].sum() + train_data_20s["TN"].sum())


# SELECT ROWS IN TEST DATASET
test_data_other = test_data.loc[(test_data["Age"] == "<18") | (test_data["Age"] == "31-25") | (test_data["Age"] == ">36")]
test_data_20s = test_data.loc[(test_data["Age"] == "18-20") | (test_data["Age"] == "21-25") | (test_data["Age"] == "26-30")]

# CALCULATE FNR FOR TEST DATASET
FNR_test_other = test_data_other["FN"].sum() / (test_data_other["FN"].sum() + test_data_other["TP"].sum())
FNR_test_20s = test_data_20s["FN"].sum() / (test_data_20s["FN"].sum() + test_data_20s["TP"].sum())

# CALCULATE FPR FOR TEST DATASET
FPR_test_other = test_data_other["FP"].sum() / (test_data_other["FP"].sum() + test_data_other["TN"].sum())
FPR_test_20s = test_data_20s["FP"].sum() / (test_data_20s["FP"].sum() + test_data_20s["TN"].sum())


print("FNR Old/Young train: ", FNR_train_other)
print("FNR 20s train: ", FNR_train_20s)
print("Age groups above/below 20-30, are 4% more likely to be falsely predicted to have bad time management.")
print("\n")
print("FNR Old/Young test: ", FNR_test_other)
print("FNR 20s test: ", FNR_test_20s)
print("Age groups above/below 20-30, are 23% more likely to be falsely predicted to have bad time management.")
print("\n")
print("FPR Old/Young train: ", FPR_train_other)
print("FPR 20s train: ", FPR_train_20s)
print("Age groups between 20-30, are 4% more likely to be falsely predicted to have good time management.")
print("\n")
print("FPR Old/Young test: ", FPR_test_other)
print("FPR 20s test: ", FPR_test_20s)
print("Age groups between 20-30, are 4% more likely to be falsely predicted to have good time management.")

FNR Old/Young train:  1.0
FNR 20s train:  0.967741935483871
Age groups above/below 20-30, are 4% more likely to be falsely predicted to have bad time management.


FNR Old/Young test:  nan
FNR 20s test:  1.0
Age groups above/below 20-30, are 23% more likely to be falsely predicted to have bad time management.


FPR Old/Young train:  0.0
FPR 20s train:  0.0
Age groups between 20-30, are 4% more likely to be falsely predicted to have good time management.


FPR Old/Young test:  0.0
FPR 20s test:  0.0
Age groups between 20-30, are 4% more likely to be falsely predicted to have good time management.


  FNR_test_other = test_data_other["FN"].sum() / (test_data_other["FN"].sum() + test_data_other["TP"].sum())


In [156]:
# https://towardsdatascience.com/confusion-matrix-what-is-it-e859e1bbecdc
from sklearn.metrics import confusion_matrix

actual = train_data_china['label']
predicted = train_data_china['pred']

conf_mat = confusion_matrix(actual, predicted) 

print(conf_mat)

[[31  0]
 [26  0]]
