In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# read the data
raw_data = pd.read_csv("International students Time management data.csv")
pd.set_option("display.max.columns", None)

impute_and_one_hot = Pipeline([
    ('impute', SimpleImputer(strategy= 'most_frequent')),
    ('encode', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

impute_and_ordinal = Pipeline([
    ('impute', SimpleImputer(strategy= 'most_frequent')),
    ('ordinal', OrdinalEncoder())
])

categorical_prepr = ColumnTransformer([
    ("impute_and_one_hot", impute_and_one_hot, ['Course']),
    ("impute_and_ordinal", impute_and_ordinal, ['Academic', 'Attendance'])
])

# transform y variable
raw_data['label'] = (raw_data['7'] == 'Agree') | (raw_data['7'] == 'Strong Agree')

# create pipeline model
pipe = Pipeline([
    ('features', categorical_prepr),
    ('classifier', LogisticRegression())
])

X_train, X_test, Y_train, Y_test = train_test_split(raw_data, raw_data['label'], random_state=1)

# fit the pipeline to the training data
pipe.fit(X_train, Y_train)

# predict target values on training data
pred_train = pipe.predict(X_train)

# validate with X
pred_test = pipe.predict(X_test)
score = pipe.score(X_test, Y_test)
print("Train score:", pipe.score(X_train, Y_train))
print("Test score:", score)

Train score: 0.6451612903225806
Test score: 0.71875


In [52]:
train_data = pd.concat([X_train.reset_index(), pd.Series(pred_train, name="pred")], axis=1)

train_data['TP'] = (train_data['label'] == True) & (train_data['pred'] == True)
train_data['TN'] = (train_data['label'] == False) & (train_data['pred'] == False)
train_data['FN'] = (train_data['label'] == True) & (train_data['pred'] == False)
train_data['FP'] = (train_data['label'] == False) & (train_data['pred'] == True)

In [53]:
test_data = pd.concat([X_test.reset_index(), pd.Series(pred_test, name="pred")], axis=1)

test_data['TP'] = (test_data['label'] == True) & (test_data['pred'] == True)
test_data['TN'] = (test_data['label'] == False) & (test_data['pred'] == False)
test_data['FN'] = (test_data['label'] == True) & (test_data['pred'] == False)
test_data['FP'] = (test_data['label'] == False) & (test_data['pred'] == True)


In [80]:
# False Negative: FN/FN+TP
## The probability that a true positive will be missed by the test
## FN is the number of false negatives 
## TP is the number of true positives (FN+TP being the total number of positives).

# False Positive: FP/FP+TN
## FP is the number of false positives
## TN is the number of true negatives (FP+TN being the total number of negatives)
## It's the probability that a positive result will be given when the true value is negative

In [81]:
# SELECT ROWS IN TRAIN DATASET
train_data_china = train_data.loc[train_data["Nationality"] == "China"]
train_data_others = train_data.loc[train_data["Nationality"] != "China"]

# CALCULATE FNR FOR TRAIN DATASET
FNR_china_train = train_data_china["FN"].sum() / (train_data_china["FN"].sum() + train_data_china["TP"].sum())
FNR_others_train = train_data_others["FN"].sum() / (train_data_others["FN"].sum() + train_data_others["TP"].sum())

# CALCULATE FPR FOR TRAIN DATASET
FPR_china_train = train_data_china["FP"].sum() / (train_data_china["FP"].sum() + train_data_china["TN"].sum())
FPR_others_train = train_data_others["FP"].sum() / (train_data_others["FP"].sum() + train_data_others["TN"].sum())

# SELECT ROWS IN TEST DATASET
test_data_china = test_data.loc[test_data["Nationality"] == "China"]
test_data_others = test_data.loc[test_data["Nationality"] != "China"]

# CALCULATE FNR FOR TEST DATASET
FNR_china_test = test_data_china["FN"].sum() / (test_data_china["FN"].sum() + test_data_china["TP"].sum())
FNR_others_test = test_data_others["FN"].sum() / (test_data_others["FN"].sum() + test_data_others["TP"].sum())

# CALCULATE FPR FOR TEST DATASER
FPR_china_test = test_data_china["FP"].sum() / (test_data_china["FP"].sum() + test_data_china["TN"].sum())
FPR_others_test = test_data_others["FP"].sum() / (test_data_others["FP"].sum() + test_data_others["TN"].sum())

print("FNR China train: ", FNR_china_train)
print("FNR Others train: ", FNR_others_train)
print("\n")
print("FNR China test: ", FNR_china_test)
print("FNR Others test: ",FNR_others_test)
print("\n")
print("FPR China train: ", FPR_china_train)
print("FPR Others train: ", FPR_others_train)
print("\n")
print("FPR China test: ", FPR_china_test)
print("FPR Others test: ",FPR_others_test)

FNR China train:  1.0
FNR Others train:  0.9166666666666666


FNR China test:  0.6666666666666666
FNR Others test:  1.0


FPR China train:  0.05405405405405406
FPR Others train:  0.0


FPR China test:  0.0
FPR Others test:  0.125


In [83]:
# SELECT ROWS IN TRAIN DATASET
train_data_m = train_data.loc[train_data["Gender"] == "M"]
train_data_f = train_data.loc[train_data["Gender"] == "F"]

# CALCULATE FNR FOR TRAIN DATASET
FNR_male_train = train_data_m["FN"].sum() / (train_data_m["FN"].sum() + train_data_m["TP"].sum())
FNR_female_train = train_data_f["FN"].sum() / (train_data_f["FN"].sum() + train_data_f["TP"].sum())

# CALCULATE FPR FOR TRAIN DATASET
FPR_male_train = train_data_m["FP"].sum() / (train_data_m["FP"].sum() + train_data_m["TN"].sum())
FPR_female_train = train_data_f["FP"].sum() / (train_data_f["FP"].sum() + train_data_f["TN"].sum())

# SELECT ROWS IN TEST DATASET
test_data_m = test_data.loc[test_data["Gender"] == "M"]
test_data_f = test_data.loc[test_data["Gender"] == "F"]

# CACULATE FNR FOR TEST DATASET
FNR_male_test = test_data_m["FN"].sum() / (test_data_m["FN"].sum() + test_data_m["TP"].sum())
FNR_female_test = test_data_f["FN"].sum() / (test_data_f["FN"].sum() + test_data_f["TP"].sum())

# CACULATE FPR FOR TEST DATASET
FPR_male_test = test_data_m["FP"].sum() / (test_data_m["FP"].sum() + test_data_m["TN"].sum())
FPR_female_test = test_data_f["FP"].sum() / (test_data_f["FP"].sum() + test_data_f["TN"].sum())

print("FNR Males train: ", FNR_male_train)
print("FNR Females train: ", FNR_female_train)
print("\n")
print("FNR Males test: ", FNR_male_test)
print("FNR Females test: ", FNR_female_test)
print("\n")
print("FPR Males train: ", FPR_male_train)
print("FPR Females train: ", FPR_female_train)
print("\n")
print("FPR Males test: ", FPR_male_test)
print("FPR Females test: ", FPR_female_test)

FNR Males train:  0.9523809523809523
FNR Females train:  1.0


FNR Males test:  0.75
FNR Females test:  0.8333333333333334


FPR Males train:  0.037037037037037035
FPR Females train: %.2f  0.029411764705882353


FPR Males test:  0.0
FPR Females test:  0.08333333333333333


In [86]:
# SELECT ROWS IN TRAIN DATASET
train_data_young = train_data.loc[train_data["Age"] == "<18"]
train_data_20s = train_data.loc[(train_data["Age"] == "18-20") | (train_data["Age"] == "21-25") | (train_data["Age"] == "26-30")]
train_data_older = train_data.loc[(train_data["Age"] == "31-25") | (train_data["Age"] == ">36")]

# CALCULATE FNR FOR TRAIN DATASET
FNR_train_young = train_data_young["FN"].sum() / (train_data_young["FN"].sum() + train_data_young["TP"].sum())
FNR_train_20s = train_data_20s["FN"].sum() / (train_data_20s["FN"].sum() + train_data_20s["TP"].sum())
FNR_train_older = train_data_older["FN"].sum() / (train_data_older["FN"].sum() + train_data_older["TP"].sum())

# CALCULATE FPR FOR TRAIN DATASET
FPR_train_young = train_data_young["FP"].sum() / (train_data_young["FP"].sum() + train_data_young["TN"].sum())
FPR_train_20s = train_data_20s["FP"].sum() / (train_data_20s["FP"].sum() + train_data_20s["TN"].sum())
FPR_train_older = train_data_older["FP"].sum() / (train_data_older["FP"].sum() + train_data_older["TN"].sum())

# SELECT ROWS IN TEST DATASET
test_data_young = test_data.loc[test_data["Age"] == "<18"]
test_data_20s = test_data.loc[(test_data["Age"] == "18-20") | (test_data["Age"] == "21-25") | (test_data["Age"] == "26-30")]
test_data_older = test_data.loc[(test_data["Age"] == "31-25") | (test_data["Age"] == ">36")]

# CALCULATE FNR FOR TEST DATASET
FNR_test_young = test_data_young["FN"].sum() / (test_data_young["FN"].sum() + test_data_young["TP"].sum())
FNR_test_20s = test_data_20s["FN"].sum() / (test_data_20s["FN"].sum() + test_data_20s["TP"].sum())
FNR_test_older = test_data_older["FN"].sum() / (test_data_older["FN"].sum() + test_data_older["TP"].sum())


# CALCULATE FPR FOR TEST DATASET
FPR_test_young = test_data_young["FP"].sum() / (test_data_young["FP"].sum() + test_data_young["TN"].sum())
FPR_test_20s = test_data_20s["FP"].sum() / (test_data_20s["FP"].sum() + test_data_20s["TN"].sum())
FPR_test_older = test_data_older["FP"].sum() / (test_data_older["FP"].sum() + test_data_older["TN"].sum())


print("FNR Young train: ", FNR_train_young)
print("FNR 20s train: ", FNR_train_20s)
print("FNR older train: ", FNR_train_older)
print("\n")
print("FNR Young test: ", FNR_test_young)
print("FNR 20s test: ", FNR_test_20s)
print("FNR older test: ", FNR_test_older)
print("\n")
print("FPR Young train: ", FPR_train_young)
print("FPR 20s train: ", FPR_train_20s)
print("FPR older train: ", FPR_train_older)
print("\n")
print("FPR Young test: ", FPR_test_young)
print("FPR 20s test: ", FPR_test_20s)
print("FPR older test: ", FPR_test_older)

FNR Young train:  1.0
FNR 20s train:  0.9615384615384616
FNR older train:  1.0


FNR Young test:  1.0
FNR 20s test:  0.7777777777777778
FNR older test:  nan


FPR Young train:  0.0
FPR 20s train:  0.037037037037037035
FPR older train:  0.0


FPR Young test:  0.0
FPR 20s test:  0.047619047619047616
FPR older test:  nan


  FNR_test_older = test_data_older["FN"].sum() / (test_data_older["FN"].sum() + test_data_older["TP"].sum())
  FPR_test_older = test_data_older["FP"].sum() / (test_data_older["FP"].sum() + test_data_older["TN"].sum())
