In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# read the data
raw_data = pd.read_csv("International students Time management data.csv")
pd.set_option("display.max.columns", None)

impute_and_one_hot = Pipeline([
    ('impute', SimpleImputer(strategy= 'most_frequent')),
    ('encode', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

impute_and_ordinal = Pipeline([
    ('impute', SimpleImputer(strategy= 'most_frequent')),
    ('ordinal', OrdinalEncoder())
])

categorical_prepr = ColumnTransformer([
    ("impute_and_one_hot", impute_and_one_hot, ['Course']),
    ("impute_and_ordinal", impute_and_ordinal, ['Academic', 'Attendance'])
])

# transform y variable
raw_data['label'] = (raw_data['7'] == 'Agree') | (raw_data['7'] == 'Strong Agree')

# create pipeline model
pipe = Pipeline([
    ('features', categorical_prepr),
    ('classifier', LogisticRegression())
])

X_train, X_test, Y_train, Y_test = train_test_split(raw_data, raw_data['label'], random_state=1)

# fit the pipeline to the training data
pipe.fit(X_train, Y_train)

# predict target values on training data
pred_train = pipe.predict(X_train)

# validate with X
pred_test = pipe.predict(X_test)
score = pipe.score(X_test, Y_test)
print("Train score:", pipe.score(X_train, Y_train))
print("Test score:", score)

Train score: 0.6451612903225806
Test score: 0.71875


In [2]:
print("Raw data shape:", raw_data.shape)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Y_train shape:", Y_train.shape)
print("Y_test shape:", Y_test.shape)

Raw data shape: (125, 22)
X_train shape: (93, 22)
X_test shape: (32, 22)
Y_train shape: (93,)
Y_test shape: (32,)


In [3]:
X_test.head()

Unnamed: 0,Number,Age,Gender,Nationality,Program,Course,English,Academic,Attendance,6,7,8,9,10,11,12,13,14,15,16,17,label
74,75,18-20,F,Vietnam,IYO,Business,,>70%,S0,Neither,Disagree,Strong Agree,Agree,Disagree,Neither,Neither,Disagree,Disagree,Neither,Agree,Neither,False
45,46,21-25,M,China,PM,Business,60%~70%,50%~59%,S0,Disagree,Disagree,Agree,Agree,Disagree,Disagree,Disagree,Disagree,Disagree,Disagree,Disagree,Agree,False
48,49,21-25,M,China,PM,Business,50%~59%,40%~49%,S3,Neither,Neither,Neither,Disagree,Agree,Neither,Strong Agree,Neither,Disagree,Neither,Neither,Disagree,False
31,32,18-20,M,China,FC,Business,50%~59%,,S0,Neither,Disagree,Agree,Disagree,Neither,Agree,Disagree,Agree,Neither,Strong Agree,Agree,Strong Agree,False
65,66,26-30,F,China,PM,Business,>70%,,S0,Disagree,Agree,Agree,Neither,Agree,Disagree,Neither,Strong Disagree,Agree,Disagree,Disagree,Agree,True


In [4]:
X_train.head()

Unnamed: 0,Number,Age,Gender,Nationality,Program,Course,English,Academic,Attendance,6,7,8,9,10,11,12,13,14,15,16,17,label
42,43,31-35,M,China,PM,Business,50%~59%,50%~59%,S2,Strong Agree,Strong Agree,Strong Agree,Strong Agree,Strong Agree,Strong Agree,Agree,Agree,Strong Agree,Strong Agree,Strong Agree,Strong Agree,True
54,55,21-25,M,China,PM,Business,50%~59%,60%~70%,S1,Agree,Neither,Agree,Neither,Disagree,Disagree,Neither,Neither,Agree,Agree,Neither,Agree,False
102,103,21-25,F,China,PM,Business,50%~59%,50%~59%,S1,Disagree,Neither,,Neither,Neither,Neither,Neither,Strong Disagree,Neither,Neither,Neither,Neither,False
117,118,21-25,M,China,PM,Business,60%~70%,50%~59%,S0,Disagree,Disagree,Agree,Agree,Agree,Agree,Neither,Strong Disagree,Agree,Agree,Strong Agree,Strong Agree,False
55,56,31-35,M,China,PM,Business,50%~59%,50%~59%,S0,Disagree,Neither,Disagree,Disagree,Neither,Agree,Disagree,Disagree,Agree,Neither,Agree,Neither,False


In [5]:
pred_test

array([False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False,  True])

In [6]:
Y_test

74     False
45     False
48     False
31     False
65      True
66      True
118    False
100     True
115    False
33     False
62     False
44     False
105     True
104    False
119    False
106    False
35     False
103    False
122    False
2       True
53      True
123    False
110    False
56     False
90      True
51      True
83     False
17     False
95      True
78     False
46      True
69     False
Name: label, dtype: bool

In [8]:
train_data = pd.concat([X_train, pd.Series(pred_train, name="pred")], axis=1)

In [13]:
train_data.head()

Unnamed: 0,Number,Age,Gender,Nationality,Program,Course,English,Academic,Attendance,6,7,8,9,10,11,12,13,14,15,16,17,label,pred
0,1.0,31-35,M,Korea,PM,Social Sciences and Humanities,60%~70%,,S0,Disagree,Agree,Strong Agree,Neither,Agree,Neither,Disagree,Strong Disagree,Strong Agree,Neither,Disagree,Agree,True,False
1,2.0,26-30,M,China,PM,Science and engineering,60%~70%,50%~59%,S3,Strong Agree,Agree,Neither,Disagree,Agree,Neither,Disagree,Strong Disagree,Neither,Agree,Neither,Disagree,True,False
2,,,,,,,,,,,,,,,,,,,,,,,False
3,4.0,21-25,M,Vietnam,PM,Law/Legal studies,60%~70%,60%~70%,S0,Disagree,Disagree,Agree,Agree,Disagree,Agree,Neither,Neither,Neither,Neither,Disagree,Agree,False,False
4,5.0,21-25,M,China,PM,Business,60%~70%,50%~59%,S1,Neither,Disagree,Neither,Neither,Disagree,Neither,Neither,Neither,Neither,Disagree,Neither,Agree,False,False


In [24]:
train_data[["Nationality", "label", "pred"]].value_counts()

Nationality   label  pred 
China         False  False    26
              True   False    16
Oman          False  False     3
              True   False     2
Vietnam       False  False     2
Saudi Arabia  False  False     2
Kuwait        False  False     2
Turkey        True   False     1
Thailand      False  False     1
Tanzania      False  False     1
Singapore     False  False     1
Saudi Arabia  True   False     1
Qatar         False  False     1
Portugal      True   False     1
Pakistan      False  False     1
OCEAN         False  False     1
China         False  True      1
Malaysia      False  False     1
Korea         True   False     1
              False  False     1
Jordan        False  False     1
Indonesia     True   False     1
India         True   True      1
                     False     1
              False  False     1
Georgia       True   False     1
Zimbabwe      False  False     1
dtype: int64

In [10]:
test_data = pd.concat([X_test, pd.Series(pred_test, name="pred")], axis=1)

In [11]:
test_data.head(10)

Unnamed: 0,Number,Age,Gender,Nationality,Program,Course,English,Academic,Attendance,6,7,8,9,10,11,12,13,14,15,16,17,label,pred
0,,,,,,,,,,,,,,,,,,,,,,,False
1,,,,,,,,,,,,,,,,,,,,,,,False
2,3.0,26-30,M,Kenya,PM,Business,60%~70%,,S0,Disagree,Strong Agree,Agree,Disagree,Agree,Agree,Disagree,Strong Disagree,Disagree,Strong Agree,Strong Agree,Disagree,True,False
3,,,,,,,,,,,,,,,,,,,,,,,False
4,,,,,,,,,,,,,,,,,,,,,,,False
5,,,,,,,,,,,,,,,,,,,,,,,False
6,,,,,,,,,,,,,,,,,,,,,,,False
7,,,,,,,,,,,,,,,,,,,,,,,True
8,,,,,,,,,,,,,,,,,,,,,,,False
9,,,,,,,,,,,,,,,,,,,,,,,False
