In [118]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, label_binarize
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix

# random seed for reproducibility
seed = 123456789
np.random.seed(seed)

raw_data = pd.read_csv("International students Time management data.csv")

#Question 6: You often feel that your life is aimless, with no definite purpose
target_column = '6'
raw_data = raw_data[raw_data[target_column] != 'Neither']
raw_data = raw_data[raw_data[target_column].notna()]
raw_data[target_column] = raw_data[target_column].replace('Strong Agree', 'Agree')
raw_data[target_column] = raw_data[target_column].replace('Strong Disagree', 'Disagree')

In [119]:
impute_and_one_hot = Pipeline([
    ('impute', SimpleImputer(strategy= 'most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore'))
])

answers = ['Strong Disagree', 'Disagree', 'Neither', 'Agree', 'Strong Agree']
percentages = ['<40%', '40%~49%', '50%~59%', '60%~70%','>70%']

impute_and_ordinal = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[percentages,
                                           ['S4', 'S3', 'S2', 'S1', 'S0'], 
                                           percentages,
                                           ['<18', '18-20', '21-25', '26-30', '31-35', '>36'],
                                           answers, answers, answers, answers])),
    ('scaler', StandardScaler())
])

feature_encoding = ColumnTransformer([
    ("impute_and_one_hot", impute_and_one_hot, ['Course']),
    ("impute_and_ordinal", impute_and_ordinal, ['Academic', 'Attendance', 'English', 'Age',  
                                                '8', '12', '14', '15'])
])

pipeline = Pipeline([
    ('features', feature_encoding),
    ('sgdclassifier', SGDClassifier(loss='log'))
])

In [120]:
hyperparam_grid = {
    "sgdclassifier__alpha" : [0.01, 0.03, 0.1, 0.3],
    "sgdclassifier__penalty" : ["l2", "l1", "elasticnet"],
    "sgdclassifier__eta0": [0.01, 0.03, 0.1, 0.3]
}

grid_search = GridSearchCV(pipeline, param_grid=hyperparam_grid, cv=3)

In [125]:
X_train, X_test, y_train_raw, y_test_raw = \
  train_test_split(raw_data, raw_data[target_column], test_size=.3, random_state=seed)

y_train = np.squeeze(label_binarize(y_train_raw, classes=['Agree', 'Disagree']))
y_test = np.squeeze(label_binarize(y_test_raw, classes=['Agree', 'Disagree']))

model = grid_search.fit(X_train, y_train)

In [126]:
model.score(X_test, y_test)

0.7931034482758621

In [127]:
confusion_matrix(y_test, model.predict(X_test))

array([[ 4,  3],
       [ 3, 19]])