In [94]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Load the raw survey responses.
raw_data = pd.read_csv("International students Time management data.csv")

# Fill every missing cell with the most frequent value of its own column
# (applies to the whole frame, not only to the target questions).
raw_data = raw_data.fillna(raw_data.mode().iloc[0])

# Build the target from six questionnaire items, folding the "Strong"
# answers into their plain counterparts.
y_column = raw_data[['7', '10', '11', '12', '14', '17']].replace({
    'Strong Agree': 'Agree',
    'Strong Disagree': 'Disagree',
})

# Despite its name, 'avg_response' is the row-wise *mode* of the six answers;
# when several answers tie, the first column of the mode frame is used.
raw_data['avg_response'] = y_column.mode(axis=1)[0]

# Drop respondents whose dominant answer is "Neither".  The explicit copy
# makes the filtered frame independent of the original, so adding the
# 'label' column below is not a chained assignment on a slice
# (avoids pandas' SettingWithCopyWarning).
raw_data = raw_data.loc[raw_data['avg_response'] != "Neither"].copy()

# Binary target: True when the dominant answer is 'Agree', False otherwise.
raw_data['label'] = raw_data['avg_response'] == 'Agree'

# Preprocessing pieces, later combined into a single ColumnTransformer.

# Nominal feature: fill gaps with the most frequent value, then one-hot encode.
# Categories unseen during fit are ignored at transform time instead of raising.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# kept as-is to match the version this notebook was run with - confirm before
# upgrading.
impute_and_one_hot = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

# Ordered levels for the ordinal features.  The lists are positional: the
# first applies to the first column routed to this pipeline ('Academic'),
# the second to the second column ('Attendance').
ordinal_levels = [
    ['<40%', '40%~49%', '50%~59%', '60%~70%', '>70%'],
    ['S4', 'S3', 'S2', 'S1', 'S0'],
]

# Ordinal features: impute, map each level to its rank, then standardise.
impute_and_ordinal = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=ordinal_levels)),
    ('scalar', StandardScaler()),
])

# Route each group of columns to its own sub-pipeline.
categorical_prepr = ColumnTransformer(transformers=[
    ("impute_and_one_hot", impute_and_one_hot, ['Course']),
    ("impute_and_ordinal", impute_and_ordinal, ['Academic', 'Attendance']),
])

# Hyper-parameter grid for the classifier step.  Keys follow the
# "<step name>__<parameter>" convention so the search can route each value
# to the 'sgdclassifier' step of the pipeline below.
# NOTE(review): the "log" loss and the "none" penalty spellings are tied to
# the scikit-learn version in use - confirm before upgrading.
params = dict(
    sgdclassifier__loss=["hinge", "log", "squared_hinge", "modified_huber"],
    sgdclassifier__alpha=[0.0001, 0.001, 0.01, 0.1],
    sgdclassifier__penalty=["l2", "l1", "none"],
)

# Full model: feature preprocessing followed by a linear SGD classifier.
pipe = Pipeline(steps=[
    ('features', categorical_prepr),
    ('sgdclassifier', SGDClassifier()),
])

# Exhaustive search over `params`, using all available cores.
grid = GridSearchCV(estimator=pipe, param_grid=params, n_jobs=-1)

# Hand the model only the columns the preprocessing step consumes.  This keeps
# the target-derived columns ('avg_response', 'label') out of X explicitly,
# rather than relying on the ColumnTransformer to drop them.
# Keep this list in sync with the columns routed in the ColumnTransformer.
feature_columns = ['Course', 'Academic', 'Attendance']

# Fixed random_state so the train/test partition is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    raw_data[feature_columns], raw_data['label'], random_state=1
)

# Run the grid search on the training partition only.
grid.fit(X_train, Y_train)

pred_train = grid.predict(X_train)
pred_test = grid.predict(X_test)

# Score of the refitted best estimator on the held-out partition.  Stored so
# it can be reported below; a bare expression here would discard the value.
test_score = grid.score(X_test, Y_test)

# Best mean cross-validated score found by the search, and the winning model.
print(grid.best_score_)
print(grid.best_estimator_)

print('\nTest score:', test_score)
print('\n', confusion_matrix(Y_test, pred_test))
print('\n', classification_report(Y_test, pred_test))

0.639047619047619
Pipeline(memory=None,
         steps=[('features',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('impute_and_one_hot',
                                                  Pipeline(memory=None,
                                                           steps=[('impute',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='most_frequent',
                          