In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, label_binarize
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

# Random seed for reproducibility; reused below for the train/test split.
seed = 123456789
np.random.seed(seed)

# Question 6: You often feel that your life is aimless, with no definite purpose
target_column = '6'

survey = pd.read_csv("International students Time management data.csv")

# Keep only respondents who gave a non-neutral answer to the target question.
# The mask is built once and the selection is copied explicitly, so the column
# assignment below writes to an independent frame instead of a slice of
# `survey` (avoids pandas' SettingWithCopyWarning).
has_opinion = survey[target_column].notna() & (survey[target_column] != 'Neither')
raw_data = survey.loc[has_opinion].copy()

# Collapse the four remaining answers into a binary Agree / Disagree target.
raw_data[target_column] = raw_data[target_column].replace(
    {'Strong Agree': 'Agree', 'Strong Disagree': 'Disagree'}
)

In [None]:
# Preprocessing for nominal features: fill missing values with the most
# frequent value of the column, then one-hot encode. Categories that were not
# seen during fit are ignored instead of raising an error.
one_hot_steps = [
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore')),
]
impute_and_one_hot = Pipeline(steps=one_hot_steps)

# Category levels for the ordinal features. Each list is given in the order
# the encoder maps to 0, 1, 2, ...
answers = ['Strong Disagree', 'Disagree', 'Neither', 'Agree', 'Strong Agree']
percentages = ['<40%', '40%~49%', '50%~59%', '60%~70%','>70%']
attendance_levels = ['S4', 'S3', 'S2', 'S1', 'S0']
age_brackets = ['<18', '18-20', '21-25', '26-30', '31-35', '>36']

# One list per column, in the same order as the columns routed to this
# pipeline by the ColumnTransformer: Academic, Attendance, English, Age,
# followed by the four Likert-scale questions 8, 12, 14 and 15.
ordinal_categories = [
    percentages,
    attendance_levels,
    percentages,
    age_brackets,
] + [answers] * 4

# Preprocessing for ordinal features: most-frequent imputation, integer
# encoding with the explicit level order above, then standardisation.
impute_and_ordinal = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=ordinal_categories)),
    ('scaler', StandardScaler()),
])

# Columns handled by each preprocessing branch. No `remainder` is given, so
# the transformer's default handling applies to every other column.
nominal_columns = ['Course']
ordinal_columns = ['Academic', 'Attendance', 'English', 'Age',
                   '8', '12', '14', '15']

feature_encoding = ColumnTransformer(transformers=[
    ("impute_and_one_hot", impute_and_one_hot, nominal_columns),
    ("impute_and_ordinal", impute_and_ordinal, ordinal_columns),
])

# Full model: feature preprocessing followed by a decision tree.
# The tree is given an explicit random_state so that fits are repeatable:
# the hyper-parameter grid explores splitter="random" and feature subsampling
# (max_features), both of which draw random numbers, and the global
# np.random.seed call is not guaranteed to carry over to the parallel workers
# used by the grid search (n_jobs=-1).
pipeline = Pipeline([
    ('features', feature_encoding),
    ('tree', DecisionTreeClassifier(random_state=seed))
])

In [None]:
# Search space for the 'tree' step of the pipeline (keys use the
# <step>__<parameter> convention).
#
# "auto" is intentionally absent from max_features: for DecisionTreeClassifier
# it was only an alias of "sqrt", was deprecated and has since been removed
# from scikit-learn, where it now fails parameter validation. "sqrt" takes its
# place at the head of the list, so the set of distinct models is unchanged.
#
# Grid size: 2 * 49 * 8 * 3 * 48 = 112,896 candidates.
hyperparam_grid = {
    "tree__splitter": ["best", "random"],
    "tree__max_depth": list(range(1, 50)),
    "tree__min_samples_leaf": [1, 3, 5, 7, 9, 11, 13, 15],
    "tree__max_features": ["sqrt", "log2", None],
    "tree__max_leaf_nodes": list(range(2, 50)),
}

# Exhaustive search over hyperparam_grid with 3-fold cross-validation,
# using all available cores and printing progress messages.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=hyperparam_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Hold out 30% of the rows for testing. The whole frame (target column
# included) is passed as X; the pipeline's ColumnTransformer picks the feature
# columns it was configured with.
X_train, X_test, y_train_raw, y_test_raw = train_test_split(
    raw_data,
    raw_data[target_column],
    test_size=.3,
    random_state=seed,
)

# Encode the target as a flat 0/1 vector; label_binarize returns a column
# vector for two classes, hence the squeeze.
binary_classes = ['Agree', 'Disagree']
y_train, y_test = (
    np.squeeze(label_binarize(labels, classes=binary_classes))
    for labels in (y_train_raw, y_test_raw)
)

# Run the grid search on the training split only.
model = grid_search.fit(X_train, y_train)

print(model.best_params_)

In [None]:
# Score of the tuned model on the training split (the estimator's default
# scorer is used, since no `scoring` was passed to GridSearchCV).
train_score = model.score(X_train, y_train)
train_score

In [None]:
# Score of the tuned model on the held-out test split (same default scorer
# as for the training score).
test_score = model.score(X_test, y_test)
test_score

In [None]:
# Confusion matrix on the held-out test split. Labels follow the binarisation
# above: 0 = 'Agree', 1 = 'Disagree'.
test_predictions = model.predict(X_test)
confusion_matrix(y_test, test_predictions)