In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, label_binarize
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

# random seed for reproducibility
seed = 123456789
np.random.seed(seed)

raw_data = pd.read_csv("International students Time management data.csv")

#Question 6: You often feel that your life is aimless, with no definite purpose
target_column = '6'
raw_data = raw_data[raw_data[target_column] != 'Neither']
raw_data = raw_data[raw_data[target_column].notna()]
raw_data[target_column] = raw_data[target_column].replace('Strong Agree', 'Agree')
raw_data[target_column] = raw_data[target_column].replace('Strong Disagree', 'Disagree')

In [78]:
impute_and_one_hot = Pipeline([
    ('impute', SimpleImputer(strategy= 'most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore'))
])

feature_encoding = ColumnTransformer([
    ("impute_and_one_hot", impute_and_one_hot, ['Course', 'Program', 'Attendance',
                                                '8','12', '14', '15']),
])

pipeline = Pipeline([
    ('features', feature_encoding),
    ('tree', DecisionTreeClassifier(random_state = 1))
])

In [79]:
hyperparam_grid = {
    "tree__max_depth" :[5, 10, 15, 20],
    "tree__max_features":["auto","log2","sqrt", None],
    "tree__max_leaf_nodes":[2, 4, 6, 8, 10],
    "tree__min_samples_split": [5, 8, 11, 14, 17]
}

grid_search = GridSearchCV(pipeline, param_grid=hyperparam_grid, cv=3, n_jobs=-1,return_train_score=True)

In [80]:
X_train, X_test, y_train_raw, y_test_raw = \
  train_test_split(raw_data, raw_data[target_column], test_size=.3, random_state=seed)

y_train = np.squeeze(label_binarize(y_train_raw, classes=['Agree', 'Disagree']))
y_test = np.squeeze(label_binarize(y_test_raw, classes=['Agree', 'Disagree']))

model = grid_search.fit(X_train, y_train)

print(model.best_params_)

{'tree__max_depth': 5, 'tree__max_features': 'auto', 'tree__max_leaf_nodes': 6, 'tree__min_samples_split': 5}


In [81]:
model.score(X_train, y_train)

0.7761194029850746

In [82]:
model.score(X_test, y_test)

0.7586206896551724

In [83]:
confusion_matrix(y_test, model.predict(X_test))

array([[ 5,  2],
       [ 5, 17]], dtype=int64)