# In class Activity

Student's adaptability level of online education

Generic Libraries

In [1]:
import numpy as np
import pandas as pd
import keras
from tensorflow.keras import Model




Loading data

In [2]:
df= pd.read_csv('archive (12).zip')
df.head()

Unnamed: 0,Gender,Age,Education Level,Institution Type,IT Student,Location,Load-shedding,Financial Condition,Internet Type,Network Type,Class Duration,Self Lms,Device,Adaptivity Level
0,Boy,21-25,University,Non Government,No,Yes,Low,Mid,Wifi,4G,3-6,No,Tab,Moderate
1,Girl,21-25,University,Non Government,No,Yes,High,Mid,Mobile Data,4G,1-3,Yes,Mobile,Moderate
2,Girl,16-20,College,Government,No,Yes,Low,Mid,Wifi,4G,1-3,No,Mobile,Moderate
3,Girl,11-15,School,Non Government,No,Yes,Low,Mid,Mobile Data,4G,1-3,No,Mobile,Moderate
4,Girl,16-20,School,Non Government,No,Yes,Low,Poor,Mobile Data,3G,0,No,Mobile,Low


In [3]:
df.shape

(1205, 14)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Gender               1205 non-null   object
 1   Age                  1205 non-null   object
 2   Education Level      1205 non-null   object
 3   Institution Type     1205 non-null   object
 4   IT Student           1205 non-null   object
 5   Location             1205 non-null   object
 6   Load-shedding        1205 non-null   object
 7   Financial Condition  1205 non-null   object
 8   Internet Type        1205 non-null   object
 9   Network Type         1205 non-null   object
 10  Class Duration       1205 non-null   object
 11  Self Lms             1205 non-null   object
 12  Device               1205 non-null   object
 13  Adaptivity Level     1205 non-null   object
dtypes: object(14)
memory usage: 131.9+ KB


All columns are object type we need to change datatype

In [5]:
df.isnull().sum()

Gender                 0
Age                    0
Education Level        0
Institution Type       0
IT Student             0
Location               0
Load-shedding          0
Financial Condition    0
Internet Type          0
Network Type           0
Class Duration         0
Self Lms               0
Device                 0
Adaptivity Level       0
dtype: int64

#The data is pretty clean so we will skip EDA and move towards encoding

Encoding Categorical data

We will do 2 types of encoding here binary encoding and custom handling

In [8]:
encoded_data = df.copy() #creating copy of data

In [9]:
binary_categories = ['Gender', 'IT Student', 'Location', 'Self Lms']
binary_mapping = {'No': 0, 'Yes': 1, 'Boy': 0, 'Girl': 1}

for column in binary_categories:
    encoded_data[column] = encoded_data[column].map(binary_mapping)
    
nominal_categories = ['Institution Type', 'Internet Type', 'Network Type', 'Device']
encoded_data = pd.get_dummies(encoded_data, columns=nominal_categories, drop_first=True)

ordinal_mapping = {
    'Financial Condition': {'Poor': 0, 'Mid': 1, 'Rich': 2},
    'Adaptivity Level': {'Low': 0, 'Moderate': 1, 'High': 2}
}

for column, mapping in ordinal_mapping.items():
    encoded_data[column] = encoded_data[column].map(mapping)

age_mapping = {'1-5': 0, '6-10': 1, '11-15': 2, '16-20': 3, '21-25': 4, '26-30': 5}
class_duration_mapping = {'0': 0, '1-3': 1, '3-6': 2}

encoded_data['Age'] = encoded_data['Age'].map(age_mapping)
encoded_data['Class Duration'] = encoded_data['Class Duration'].map(class_duration_mapping)
education_level_mapping = {'School': 0, 'College': 1, 'University': 2}
load_shedding_mapping = {'Low': 0, 'High': 1}

encoded_data['Education Level'] = encoded_data['Education Level'].map(education_level_mapping)
encoded_data['Load-shedding'] = encoded_data['Load-shedding'].map(load_shedding_mapping)

checking for encoded data

In [11]:
encoded_data.head()

Unnamed: 0,Gender,Age,Education Level,IT Student,Location,Load-shedding,Financial Condition,Class Duration,Self Lms,Adaptivity Level,Institution Type_Non Government,Internet Type_Wifi,Network Type_3G,Network Type_4G,Device_Mobile,Device_Tab
0,0,4,2,0,1,0,1,2,0,1,1,1,0,1,0,1
1,1,4,2,0,1,1,1,1,1,1,1,0,0,1,1,0
2,1,3,1,0,1,0,1,1,0,1,0,1,0,1,1,0
3,1,2,0,0,1,0,1,1,0,1,1,0,0,1,1,0
4,1,3,0,0,1,0,0,0,0,0,1,0,1,0,1,0


In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

Defining X and y

In [14]:
#Separating the features and the target variable
X = encoded_data.drop('Adaptivity Level', axis=1)
y = encoded_data['Adaptivity Level']

In [15]:
#Splitting the dataset into training and testing sets again
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
X

Unnamed: 0,Gender,Age,Education Level,IT Student,Location,Load-shedding,Financial Condition,Class Duration,Self Lms,Institution Type_Non Government,Internet Type_Wifi,Network Type_3G,Network Type_4G,Device_Mobile,Device_Tab
0,0,4,2,0,1,0,1,2,0,1,1,0,1,0,1
1,1,4,2,0,1,1,1,1,1,1,0,0,1,1,0
2,1,3,1,0,1,0,1,1,0,0,1,0,1,1,0
3,1,2,0,0,1,0,1,1,0,1,0,0,1,1,0
4,1,3,0,0,1,0,0,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1200,1,3,1,0,1,0,1,1,0,1,1,0,1,1,0
1201,1,3,1,0,0,1,1,2,0,1,1,0,1,1,0
1202,0,2,0,0,1,0,1,1,0,1,0,1,0,1,0
1203,1,3,1,0,0,0,1,1,0,1,1,0,1,1,0


In [18]:
#Re-scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
X_train_scaled

array([[ 1.12143267, -2.28533617, -1.04488442, ...,  0.73647811,
         0.43096619, -0.16316883],
       [-0.89171648, -0.68601561, -1.04488442, ..., -1.35781362,
         0.43096619, -0.16316883],
       [-0.89171648, -0.68601561, -1.04488442, ..., -1.35781362,
         0.43096619, -0.16316883],
       ...,
       [-0.89171648, -0.68601561, -1.04488442, ...,  0.73647811,
         0.43096619, -0.16316883],
       [-0.89171648,  0.91330494,  1.17621498, ...,  0.73647811,
         0.43096619, -0.16316883],
       [-0.89171648,  0.91330494,  1.17621498, ..., -1.35781362,
        -2.32036763, -0.16316883]])

In [20]:
X_test_scaled 

array([[ 1.12143267, -0.68601561, -1.04488442, ...,  0.73647811,
        -2.32036763, -0.16316883],
       [-0.89171648,  0.11364467,  0.06566528, ...,  0.73647811,
         0.43096619, -0.16316883],
       [-0.89171648,  0.11364467,  0.06566528, ...,  0.73647811,
        -2.32036763,  6.12862138],
       ...,
       [ 1.12143267, -1.48567589, -1.04488442, ..., -1.35781362,
         0.43096619, -0.16316883],
       [-0.89171648, -0.68601561, -1.04488442, ..., -1.35781362,
         0.43096619, -0.16316883],
       [ 1.12143267,  0.91330494,  1.17621498, ...,  0.73647811,
        -2.32036763, -0.16316883]])

# Model Implementation

In [21]:
models = {
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Multi-Layer Perceptron": MLPClassifier(max_iter=1000)
}

In [22]:
#Re-applying each model and storing results
results = {}
for model_name, model in models.items():
    # Training the model
    model.fit(X_train_scaled, y_train)

Predictions

In [23]:
#Making predictions
y_pred = model.predict(X_test_scaled)

Evaluating Model

In [24]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [25]:
#Storing the results
results[model_name] = {"accuracy": accuracy, "report": report}

results

{'Multi-Layer Perceptron': {'accuracy': 0.8879668049792531,
  'report': '              precision    recall  f1-score   support\n\n           0       0.90      0.91      0.91       103\n           1       0.88      0.91      0.89       115\n           2       0.88      0.65      0.75        23\n\n    accuracy                           0.89       241\n   macro avg       0.89      0.83      0.85       241\nweighted avg       0.89      0.89      0.89       241\n'}}

# Implementing Hyperparameter tuning for improving accuracy 

In [26]:
from sklearn.model_selection import GridSearchCV

# Parameters grid to tune
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}

Proceeding with Random Forest model because it fits best and get 89% accuracy

In [27]:
#Random Forest model
rf = RandomForestClassifier(random_state=42)

In [28]:
# Grid search with cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

In [29]:
# Fitting the grid search to the data
grid_search.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


324 fits failed out of a total of 648.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
136 fits failed with the following error:
Traceback (most recent call last):
  File "D:\DATA SCIENCE\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\DATA SCIENCE\anaconda\Lib\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "D:\DATA SCIENCE\anaconda\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "D:\DATA SCIENCE\anaconda\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParame

In [30]:
# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_params, best_score

({'max_depth': None,
  'max_features': 'sqrt',
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 200},
 0.8910850538237778)

No improvement in accuracy so there is no need of hyperparameter tuning on this particular dataset

Let's check for decision tree model as well

In [31]:
from sklearn.model_selection import RandomizedSearchCV

#Decision Tree model
dt = DecisionTreeClassifier(random_state=42)

In [32]:
param_dist = {
    'max_depth': [None] + list(np.arange(3, 20)),
    'min_samples_split': np.arange(2, 20),
    'min_samples_leaf': np.arange(1, 20),
    'max_features': [None, 'auto', 'sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}

In [33]:
# Randomized search with cross-validation
random_search = RandomizedSearchCV(estimator=dt, param_distributions=param_dist, 
n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1, scoring='accuracy')


In [34]:
# Fitting the random search to the data
random_search.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


69 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
39 fits failed with the following error:
Traceback (most recent call last):
  File "D:\DATA SCIENCE\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\DATA SCIENCE\anaconda\Lib\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "D:\DATA SCIENCE\anaconda\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "D:\DATA SCIENCE\anaconda\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParamete

In [35]:
# Best parameters and best score
best_params_dt = random_search.best_params_
best_score_dt = random_search.best_score_

best_params_dt, best_score_dt

({'min_samples_split': 10,
  'min_samples_leaf': 1,
  'max_features': 'log2',
  'max_depth': None,
  'criterion': 'entropy'},
 0.795643789142367)

The accurancy of decision tree model has decreased by 10% after hyperparameter tuning

# Conclusion

# Possible reasons for this is that
1. The random forest model is already performing well
2. Other factors are affecting the accuracy such as data is insufficient
3. The model does not generalize to test data

# Applying Cross validation and checking for tuning results

In [36]:
#Create a grid search object with cross-validation
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')

In [37]:
#Fit the grid search on the train set
grid_search.fit(X_train, y_train)

540 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "D:\DATA SCIENCE\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\DATA SCIENCE\anaconda\Lib\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "D:\DATA SCIENCE\anaconda\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "D:\DATA SCIENCE\anaconda\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParam

In [43]:
#best hyperparameters and the best score
print('Best hyperparameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

Best hyperparameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best score: 0.9056077288428325


Prediciting

In [39]:
y_pred = grid_search.predict(X_test)

Evaluating Model

In [45]:
from sklearn.metrics import confusion_matrix
print('Accuracy on test set:', accuracy_score(y_test, y_pred))
print('Confusion matrix on test set:\n', confusion_matrix(y_test, y_pred))

Accuracy on test set: 0.8962655601659751
Confusion matrix on test set:
 [[ 97   4   2]
 [ 11 104   0]
 [  0   8  15]]


No increase in accuracy means overall model is performing well enough 