# Tree-based Algorithm: Classifier 

In [26]:
import pandas as pd

In [27]:
df = pd.read_csv('digital_learning_analytics_100k.csv')

# Preprocessing: Handling Missing Values 

In [28]:
drop_cols = ['learner_id', 'enrollment_date', 'last_activity_date', 'age', 'gender', 'country']

df.drop(columns=[col for col in df.columns if col in drop_cols], inplace=True)

In [29]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 37 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   education_level                   100000 non-null  str    
 1   employment_status                 100000 non-null  str    
 2   prior_online_courses              100000 non-null  int64  
 3   digital_literacy_score            100000 non-null  float64
 4   app_category                      100000 non-null  str    
 5   daily_app_minutes                 100000 non-null  float64
 6   session_count_weekly              100000 non-null  int64  
 7   app_completion_rate               100000 non-null  float64
 8   in_app_quiz_score                 100000 non-null  float64
 9   gamification_engagement           97510 non-null   float64
 10  skill_pre_score                   100000 non-null  float64
 11  skill_post_score                  100000 non-null  float64
 12  

In [30]:
def handle_miss_values(df):
    for col in df.columns: 
        if df[col].isnull().any():
            if df[col].dtype == 'str':
                df[col] = df[col].fillna(df[col].mode()[0])
            else:
                df[col] = df[col].fillna(df[col].mean())
    return df
df = handle_miss_values(df)


In [31]:
df.isna().sum()

education_level                     0
employment_status                   0
prior_online_courses                0
digital_literacy_score              0
app_category                        0
daily_app_minutes                   0
session_count_weekly                0
app_completion_rate                 0
in_app_quiz_score                   0
gamification_engagement             0
skill_pre_score                     0
skill_post_score                    0
essay_topic_category                0
essay_word_count                    0
essay_grammar_errors                0
essay_vocabulary_richness           0
essay_coherence_score               0
human_grader_score                  0
automated_score                     0
mooc_platform                       0
course_category                     0
course_duration_weeks               0
video_completion_pct                0
assignment_submission_rate          0
forum_posts                         0
peer_review_given                   0
course_compl

In [32]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 37 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   education_level                   100000 non-null  str    
 1   employment_status                 100000 non-null  str    
 2   prior_online_courses              100000 non-null  int64  
 3   digital_literacy_score            100000 non-null  float64
 4   app_category                      100000 non-null  str    
 5   daily_app_minutes                 100000 non-null  float64
 6   session_count_weekly              100000 non-null  int64  
 7   app_completion_rate               100000 non-null  float64
 8   in_app_quiz_score                 100000 non-null  float64
 9   gamification_engagement           100000 non-null  float64
 10  skill_pre_score                   100000 non-null  float64
 11  skill_post_score                  100000 non-null  float64
 12  

# Preprocessing: Encoding 

In [33]:
from sklearn.preprocessing import LabelEncoder

In [47]:
def do_encoding(df):
    encoder = LabelEncoder()
    for col in df.columns:
        if df[col].dtype == 'str':
            if df[col].nunique() < 5:
                dummies = pd.get_dummies(df[col], prefix=col, dtype=int)
                df.drop(columns=[col],inplace=True)
                df[dummies.columns] = dummies
            else:
                df[col] = encoder.fit_transform(df[col])
    return df
df = do_encoding(df)


In [48]:
df.head()

Unnamed: 0,education_level,employment_status,prior_online_courses,digital_literacy_score,app_category,daily_app_minutes,session_count_weekly,app_completion_rate,in_app_quiz_score,gamification_engagement,...,remediation_modules_completed,time_to_mastery_hours,mastery_score,learning_efficiency_score,total_learning_hours,engagement_consistency,learning_path_type_Branched,learning_path_type_Fully Adaptive,learning_path_type_Hybrid,learning_path_type_Linear
0,1,4,0.0,0.763571,5,0.457143,0.333333,0.713981,0.619048,0.132323,...,0.0,0.136986,0.43235,0.244779,0.032971,0.527,1,0,0,0
1,1,2,0.14,0.646562,1,0.281143,0.291667,0.734258,0.703833,0.070707,...,0.25,0.154421,0.114954,0.063835,0.033764,0.441,0,0,0,1
2,3,0,0.12,0.733414,0,0.276571,0.041667,0.522946,0.699187,0.311111,...,0.0,0.753425,0.422177,0.06775,0.03705,0.494,0,0,0,1
3,0,3,0.1,0.667069,5,0.316,0.25,0.584845,0.875726,0.420202,...,0.0,0.362391,0.436419,0.130399,0.041468,0.493,1,0,0,0
4,4,5,0.1,0.369119,4,0.153143,0.125,0.184632,0.889663,0.401731,...,0.0,0.530511,0.339776,0.074158,0.03603,0.305,1,0,0,0


In [49]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 40 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   education_level                    100000 non-null  int64  
 1   employment_status                  100000 non-null  int64  
 2   prior_online_courses               100000 non-null  float64
 3   digital_literacy_score             100000 non-null  float64
 4   app_category                       100000 non-null  int64  
 5   daily_app_minutes                  100000 non-null  float64
 6   session_count_weekly               100000 non-null  float64
 7   app_completion_rate                100000 non-null  float64
 8   in_app_quiz_score                  100000 non-null  float64
 9   gamification_engagement            100000 non-null  float64
 10  skill_pre_score                    100000 non-null  float64
 11  skill_post_score                   100000 non-null 

# Data Preprocessing: Scaling 

In [50]:
df.head(10)
# As you can see there some big difference among numbers 


Unnamed: 0,education_level,employment_status,prior_online_courses,digital_literacy_score,app_category,daily_app_minutes,session_count_weekly,app_completion_rate,in_app_quiz_score,gamification_engagement,...,remediation_modules_completed,time_to_mastery_hours,mastery_score,learning_efficiency_score,total_learning_hours,engagement_consistency,learning_path_type_Branched,learning_path_type_Fully Adaptive,learning_path_type_Hybrid,learning_path_type_Linear
0,1,4,0.0,0.763571,5,0.457143,0.333333,0.713981,0.619048,0.132323,...,0.0,0.136986,0.43235,0.244779,0.032971,0.527,1,0,0,0
1,1,2,0.14,0.646562,1,0.281143,0.291667,0.734258,0.703833,0.070707,...,0.25,0.154421,0.114954,0.063835,0.033764,0.441,0,0,0,1
2,3,0,0.12,0.733414,0,0.276571,0.041667,0.522946,0.699187,0.311111,...,0.0,0.753425,0.422177,0.06775,0.03705,0.494,0,0,0,1
3,0,3,0.1,0.667069,5,0.316,0.25,0.584845,0.875726,0.420202,...,0.0,0.362391,0.436419,0.130399,0.041468,0.493,1,0,0,0
4,4,5,0.1,0.369119,4,0.153143,0.125,0.184632,0.889663,0.401731,...,0.0,0.530511,0.339776,0.074158,0.03603,0.305,1,0,0,0
5,1,0,0.0,0.800965,3,0.18,0.083333,0.693703,1.0,0.677778,...,0.5,0.183064,0.68057,0.324514,0.03603,0.9,0,1,0,0
6,1,4,0.12,0.60193,0,0.069143,0.0,0.242263,1.0,0.176768,...,0.125,0.196762,0.560529,0.256289,0.009631,0.539,0,1,0,0
7,1,4,0.14,0.410133,0,0.034857,0.125,0.436499,0.933798,0.538384,...,0.25,0.176837,0.615463,0.299953,0.009857,0.685,0,0,1,0
8,4,0,0.34,0.487334,4,0.225143,0.208333,0.217716,0.772358,0.440404,...,0.125,0.330012,0.667345,0.213811,0.039995,0.486,0,1,0,0
9,1,5,0.0,0.576598,5,0.197714,0.125,0.175027,0.808362,0.150505,...,0.125,0.344956,0.438454,0.135975,0.029232,1.0,0,1,0,0


In [51]:
from sklearn.preprocessing import MinMaxScaler


In [52]:
def do_scaling(df):
    scaler = MinMaxScaler()
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns

    if 'course_completed' in num_cols:
        num_cols = num_cols.drop('course_completed')
    df[num_cols] = scaler.fit_transform(df[num_cols])
    return df

df = do_scaling(df)

In [53]:
df.head(10)

Unnamed: 0,education_level,employment_status,prior_online_courses,digital_literacy_score,app_category,daily_app_minutes,session_count_weekly,app_completion_rate,in_app_quiz_score,gamification_engagement,...,remediation_modules_completed,time_to_mastery_hours,mastery_score,learning_efficiency_score,total_learning_hours,engagement_consistency,learning_path_type_Branched,learning_path_type_Fully Adaptive,learning_path_type_Hybrid,learning_path_type_Linear
0,0.2,0.8,0.0,0.763571,1.0,0.457143,0.333333,0.713981,0.619048,0.132323,...,0.0,0.136986,0.43235,0.244779,0.032971,0.527,1.0,0.0,0.0,0.0
1,0.2,0.4,0.14,0.646562,0.2,0.281143,0.291667,0.734258,0.703833,0.070707,...,0.25,0.154421,0.114954,0.063835,0.033764,0.441,0.0,0.0,0.0,1.0
2,0.6,0.0,0.12,0.733414,0.0,0.276571,0.041667,0.522946,0.699187,0.311111,...,0.0,0.753425,0.422177,0.06775,0.03705,0.494,0.0,0.0,0.0,1.0
3,0.0,0.6,0.1,0.667069,1.0,0.316,0.25,0.584845,0.875726,0.420202,...,0.0,0.362391,0.436419,0.130399,0.041468,0.493,1.0,0.0,0.0,0.0
4,0.8,1.0,0.1,0.369119,0.8,0.153143,0.125,0.184632,0.889663,0.401731,...,0.0,0.530511,0.339776,0.074158,0.03603,0.305,1.0,0.0,0.0,0.0
5,0.2,0.0,0.0,0.800965,0.6,0.18,0.083333,0.693703,1.0,0.677778,...,0.5,0.183064,0.68057,0.324514,0.03603,0.9,0.0,1.0,0.0,0.0
6,0.2,0.8,0.12,0.60193,0.0,0.069143,0.0,0.242263,1.0,0.176768,...,0.125,0.196762,0.560529,0.256289,0.009631,0.539,0.0,1.0,0.0,0.0
7,0.2,0.8,0.14,0.410133,0.0,0.034857,0.125,0.436499,0.933798,0.538384,...,0.25,0.176837,0.615463,0.299953,0.009857,0.685,0.0,0.0,1.0,0.0
8,0.8,0.0,0.34,0.487334,0.8,0.225143,0.208333,0.217716,0.772358,0.440404,...,0.125,0.330012,0.667345,0.213811,0.039995,0.486,0.0,1.0,0.0,0.0
9,0.2,1.0,0.0,0.576598,1.0,0.197714,0.125,0.175027,0.808362,0.150505,...,0.125,0.344956,0.438454,0.135975,0.029232,1.0,0.0,1.0,0.0,0.0


# Algorithm Selection: DECISION TREE REGRESSOR

In [54]:
from sklearn.tree import DecisionTreeClassifier

In [55]:
dt_class = DecisionTreeClassifier()
dt_class

0,1,2
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`.",'gini'
,"splitter  splitter: {""best"", ""random""}, default=""best"" The strategy used to choose the split at each node. Supported strategies are ""best"" to choose the best split and ""random"" to choose the best random split.",'best'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: int, float or {""sqrt"", ""log2""}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at  each split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. note::  The search for a split does not stop until at least one  valid partition of the node samples is found, even if it requires to  effectively inspect more than ``max_features`` features.",
,"random_state  random_state: int, RandomState instance or None, default=None Controls the randomness of the estimator. The features are always randomly permuted at each split, even if ``splitter`` is set to ``""best""``. When ``max_features < n_features``, the algorithm will select ``max_features`` at random at each split before finding the best split among them. But the best found split may vary across different runs, even if ``max_features=n_features``. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed to an integer. See :term:`Glossary ` for details.",
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0


# Model Splitting 

In [56]:
from sklearn.model_selection import train_test_split

In [57]:
x = df.drop('course_completed', axis=1)
y = df['course_completed']

In [58]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.2, random_state = 42
)

In [59]:
dt_class_model = dt_class.fit(x_train, y_train)

# Model Prediction 

In [60]:
y_pred = dt_class_model.predict(x_test).astype(int)

In [61]:
y_pred[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [62]:
from sklearn.metrics import classification_report, confusion_matrix

In [65]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[12177  1713]
 [ 1547  4563]]
              precision    recall  f1-score   support

       False       0.89      0.88      0.88     13890
        True       0.73      0.75      0.74      6110

    accuracy                           0.84     20000
   macro avg       0.81      0.81      0.81     20000
weighted avg       0.84      0.84      0.84     20000

