# Linear Regression

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('digital_learning_analytics_100k.csv')

# Preprocessing: Handling Missing Values 

In [3]:
drop_cols = ['learner_id', 'enrollment_date', 'last_activity_date', 'age', 'gender', 'country']

df.drop(columns=[col for col in df.columns if col in drop_cols], inplace=True)

In [4]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 37 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   education_level                   100000 non-null  str    
 1   employment_status                 100000 non-null  str    
 2   prior_online_courses              100000 non-null  int64  
 3   digital_literacy_score            100000 non-null  float64
 4   app_category                      100000 non-null  str    
 5   daily_app_minutes                 100000 non-null  float64
 6   session_count_weekly              100000 non-null  int64  
 7   app_completion_rate               100000 non-null  float64
 8   in_app_quiz_score                 100000 non-null  float64
 9   gamification_engagement           97510 non-null   float64
 10  skill_pre_score                   100000 non-null  float64
 11  skill_post_score                  100000 non-null  float64
 12  

In [6]:
def handle_miss_values(df):
    for col in df.columns: 
        if df[col].isnull().any():
            if df[col].dtype == 'str':
                df[col] = df[col].fillna(df[col].mode()[0], inplace=True)
            else:
                df[col] = df[col].fillna(df[col].mean(), inplace=True)
    return df
df = handle_miss_values(df)


In [7]:
df.isna().sum()

education_level                     0
employment_status                   0
prior_online_courses                0
digital_literacy_score              0
app_category                        0
daily_app_minutes                   0
session_count_weekly                0
app_completion_rate                 0
in_app_quiz_score                   0
gamification_engagement             0
skill_pre_score                     0
skill_post_score                    0
essay_topic_category                0
essay_word_count                    0
essay_grammar_errors                0
essay_vocabulary_richness           0
essay_coherence_score               0
human_grader_score                  0
automated_score                     0
mooc_platform                       0
course_category                     0
course_duration_weeks               0
video_completion_pct                0
assignment_submission_rate          0
forum_posts                         0
peer_review_given                   0
course_compl

In [8]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 37 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   education_level                   100000 non-null  str    
 1   employment_status                 100000 non-null  str    
 2   prior_online_courses              100000 non-null  int64  
 3   digital_literacy_score            100000 non-null  float64
 4   app_category                      100000 non-null  str    
 5   daily_app_minutes                 100000 non-null  float64
 6   session_count_weekly              100000 non-null  int64  
 7   app_completion_rate               100000 non-null  float64
 8   in_app_quiz_score                 100000 non-null  float64
 9   gamification_engagement           100000 non-null  float64
 10  skill_pre_score                   100000 non-null  float64
 11  skill_post_score                  100000 non-null  float64
 12  

# Preprocessing: Encoding 

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
def do_encoding(df):
    encoder = LabelEncoder()
    for col in df.columns:
        if df[col].dtype == 'str':
            if df[col].nunique() < 5:
                dummies = pd.get_dummies(df[col], prefix=col, dtype=int)
                df.drop(columns=[col],inplace=True)
                df[dummies.columns] = dummies
            else:
                df[col] = encoder.fit_transform(df[col])
    return df
df = do_encoding(df)


In [11]:
df.head()

Unnamed: 0,education_level,employment_status,prior_online_courses,digital_literacy_score,app_category,daily_app_minutes,session_count_weekly,app_completion_rate,in_app_quiz_score,gamification_engagement,...,remediation_modules_completed,time_to_mastery_hours,mastery_score,learning_efficiency_score,total_learning_hours,engagement_consistency,learning_path_type_Branched,learning_path_type_Fully Adaptive,learning_path_type_Hybrid,learning_path_type_Linear
0,1,4,0,8.04,5,85.0,9,71.9,67.2,13.3,...,0,21.0,44.2,21.05,34.2,0.527,1,0,0,0
1,1,2,7,7.07,1,54.2,8,73.8,74.5,7.2,...,2,22.4,13.0,5.8,34.9,0.441,0,0,0,1
2,3,0,6,7.79,0,53.4,2,54.0,74.1,31.0,...,0,70.5,43.2,6.13,37.8,0.494,0,0,0,1
3,0,3,5,7.24,5,60.3,7,59.8,89.3,41.8,...,0,39.1,44.6,11.41,41.7,0.493,1,0,0,0
4,4,5,5,4.77,4,31.8,4,22.3,90.5,39.971342,...,0,52.6,35.1,6.67,36.9,0.305,1,0,0,0


In [12]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 40 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   education_level                    100000 non-null  int64  
 1   employment_status                  100000 non-null  int64  
 2   prior_online_courses               100000 non-null  int64  
 3   digital_literacy_score             100000 non-null  float64
 4   app_category                       100000 non-null  int64  
 5   daily_app_minutes                  100000 non-null  float64
 6   session_count_weekly               100000 non-null  int64  
 7   app_completion_rate                100000 non-null  float64
 8   in_app_quiz_score                  100000 non-null  float64
 9   gamification_engagement            100000 non-null  float64
 10  skill_pre_score                    100000 non-null  float64
 11  skill_post_score                   100000 non-null 

# Data Preprocessing: Scaling 

In [13]:
df.head(10)
# As you can see there some big difference among numbers 


Unnamed: 0,education_level,employment_status,prior_online_courses,digital_literacy_score,app_category,daily_app_minutes,session_count_weekly,app_completion_rate,in_app_quiz_score,gamification_engagement,...,remediation_modules_completed,time_to_mastery_hours,mastery_score,learning_efficiency_score,total_learning_hours,engagement_consistency,learning_path_type_Branched,learning_path_type_Fully Adaptive,learning_path_type_Hybrid,learning_path_type_Linear
0,1,4,0,8.04,5,85.0,9,71.9,67.2,13.3,...,0,21.0,44.2,21.05,34.2,0.527,1,0,0,0
1,1,2,7,7.07,1,54.2,8,73.8,74.5,7.2,...,2,22.4,13.0,5.8,34.9,0.441,0,0,0,1
2,3,0,6,7.79,0,53.4,2,54.0,74.1,31.0,...,0,70.5,43.2,6.13,37.8,0.494,0,0,0,1
3,0,3,5,7.24,5,60.3,7,59.8,89.3,41.8,...,0,39.1,44.6,11.41,41.7,0.493,1,0,0,0
4,4,5,5,4.77,4,31.8,4,22.3,90.5,39.971342,...,0,52.6,35.1,6.67,36.9,0.305,1,0,0,0
5,1,0,0,8.35,3,36.5,3,70.0,100.0,67.3,...,4,24.7,68.6,27.77,36.9,0.9,0,1,0,0
6,1,4,6,6.7,0,17.1,1,27.7,100.0,17.7,...,1,25.8,56.8,22.02,13.6,0.539,0,1,0,0
7,1,4,7,5.11,0,11.1,4,45.9,94.3,53.5,...,2,24.2,62.2,25.7,13.8,0.685,0,0,1,0
8,4,0,17,5.75,4,44.4,6,25.4,80.4,43.8,...,1,36.5,67.3,18.44,40.4,0.486,0,1,0,0
9,1,5,0,6.49,5,39.6,4,21.4,83.5,15.1,...,1,37.7,44.8,11.88,30.9,1.0,0,1,0,0


In [14]:
from sklearn.preprocessing import MinMaxScaler


In [16]:
def do_scaling(df):
    scaler = MinMaxScaler()
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns

    if 'learning_efficiency_score' in num_cols:
        num_cols = num_cols.drop('learning_efficiency_score')
    df[num_cols] = scaler.fit_transform(df[num_cols])
    return df

df = do_scaling(df)

In [17]:
df.head(10)

Unnamed: 0,education_level,employment_status,prior_online_courses,digital_literacy_score,app_category,daily_app_minutes,session_count_weekly,app_completion_rate,in_app_quiz_score,gamification_engagement,...,remediation_modules_completed,time_to_mastery_hours,mastery_score,learning_efficiency_score,total_learning_hours,engagement_consistency,learning_path_type_Branched,learning_path_type_Fully Adaptive,learning_path_type_Hybrid,learning_path_type_Linear
0,0.2,0.8,0.0,0.763571,1.0,0.457143,0.333333,0.713981,0.619048,0.132323,...,0.0,0.136986,0.43235,21.05,0.032971,0.527,1.0,0.0,0.0,0.0
1,0.2,0.4,0.14,0.646562,0.2,0.281143,0.291667,0.734258,0.703833,0.070707,...,0.25,0.154421,0.114954,5.8,0.033764,0.441,0.0,0.0,0.0,1.0
2,0.6,0.0,0.12,0.733414,0.0,0.276571,0.041667,0.522946,0.699187,0.311111,...,0.0,0.753425,0.422177,6.13,0.03705,0.494,0.0,0.0,0.0,1.0
3,0.0,0.6,0.1,0.667069,1.0,0.316,0.25,0.584845,0.875726,0.420202,...,0.0,0.362391,0.436419,11.41,0.041468,0.493,1.0,0.0,0.0,0.0
4,0.8,1.0,0.1,0.369119,0.8,0.153143,0.125,0.184632,0.889663,0.401731,...,0.0,0.530511,0.339776,6.67,0.03603,0.305,1.0,0.0,0.0,0.0
5,0.2,0.0,0.0,0.800965,0.6,0.18,0.083333,0.693703,1.0,0.677778,...,0.5,0.183064,0.68057,27.77,0.03603,0.9,0.0,1.0,0.0,0.0
6,0.2,0.8,0.12,0.60193,0.0,0.069143,0.0,0.242263,1.0,0.176768,...,0.125,0.196762,0.560529,22.02,0.009631,0.539,0.0,1.0,0.0,0.0
7,0.2,0.8,0.14,0.410133,0.0,0.034857,0.125,0.436499,0.933798,0.538384,...,0.25,0.176837,0.615463,25.7,0.009857,0.685,0.0,0.0,1.0,0.0
8,0.8,0.0,0.34,0.487334,0.8,0.225143,0.208333,0.217716,0.772358,0.440404,...,0.125,0.330012,0.667345,18.44,0.039995,0.486,0.0,1.0,0.0,0.0
9,0.2,1.0,0.0,0.576598,1.0,0.197714,0.125,0.175027,0.808362,0.150505,...,0.125,0.344956,0.438454,11.88,0.029232,1.0,0.0,1.0,0.0,0.0


# Algorithm Selection: LINEAR REGRESSION

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
lr_reg = LinearRegression()
lr_reg

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


# Model Splitting 

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
x = df.drop('learning_efficiency_score', axis=1)
y = df['learning_efficiency_score']

In [22]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.2, random_state = 42
)

In [23]:
lr_reg_model = lr_reg.fit(x_train, y_train)

# Model Prediction 

In [27]:
y_pred = lr_reg_model.predict(x_test).astype(int)

In [28]:
y_pred[0:10]

array([19,  9, 14, 13, 11, 24, 14, 21, 22, 21])

# Model Evaluation: mae, mse, rmse, and r2_score

In [29]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [32]:
mse = mean_squared_error(y_test, y_pred)
print('r2_score: ', r2_score(y_test, y_pred))
print('mae: ', mean_absolute_error(y_test, y_pred))
print('mse: ', mean_squared_error(y_test, y_pred))
print('rmse: ', pow(mse, (1/2)))


r2_score:  0.7077812647586952
mae:  2.485422051687007
mse:  21.093648856738444
rmse:  4.592782256621627
