In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import (
    RandomOverSampler,
    SMOTE,
    ADASYN,
    BorderlineSMOTE,
    SVMSMOTE
)

In [None]:
# kagglehub.dataset_download("rabieelkharoua/predict-online-course-engagement-dataset")

course = pd.read_csv("C:\\Users\\danis\\Downloads\\Datasets smote\\online_course_engagement_data.csv")

In [3]:
course

Unnamed: 0,UserID,CourseCategory,TimeSpentOnCourse,NumberOfVideosWatched,NumberOfQuizzesTaken,QuizScores,CompletionRate,DeviceType,CourseCompletion
0,5618,Health,29.979719,17,3,50.365656,20.860773,1,0
1,4326,Arts,27.802640,1,5,62.615970,65.632415,1,0
2,5849,Arts,86.820485,14,2,78.458962,63.812007,1,1
3,4992,Science,35.038427,17,10,59.198853,95.433162,0,1
4,3866,Programming,92.490647,16,0,98.428285,18.102478,0,0
...,...,...,...,...,...,...,...,...,...
8995,8757,Health,37.445225,14,4,54.469359,32.990704,1,0
8996,894,Science,48.631443,7,7,59.413257,0.254625,0,0
8997,6323,Health,38.212512,3,3,69.508297,70.188159,1,0
8998,3652,Health,70.048665,13,10,79.655182,72.975225,1,1


In [4]:
del course['UserID']
del course['CourseCategory']

In [5]:
course.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   TimeSpentOnCourse      9000 non-null   float64
 1   NumberOfVideosWatched  9000 non-null   int64  
 2   NumberOfQuizzesTaken   9000 non-null   int64  
 3   QuizScores             9000 non-null   float64
 4   CompletionRate         9000 non-null   float64
 5   DeviceType             9000 non-null   int64  
 6   CourseCompletion       9000 non-null   int64  
dtypes: float64(3), int64(4)
memory usage: 492.3 KB


In [6]:
from imblearn.datasets import make_imbalance
from sklearn.model_selection import train_test_split
from collections import Counter

In [7]:
course['CourseCompletion'].value_counts()

CourseCompletion
0    5432
1    3568
Name: count, dtype: int64

In [8]:
df_resampled, y_resampled = make_imbalance(course, course['CourseCompletion'], sampling_strategy={ 0 : 5432, 1: 1500},random_state=42)
df_resampled2 = df_resampled.copy()     # saving same data to be used for our proposed method 
print("Original class distribution:", Counter(y_resampled))
del df_resampled['CourseCompletion']

Original class distribution: Counter({0: 5432, 1: 1500})


In [9]:
def evaluate_oversampling(X, y, sampler, classifier):

    # Train-test split on resampled data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
    print("Class distribution after SMOTE:", Counter(y_train_resampled))

    
    # Train the classifier
    classifier.fit(X_train_resampled, y_train_resampled)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    return accuracy, precision, recall, f1, X_test, y_test

oversamplers = {
    "RandomOverSampler": RandomOverSampler(sampling_strategy='minority'),
    "SMOTE": SMOTE(sampling_strategy='minority'),
    "ADASYN": ADASYN(sampling_strategy='minority'),
    "BorderlineSMOTE": BorderlineSMOTE(sampling_strategy='minority'),
    "SVMSMOTE": SVMSMOTE(sampling_strategy='minority')
}
resultsNB = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=GaussianNB())
    resultsNB.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsNB = pd.DataFrame(resultsNB)

# Print results
print(resultsNB)

Class distribution after SMOTE: Counter({0: 4344, 1: 4344})
Class distribution after SMOTE: Counter({0: 4344, 1: 4344})
Class distribution after SMOTE: Counter({0: 4344, 1: 4291})
Class distribution after SMOTE: Counter({0: 4344, 1: 4344})
Class distribution after SMOTE: Counter({0: 4344, 1: 4344})
              Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.820476   0.853129  0.820476  0.830297
1              SMOTE  0.839221   0.859021  0.839221  0.845831
2             ADASYN  0.813987   0.847492  0.813987  0.824222
3    BorderlineSMOTE  0.819755   0.849607  0.819755  0.829081
4           SVMSMOTE  0.818313   0.858095  0.818313  0.829403


In [10]:
resultsKNN = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=KNeighborsClassifier())
    resultsKNN.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsKNN = pd.DataFrame(resultsKNN)

# Print results
print(resultsKNN)

Class distribution after SMOTE: Counter({0: 4344, 1: 4344})
Class distribution after SMOTE: Counter({0: 4344, 1: 4344})
Class distribution after SMOTE: Counter({0: 4344, 1: 4291})
Class distribution after SMOTE: Counter({0: 4344, 1: 4344})
Class distribution after SMOTE: Counter({0: 4344, 1: 4344})
              Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.797404   0.840018  0.797404  0.809941
1              SMOTE  0.809661   0.850186  0.809661  0.821279
2             ADASYN  0.779380   0.844669  0.779380  0.795945
3    BorderlineSMOTE  0.806056   0.848613  0.806056  0.818165
4           SVMSMOTE  0.830570   0.852672  0.830570  0.837932


In [11]:
resultsRF = []
for name, sampler in oversamplers.items():
    accuracy, precision, recall, f1, xtest, ytest = evaluate_oversampling(df_resampled, y_resampled, sampler=sampler, classifier=RandomForestClassifier())
    resultsRF.append({"Method": name, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1})

resultsRF = pd.DataFrame(resultsRF)

# Print results
print(resultsRF)

Class distribution after SMOTE: Counter({0: 4344, 1: 4344})
Class distribution after SMOTE: Counter({0: 4344, 1: 4344})
Class distribution after SMOTE: Counter({0: 4344, 1: 4291})
Class distribution after SMOTE: Counter({0: 4344, 1: 4344})
Class distribution after SMOTE: Counter({0: 4344, 1: 4344})
              Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.966835   0.966570  0.966835  0.966629
1              SMOTE  0.962509   0.962284  0.962509  0.962371
2             ADASYN  0.963230   0.963108  0.963230  0.963163
3    BorderlineSMOTE  0.963951   0.963687  0.963951  0.963773
4           SVMSMOTE  0.965393   0.965144  0.965393  0.965222


_______________________________________________________________________________________________________________________________________________________________________

In [12]:
course[course['CourseCompletion']==1].corr().abs().sum().sort_values()

CourseCompletion         0.000000
DeviceType               1.069298
TimeSpentOnCourse        1.088452
NumberOfVideosWatched    1.204505
NumberOfQuizzesTaken     1.216586
QuizScores               1.255973
CompletionRate           1.364719
dtype: float64

In [13]:
df_resampled2[df_resampled2['CourseCompletion']==1].count()

TimeSpentOnCourse        1500
NumberOfVideosWatched    1500
NumberOfQuizzesTaken     1500
QuizScores               1500
CompletionRate           1500
DeviceType               1500
CourseCompletion         1500
dtype: int64

In [14]:
4344-1500

2844

In [17]:
minority_samples = df_resampled2[df_resampled2['CourseCompletion']==1]
random_values = minority_samples['CompletionRate'].sample(n=2844, replace=True)
random_values = random_values.reset_index()
random_values = pd.DataFrame(random_values)
del random_values['index']
random_values

Unnamed: 0,CompletionRate
0,76.256748
1,55.924172
2,74.448127
3,13.279569
4,33.761368
...,...
2839,4.972913
2840,75.001118
2841,98.910439
2842,29.001884


In [18]:
course.columns

Index(['TimeSpentOnCourse', 'NumberOfVideosWatched', 'NumberOfQuizzesTaken',
       'QuizScores', 'CompletionRate', 'DeviceType', 'CourseCompletion'],
      dtype='object')

In [None]:
random_values['TimeSpentOnCourse'] = np.nan
random_values['NumberOfVideosWatched'] = np.nan
random_values['NumberOfQuizzesTaken'] = np.nan
random_values['QuizScores'] = np.nan
random_values['DeviceType'] = np.nan
random_values['CourseCompletion'] = 1

In [22]:
random_values

Unnamed: 0,CompletionRate,TimeSpentOnCourse,NumberOfVideosWatched,NumberOfQuizzesTaken,QuizScores,DeviceType,CourseCompletion
0,76.256748,,,,,,1
1,55.924172,,,,,,1
2,74.448127,,,,,,1
3,13.279569,,,,,,1
4,33.761368,,,,,,1
...,...,...,...,...,...,...,...
2839,4.972913,,,,,,1
2840,75.001118,,,,,,1
2841,98.910439,,,,,,1
2842,29.001884,,,,,,1


In [23]:
def calculate_percentiles(nums):
    indexed_nums = [(num, i) for i, num in enumerate(nums)]
    sorted_nums = []
    for num_index in indexed_nums:
        inserted = False
        for i, sorted_num_index in enumerate(sorted_nums):
            if num_index[0] < sorted_num_index[0]:
                sorted_nums.insert(i, num_index)
                inserted = True
                break
        if not inserted:
            sorted_nums.append(num_index)
    length = len(sorted_nums)
    percentiles = [0] * length
    for i, num_index in enumerate(sorted_nums):
        original_index = num_index[1]
        percentile = ((i + 1) / length) * 100
        percentiles[original_index] = percentile
    return percentiles

def dataframe_to_percentiles(df):
    df_percentiles = df.copy()
    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            df_percentiles[column] = calculate_percentiles(df[column].tolist())
    return df_percentiles


In [24]:
random_values['CompletionRate'] = calculate_percentiles(random_values['CompletionRate'])
random_values['CompletionRate'] = 'P' + random_values['CompletionRate'].round().astype(str,errors='ignore')

In [27]:
random_values['CompletionRate'].unique()

array(['P59.0', 'P34.0', 'P57.0', 'P9.0', 'P20.0', 'P5.0', 'P72.0',
       'P24.0', 'P6.0', 'P84.0', 'P27.0', 'P1.0', 'P12.0', 'P64.0',
       'P71.0', 'P86.0', 'P93.0', 'P11.0', 'P40.0', 'P76.0', 'P95.0',
       'P56.0', 'P28.0', 'P85.0', 'P87.0', 'P30.0', 'P18.0', 'P29.0',
       'P67.0', 'P50.0', 'P55.0', 'P46.0', 'P94.0', 'P7.0', 'P70.0',
       'P51.0', 'P89.0', 'P21.0', 'P77.0', 'P0.0', 'P33.0', 'P81.0',
       'P52.0', 'P44.0', 'P25.0', 'P36.0', 'P65.0', 'P43.0', 'P4.0',
       'P41.0', 'P60.0', 'P61.0', 'P63.0', 'P37.0', 'P54.0', 'P10.0',
       'P32.0', 'P2.0', 'P38.0', 'P22.0', 'P69.0', 'P75.0', 'P19.0',
       'P39.0', 'P97.0', 'P13.0', 'P3.0', 'P45.0', 'P66.0', 'P26.0',
       'P16.0', 'P17.0', 'P88.0', 'P73.0', 'P42.0', 'P8.0', 'P14.0',
       'P92.0', 'P90.0', 'P78.0', 'P99.0', 'P53.0', 'P48.0', 'P100.0',
       'P79.0', 'P98.0', 'P80.0', 'P62.0', 'P82.0', 'P68.0', 'P58.0',
       'P23.0', 'P35.0', 'P91.0', 'P96.0', 'P74.0', 'P15.0', 'P83.0',
       'P47.0', 'P49.0', 'P31

In [28]:
courseP = dataframe_to_percentiles(minority_samples)
courseP = 'P' + courseP.round().astype(str,errors='ignore')
courseP['CourseCompletion'] = 1

In [29]:
ZERO = pd.concat([courseP,random_values], ignore_index=True)
ZERO['CourseCompletion']=1
ZERO

Unnamed: 0,TimeSpentOnCourse,NumberOfVideosWatched,NumberOfQuizzesTaken,QuizScores,CompletionRate,DeviceType,CourseCompletion
0,P29.0,P47.0,P52.0,P23.0,P83.0,P0.0,1
1,P42.0,P5.0,P26.0,P61.0,P59.0,P0.0,1
2,P56.0,P3.0,P8.0,P17.0,P2.0,P50.0,1
3,P73.0,P41.0,P40.0,P66.0,P88.0,P50.0,1
4,P24.0,P17.0,P15.0,P59.0,P25.0,P50.0,1
...,...,...,...,...,...,...,...
4339,,,,,P3.0,,1
4340,,,,,P58.0,,1
4341,,,,,P98.0,,1
4342,,,,,P17.0,,1


In [30]:


# Step 1: Identify unique values of 'CompletionRate'
unique_size = courseP['CompletionRate'].unique()

# Step 2: Create a dictionary to store non-missing values for each variable
# Initialize the dictionary
imputation_info = {}

# Iterate over each variable (excluding 'pH') that has missing values
for column in courseP.columns:
    if column != 'CompletionRate' and ZERO[column].isna().sum() > 0:
        imputation_info[column] = {}
        
        # Iterate over each unique value of 'CompletionRate'
        for size_value in unique_size:
            # Get the non-missing values of the variable where 'CompletionRate' is equal to the current size_value
            non_missing_values = courseP.loc[courseP['CompletionRate'] == size_value, column].dropna().values
            imputation_info[column][size_value] = non_missing_values

In [31]:
# import random
import statistics
def fill_missing_values2(row, imputation_info):
    # For each column, check if it has a missing value
    for column in imputation_info:
        if pd.isna(row[column]):
            size_value = row['CompletionRate']  # Get the corresponding pH value for the row
            if size_value in imputation_info[column]:
                possible_values = imputation_info[column][size_value]
                if len(possible_values) > 0:
                    row[column ] = statistics.mode(possible_values)
                    # row[column] = random.choice(possible_values)

    return row

myMode = ZERO.apply(lambda row: fill_missing_values2(row, imputation_info), axis=1)

In [32]:
myMode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4344 entries, 0 to 4343
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   TimeSpentOnCourse      4344 non-null   object
 1   NumberOfVideosWatched  4344 non-null   object
 2   NumberOfQuizzesTaken   4344 non-null   object
 3   QuizScores             4344 non-null   object
 4   CompletionRate         4344 non-null   object
 5   DeviceType             4344 non-null   object
 6   CourseCompletion       4344 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 237.7+ KB


In [33]:
# myMode[myMode.isnull().any(axis=1)]['CompletionRate'].value_counts()
# myMode = myMode.fillna(myMode.iloc[0])

In [34]:
print(myMode.shape)
print(ZERO.shape)

(4344, 7)
(4344, 7)


In [35]:
percentiles = [*range(0,101, 1)]
mapping_data = {}

columns_to_impute = course.columns.difference(['CourseCompletion'])
for variable in columns_to_impute:
    mapping_data[variable] = [course[course['CourseCompletion']==1][variable].quantile(p / 100) for p in percentiles]

# Create the mapping DataFrame
mapping_df = pd.DataFrame(mapping_data, index=[f'P{p}' for p in percentiles])
print("Mapping DataFrame:")
mapping_df

Mapping DataFrame:


Unnamed: 0,CompletionRate,DeviceType,NumberOfQuizzesTaken,NumberOfVideosWatched,QuizScores,TimeSpentOnCourse
P0,0.176793,0.0,0.0,0.0,50.007288,1.032686
P1,1.845216,0.0,0.0,0.0,51.375517,3.563946
P2,3.606125,0.0,0.0,1.0,52.404881,6.073786
P3,5.259849,0.0,0.0,1.0,53.242838,8.337925
P4,6.610693,0.0,1.0,2.0,54.090876,11.345386
...,...,...,...,...,...,...
P96,97.658563,1.0,10.0,20.0,98.554861,96.438738
P97,98.213097,1.0,10.0,20.0,98.938967,97.400302
P98,98.729335,1.0,10.0,20.0,99.407359,98.263233
P99,99.459222,1.0,10.0,20.0,99.661197,99.152242


In [36]:
def convert_percentiles_to_values(mdf, mapping_df):
    columns_to_impute = mdf.columns.difference(['CourseCompletion'])

    for column in columns_to_impute:
        mdf[column] = mdf[column].apply(lambda x: mapping_df.loc[x, column] if isinstance(x, str) and x.startswith('P') else x)
    return mdf

In [37]:
def path_to_revert(now, then):
    now = now.where(then.isna(), then)
    now = now.replace(r'^(P\d+)\.0$', r'\1', regex=True)
    return now

In [38]:
a = path_to_revert(myMode, ZERO)
a = convert_percentiles_to_values(a, mapping_df)

In [36]:
# a[0:1500] = df_resampled2[df_resampled2['CourseCompletion']==1]

In [39]:
a

Unnamed: 0,TimeSpentOnCourse,NumberOfVideosWatched,NumberOfQuizzesTaken,QuizScores,CompletionRate,DeviceType,CourseCompletion
0,39.404238,11.0,7.0,71.380533,89.856362,0.0,1
1,50.232903,2.0,4.0,85.181023,74.852380,0.0,1
2,61.718644,1.0,2.0,67.498573,3.606125,1.0,1
3,76.690642,10.0,6.0,87.201068,92.810638,1.0,1
4,34.835404,6.0,4.0,84.464761,41.439867,1.0,1
...,...,...,...,...,...,...,...
4339,48.182708,14.0,4.0,81.719490,5.259849,1.0,1
4340,68.955003,9.0,4.0,69.703763,74.235811,0.0,1
4341,99.152242,18.0,3.0,51.375517,98.729335,0.0,1
4342,95.294192,9.0,9.0,96.108003,28.042296,0.0,1


In [40]:
percent = pd.concat([a, df_resampled2[df_resampled2['CourseCompletion']==0][0:4344]], ignore_index=True)

In [41]:
percent.shape

(8688, 7)

In [42]:
def evaluate_oversampling2(X, y, classifier):

    # Train-test split on resampled data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the classifier
    classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = classifier.predict(xtest)
    
    # Evaluate metrics
    accuracy = accuracy_score(ytest, y_pred)
    precision = precision_score(ytest, y_pred, average='weighted')
    recall = recall_score(ytest, y_pred, average='weighted')
    f1 = f1_score(ytest, y_pred, average='weighted')
    
    return accuracy, precision, recall, f1

In [43]:
classifiers = {
    "GaussianNaiveBayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "RandomForest": RandomForestClassifier(),
}

Percentile_Results = []

for name, classifier in classifiers.items():
    accuracy, precision, recall, f1 = evaluate_oversampling2(
        percent[['TimeSpentOnCourse', 'NumberOfVideosWatched', 'NumberOfQuizzesTaken',
       'QuizScores', 'CompletionRate', 'DeviceType']] , percent[['CourseCompletion']],
        classifier
    )
    Percentile_Results.append({
        "Classifier": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    })

Percentile_Results = pd.DataFrame(Percentile_Results)

  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  return fit_method(estimator, *args, **kwargs)


In [44]:
print('Percentile\n',Percentile_Results,'\n')
print('GaussianNaiveBayes\n',resultsNB,'\n')
print('KNN\n',resultsKNN,'\n')
print('RandomForest\n',resultsRF)

# Percentile
#             Classifier  Accuracy  Precision    Recall  F1 Score
# 0  GaussianNaiveBayes  0.857246   0.853137  0.857246  0.839391
# 1                 KNN  0.866619   0.861975  0.866619  0.853214
# 2        RandomForest  0.870224   0.882523  0.870224  0.849152 

# GaussianNaiveBayes
#                Method  Accuracy  Precision    Recall  F1 Score
# 0  RandomOverSampler  0.821197   0.856061  0.821197  0.831371
# 1              SMOTE  0.840663   0.858709  0.840663  0.846829
# 2             ADASYN  0.815429   0.849465  0.815429  0.825702
# 3    BorderlineSMOTE  0.823360   0.853241  0.823360  0.832559
# 4           SVMSMOTE  0.820476   0.856386  0.820476  0.830856 

# KNN
#                Method  Accuracy  Precision    Recall  F1 Score
# 0  RandomOverSampler  0.793079   0.840997  0.793079  0.806654
# 1              SMOTE  0.813266   0.851115  0.813266  0.824286
# 2             ADASYN  0.785869   0.845537  0.785869  0.801397
# 3    BorderlineSMOTE  0.810382   0.851874  0.810382  0.822116
# 4           SVMSMOTE  0.815429   0.854102  0.815429  0.826484 

# RandomForest
#                Method  Accuracy  Precision    Recall  F1 Score
# 0  RandomOverSampler  0.966835   0.966600  0.966835  0.966671
# 1              SMOTE  0.963230   0.963040  0.963230  0.963117
# 2             ADASYN  0.961788   0.961661  0.961788  0.961718
# 3    BorderlineSMOTE  0.965393   0.965109  0.965393  0.965178
# 4           SVMSMOTE  0.963230   0.963040  0.963230  0.963117

Percentile
            Classifier  Accuracy  Precision    Recall  F1 Score
0  GaussianNaiveBayes  0.812545   0.849453  0.812545  0.823440
1                 KNN  0.883922   0.882238  0.883922  0.882977
2        RandomForest  0.972603   0.972515  0.972603  0.972253 

GaussianNaiveBayes
               Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.820476   0.853129  0.820476  0.830297
1              SMOTE  0.839221   0.859021  0.839221  0.845831
2             ADASYN  0.813987   0.847492  0.813987  0.824222
3    BorderlineSMOTE  0.819755   0.849607  0.819755  0.829081
4           SVMSMOTE  0.818313   0.858095  0.818313  0.829403 

KNN
               Method  Accuracy  Precision    Recall  F1 Score
0  RandomOverSampler  0.797404   0.840018  0.797404  0.809941
1              SMOTE  0.809661   0.850186  0.809661  0.821279
2             ADASYN  0.779380   0.844669  0.779380  0.795945
3    BorderlineSMOTE  0.806056   0.848613  0.806056  0.818165
4           SVMSMOTE  0.8