In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('/content/Predictive analytics for student engagement and course completion.csv')

In [3]:
df.head()

Unnamed: 0,Id,Start time,Completion time,Email,Name,Name1,Attendence percentage,Number of Logins,Duration of session (in hrs),use of additional resourse,Assignment scores,Course feedback rating,course completed
0,1,6/21/2025 9:46,6/21/2025 9:47,23ucs164@kamarajengg.edu.in,SHALINI.S,Shalini.S,72%,2,5,yes,80.0,Good,yes
1,2,6/21/2025 9:46,6/21/2025 9:47,23ucs006@kamarajengg.edu.in,BHAVA AKSHAYA.M,Bhava Akshaya M,60%,1,5,yes,90.0,Excellent,yes
2,3,6/21/2025 9:47,6/21/2025 9:48,23ucs002@kamarajengg.edu.in,BHARATH.R,BHARATH,72%,1,3,No,80.0,Good,yes
3,4,6/21/2025 9:47,6/21/2025 9:48,23ucs076@kamarajengg.edu.in,DHANALAKSHMI.R,Dhanalakshmi,above 80%,6,4,yes,78.0,Good,yes
4,5,6/21/2025 9:48,6/21/2025 9:49,23ucs077@kamarajengg.edu.in,MADHUMITHA.M,Madhumitha.M,above 80%,2,3,yes,100.0,Excellent,yes


In [4]:
df.tail()

Unnamed: 0,Id,Start time,Completion time,Email,Name,Name1,Attendence percentage,Number of Logins,Duration of session (in hrs),use of additional resourse,Assignment scores,Course feedback rating,course completed
36,37,6/21/2025 10:27,6/21/2025 10:28,23ucs123@kamarajengg.edu.in,DEEPIKA.K,Rohan,above 80%,2,5,yes,80.0,Good,no
37,38,6/21/2025 10:29,6/21/2025 10:30,23ucs045@kamarajengg.edu.in,VISWANTHKANTHAPANDIYAN.S,viswanth,72%,10,5,yes,10.0,Good,no
38,39,6/21/2025 10:30,6/21/2025 10:30,23ucs045@kamarajengg.edu.in,VISWANTHKANTHAPANDIYAN.S,joel,72%,18,15,yes,19.0,Excellent,no
39,40,6/21/2025 10:30,6/21/2025 10:31,23ucs045@kamarajengg.edu.in,VISWANTHKANTHAPANDIYAN.S,john,60%,10,15,No,20.0,Good,no
40,41,6/21/2025 10:30,6/21/2025 10:31,23ucs167@kamarajengg.edu.in,HARISH MANI.M,HARISH MANI.M,60%,2,5,yes,70.0,Excellent,


In [5]:
df.isnull().sum()

Unnamed: 0,0
Id,0
Start time,0
Completion time,0
Email,0
Name,0
Name1,0
Attendence percentage,0
Number of Logins,0
Duration of session (in hrs),0
use of additional resourse,2


In [6]:
df.dropna(inplace = True)

In [7]:
df.isnull().sum()

Unnamed: 0,0
Id,0
Start time,0
Completion time,0
Email,0
Name,0
Name1,0
Attendence percentage,0
Number of Logins,0
Duration of session (in hrs),0
use of additional resourse,0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36 entries, 0 to 39
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Id                            36 non-null     int64  
 1   Start time                    36 non-null     object 
 2   Completion time               36 non-null     object 
 3   Email                         36 non-null     object 
 4   Name                          36 non-null     object 
 5   Name1                         36 non-null     object 
 6   Attendence percentage         36 non-null     object 
 7   Number of Logins              36 non-null     int64  
 8   Duration of session (in hrs)  36 non-null     int64  
 9   use of additional resourse    36 non-null     object 
 10  Assignment scores             36 non-null     float64
 11  Course feedback rating        36 non-null     object 
 12  course completed              36 non-null     object 
dtypes: float64(1

In [9]:
if df['course completed'].dtype == 'object':
    le = LabelEncoder()
    df['course completed'] = le.fit_transform(df['course completed'])

In [10]:
set1 = ['Attendence percentage', 'Number of Logins', 'Duration of session (in hrs)']
set2 = ['Course feedback rating', 'use of additional resourse', 'Assignment scores']
label_col = 'course completed'

In [11]:
df['use of additional resourse'] = df['use of additional resourse'].map({'yes': 1, 'no': 0})

In [12]:
# Clean 'Attendence percentage' column
df['Attendence percentage'] = df['Attendence percentage'].astype(str)
df['Attendence percentage'] = df['Attendence percentage'].str.lower().str.replace('%', '')

# Handle custom textual entries
df['Attendence percentage'] = df['Attendence percentage'].replace({
    'above 80': '85',
    'below 50': '45',
    'above 90': '95',
    'below 60': '55'
})

# Convert to float
df['Attendence percentage'] = pd.to_numeric(df['Attendence percentage'], errors='coerce')


In [20]:
# View unique values first (optional)
print(df['Course feedback rating'].unique())
rating_map = {
    'Excellent': 5,
    'Very Good': 4,
    'Good': 3,
    'Average': 2,
    'Poor': 1
}

df['Course feedback rating'] = df['Course feedback rating'].map(rating_map)

['Good' 'Excellent']


In [21]:
df.dropna(inplace=True)

In [22]:
df.isnull().sum()

Unnamed: 0,0
Id,0
Start time,0
Completion time,0
Email,0
Name,0
Name1,0
Attendence percentage,0
Number of Logins,0
Duration of session (in hrs),0
use of additional resourse,0


In [23]:
labeled_df = df.sample(frac=0.3, random_state=42)  # 30% labeled
unlabeled_df = df.drop(labeled_df.index)

In [24]:
X1_labeled = labeled_df[set1].values
X2_labeled = labeled_df[set2].values
y_labeled = labeled_df[label_col].values

In [25]:
X1_unlabeled = unlabeled_df[set1].values
X2_unlabeled = unlabeled_df[set2].values
y_unlabeled_true = unlabeled_df[label_col].values

In [26]:
clf1 = RandomForestClassifier(random_state=42)
clf2 = GaussianNB()

In [27]:
clf1.fit(X1_labeled, y_labeled)
clf2.fit(X2_labeled, y_labeled)

In [28]:
for i in range(5):  # Iterate a few rounds
    # Predict on unlabeled data
    y1_pred = clf1.predict(X1_unlabeled)
    y2_pred = clf2.predict(X2_unlabeled)

    # Select confident samples (where both classifiers agree)
    agreement_idx = np.where(y1_pred == y2_pred)[0]

    if len(agreement_idx) == 0:
        break  # Stop if no agreement

    # Add agreed-upon predictions to labeled dataset
    new_X1 = X1_unlabeled[agreement_idx]
    new_X2 = X2_unlabeled[agreement_idx]
    new_y = y1_pred[agreement_idx]

    X1_labeled = np.concatenate((X1_labeled, new_X1))
    X2_labeled = np.concatenate((X2_labeled, new_X2))
    y_labeled = np.concatenate((y_labeled, new_y))

    # Remove labeled data from unlabeled
    X1_unlabeled = np.delete(X1_unlabeled, agreement_idx, axis=0)
    X2_unlabeled = np.delete(X2_unlabeled, agreement_idx, axis=0)
    y_unlabeled_true = np.delete(y_unlabeled_true, agreement_idx, axis=0)

    # Retrain classifiers
    clf1.fit(X1_labeled, y_labeled)
    clf2.fit(X2_labeled, y_labeled)

In [29]:
final_preds1 = clf1.predict(X1_unlabeled)
final_preds2 = clf2.predict(X2_unlabeled)
final_preds = (final_preds1 + final_preds2) // 2  # Majority vote

print("=== Co-Training Evaluation ===")
print(classification_report(y_unlabeled_true, final_preds))

=== Co-Training Evaluation ===
              precision    recall  f1-score   support

           0       0.75      1.00      0.86         6
           1       0.00      0.00      0.00         2

    accuracy                           0.75         8
   macro avg       0.38      0.50      0.43         8
weighted avg       0.56      0.75      0.64         8



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [32]:
import pickle

# Load from .pkl
with open("performance.pkl", "wb") as f:
    pickle.dump(df, f)