In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, classification_report

raw_df = pd.read_csv('./data/merged.csv')
print(f"Data rows & cols size: {raw_df.shape}")
raw_df.head()

Data rows & cols size: (188, 15)


Unnamed: 0.1,Unnamed: 0,Timestamp,I come to lecture:,I've had prior machine learning / data science experience,Which section are you in?,"About how long, in hours, did you study for exam 1?",What year are you?,Did you do the readings?,How many hours a day on average do you spend on sites with infinite scroll?,How many hours of sleep did you get the night before the exam?,Did you leave the exam early?,What grade do you think you got?,I wanted the extra credit but just put down random responses (you'll still get the extra credit if you say yes),Total Score,Max Points
0,0,10/19/2025 11:23:58,Always,,,9.0,Junior,Some of them,6.0,7.0,Yes,D,No,42.3,65.0
1,2,10/19/2025 11:24:24,Always,,,6.0,Junior,Some of them,0.0,5.0,No,D,Yes,21.4,65.0
2,3,10/19/2025 11:24:27,Always,,,5.0,Junior,Some of them,2.0,7.0,No,B,No,51.5,65.0
3,4,10/19/2025 11:24:47,Always,,,6.0,Junior,All of them,0.0,8.0,No,B,No,52.4,65.0
4,5,10/19/2025 11:24:50,Always,Quite a bit,,3.0,Junior,Some of them,3.0,3.0,No,C,No,49.5,65.0


In [4]:
# Exploratory Data Analysis
print("Columns:\n", list(raw_df.columns))
print("Datatypes:\n", raw_df.dtypes)
print("Numeric Columns:")
print("----------------")
display(raw_df.describe(include=[np.number]).T)
print("Categorical Columns:")
print("--------------------")
cat_cols = [c for c in raw_df.columns if raw_df[c].dtype == 'object']
for c in cat_cols[:10]:
    print(f"\nValue counts: {c}")
    display(raw_df[c].value_counts(dropna=False).head(10))

Columns:
 ['Unnamed: 0', 'Timestamp', 'I come to lecture:', "I've had prior machine learning / data science experience", 'Which section are you in?', 'About how long, in hours, did you study for exam 1?', 'What year are you?', 'Did you do the readings?', 'How many hours a day on average do you spend on sites with infinite scroll?', 'How many hours of sleep did you get the night before the exam?', 'Did you leave the exam early?', 'What grade do you think you got?', "I wanted the extra credit but just put down random responses (you'll still get the extra credit if you say yes)", 'Total Score', 'Max Points']
Datatypes:
 Unnamed: 0                                                                                                           int64
Timestamp                                                                                                           object
I come to lecture:                                                                                                  object
I've h

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,188.0,96.478723,56.492246,0.0,47.75,94.5,146.25,193.0
"About how long, in hours, did you study for exam 1?",188.0,9.511702,7.525825,1.0,5.0,8.0,10.0,48.0
How many hours a day on average do you spend on sites with infinite scroll?,188.0,2.562766,1.77358,0.0,1.5,2.0,3.125,11.0
How many hours of sleep did you get the night before the exam?,188.0,6.203723,1.90915,0.0,5.0,7.0,8.0,10.0
Total Score,188.0,42.426064,10.026964,4.7,35.875,42.85,49.925,63.9
Max Points,188.0,65.0,0.0,65.0,65.0,65.0,65.0,65.0


Categorical Columns:
--------------------

Value counts: Timestamp


Timestamp
10/19/2025 11:23:58    1
10/19/2025 11:24:24    1
10/19/2025 11:24:27    1
10/19/2025 11:24:47    1
10/19/2025 11:24:50    1
10/19/2025 11:25:28    1
10/19/2025 11:25:37    1
10/19/2025 11:25:52    1
10/19/2025 11:26:19    1
10/19/2025 11:26:39    1
Name: count, dtype: int64


Value counts: I come to lecture:


I come to lecture:
Always                     105
More than half the time     54
Less than half the time     19
Basically never             10
Name: count, dtype: int64


Value counts: I've had prior machine learning / data science experience


I've had prior machine learning / data science experience
Some           90
NaN            75
Quite a bit    23
Name: count, dtype: int64


Value counts: Which section are you in?


Which section are you in?
2:00pm    95
9:30am    56
NaN       37
Name: count, dtype: int64


Value counts: What year are you?


What year are you?
Junior              129
Senior               30
Sophomore            20
It's complicated      9
Name: count, dtype: int64


Value counts: Did you do the readings?


Did you do the readings?
Some of them               95
All of them                78
None of them               12
I don't want to respond     3
Name: count, dtype: int64


Value counts: Did you leave the exam early?


Did you leave the exam early?
No     152
Yes     36
Name: count, dtype: int64


Value counts: What grade do you think you got?


What grade do you think you got?
B                    92
C                    50
D                    16
A                    16
Prefer not to say    11
F                     3
Name: count, dtype: int64


Value counts: I wanted the extra credit but just put down random responses (you'll still get the extra credit if you say yes)


I wanted the extra credit but just put down random responses (you'll still get the extra credit if you say yes)
No     177
Yes     11
Name: count, dtype: int64

In [5]:
#Data Cleaning - Dropping irrelevant columns and filling in the NAs (dtypes are already correct) and switching Yes/No categories to 1/0 for some categorical models
df = raw_df.copy()
for col in ['Unnamed: 0', 'Timestamp']:
    if col in df.columns:
        df = df.drop(columns=[col])
df['Did you leave the exam early?'] = df['Did you leave the exam early?'].replace('Yes', 1.0)
df['Did you leave the exam early?'] = df['Did you leave the exam early?'].replace('No', 0.0)
df["I wanted the extra credit but just put down random responses (you'll still get the extra credit if you say yes)"] = df["I wanted the extra credit but just put down random responses (you'll still get the extra credit if you say yes)"].replace('Yes', 1.0)
df["I wanted the extra credit but just put down random responses (you'll still get the extra credit if you say yes)"] = df["I wanted the extra credit but just put down random responses (you'll still get the extra credit if you say yes)"].replace('No', 0.0)
#Looking at the dataset, the data isn't missing for this column, but instead actually the value "None" that got misinterpreted
df["I've had prior machine learning / data science experience"] = df["I've had prior machine learning / data science experience"].fillna("None")
#Creating new columns
df["is_missing_section"] = df["Which section are you in?"].isna()
df.head()

  df['Did you leave the exam early?'] = df['Did you leave the exam early?'].replace('No', 0.0)
  df["I wanted the extra credit but just put down random responses (you'll still get the extra credit if you say yes)"] = df["I wanted the extra credit but just put down random responses (you'll still get the extra credit if you say yes)"].replace('No', 0.0)


Unnamed: 0,I come to lecture:,I've had prior machine learning / data science experience,Which section are you in?,"About how long, in hours, did you study for exam 1?",What year are you?,Did you do the readings?,How many hours a day on average do you spend on sites with infinite scroll?,How many hours of sleep did you get the night before the exam?,Did you leave the exam early?,What grade do you think you got?,I wanted the extra credit but just put down random responses (you'll still get the extra credit if you say yes),Total Score,Max Points,is_missing_section
0,Always,,,9.0,Junior,Some of them,6.0,7.0,1.0,D,0.0,42.3,65.0,True
1,Always,,,6.0,Junior,Some of them,0.0,5.0,0.0,D,1.0,21.4,65.0,True
2,Always,,,5.0,Junior,Some of them,2.0,7.0,0.0,B,0.0,51.5,65.0,True
3,Always,,,6.0,Junior,All of them,0.0,8.0,0.0,B,0.0,52.4,65.0,True
4,Always,Quite a bit,,3.0,Junior,Some of them,3.0,3.0,0.0,C,0.0,49.5,65.0,True


### PART 1

In [6]:
#Training and Testing Splits (and binning each grade by passing/failing)
X = df.drop(['Total Score', 'Max Points', "I wanted the extra credit but just put down random responses (you'll still get the extra credit if you say yes)", "is_missing_section", "Which section are you in?"], axis=1)
X = pd.get_dummies(X, columns=["I come to lecture:", "I've had prior machine learning / data science experience", "What year are you?", "Did you do the readings?", "What grade do you think you got?"])
y = ((df['Total Score'] / df['Max Points'] )>= 0.7).astype(int)
print(list(X.columns))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

['About how long, in hours, did you study for exam 1?', 'How many hours a day on average do you spend on sites with infinite scroll?', 'How many hours of sleep did you get the night before the exam?', 'Did you leave the exam early?', 'I come to lecture:_Always', 'I come to lecture:_Basically never', 'I come to lecture:_Less than half the time', 'I come to lecture:_More than half the time', "I've had prior machine learning / data science experience_None", "I've had prior machine learning / data science experience_Quite a bit", "I've had prior machine learning / data science experience_Some", "What year are you?_It's complicated", 'What year are you?_Junior', 'What year are you?_Senior', 'What year are you?_Sophomore', 'Did you do the readings?_All of them', "Did you do the readings?_I don't want to respond", 'Did you do the readings?_None of them', 'Did you do the readings?_Some of them', 'What grade do you think you got?_A', 'What grade do you think you got?_B', 'What grade do you thin

In [7]:
#Running KNN, RandomForestClassifier, and Ridge Classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier

#KNN
print("KNN:")
knn = KNeighborsClassifier(n_neighbors=18)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {scores}")
print(f"Mean: {scores.mean():.3f}")
print(f"Std: {scores.std():.3f}")
print("--------------------------")

#RandomForestClassifier
print("Random Forest Classifier:")
rf = RandomForestClassifier(n_estimators=20)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
scores = cross_val_score(rf, X, y, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {scores}")
print(f"Mean: {scores.mean():.3f}")
print(f"Std: {scores.std():.3f}")
print("--------------------------")

#RidgeClassifier
print("Ridge Classifier:")
ridge = RidgeClassifier(alpha=1.0)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
scores = cross_val_score(ridge, X, y, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {scores}")
print(f"Mean: {scores.mean():.3f}")
print(f"Std: {scores.std():.3f}")

KNN:
Accuracy: 0.658

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.68      0.72        25
           1       0.50      0.62      0.55        13

    accuracy                           0.66        38
   macro avg       0.64      0.65      0.64        38
weighted avg       0.68      0.66      0.66        38

Cross-validation scores: [0.76315789 0.65789474 0.55263158 0.62162162 0.54054054]
Mean: 0.627
Std: 0.081
--------------------------
Random Forest Classifier:
Accuracy: 0.632

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.68      0.71        25
           1       0.47      0.54      0.50        13

    accuracy                           0.63        38
   macro avg       0.60      0.61      0.60        38
weighted avg       0.65      0.63      0.64        38

Cross-validation scores: [0.68421053 0.71052632 0.68421053 0.64864865 0.54054054]
Mean: 0.654
Std: 0.06

### PART 2

In [8]:
#Training and Testing Splits (and binning each grade by letter grade). X STAYS THE SAME
y = ((df['Total Score'] / df['Max Points'] ).apply(lambda x: "A" if x >= 0.90 else "B" if x >= 0.80 else "C" if x >= 0.70 else "D" if x >= 0.60 else "F"))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
#Running DecisionTree, SVM, and LogisticRegressionClassifier
#CV with only 3 splits because some target classes don't have enough
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

print("Decision Tree:")
dt = DecisionTreeClassifier(criterion="entropy")
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
scores = cross_val_score(dt, X, y, cv=3, scoring='accuracy')
print(f"Cross-validation scores: {scores}")
print(f"Mean: {scores.mean():.3f}")
print(f"Std: {scores.std():.3f}")
print("--------------------------")

print("Support Vector Machine:")
svm = SVC(kernel="linear")
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
scores = cross_val_score(svm, X, y, cv=3, scoring='accuracy')
print(f"Cross-validation scores: {scores}")
print(f"Mean: {scores.mean():.3f}")
print(f"Std: {scores.std():.3f}")
print("--------------------------")

print("Logistic Regression:")
lr = LogisticRegression(max_iter=100, class_weight='balanced')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
scores = cross_val_score(lr, X, y, cv=3, scoring='accuracy')
print(f"Cross-validation scores: {scores}")
print(f"Mean: {scores.mean():.3f}")
print(f"Std: {scores.std():.3f}")

Decision Tree:
Accuracy: 0.263

Classification Report:
              precision    recall  f1-score   support

           A       0.00      0.00      0.00         1
           B       0.30      0.50      0.38         6
           C       0.11      0.17      0.13         6
           D       0.29      0.15      0.20        13
           F       0.40      0.33      0.36        12

    accuracy                           0.26        38
   macro avg       0.22      0.23      0.21        38
weighted avg       0.29      0.26      0.26        38

Cross-validation scores: [0.34920635 0.41269841 0.25806452]
Mean: 0.340
Std: 0.063
--------------------------
Support Vector Machine:
Accuracy: 0.316

Classification Report:
              precision    recall  f1-score   support

           A       0.00      0.00      0.00         1
           B       0.25      0.33      0.29         6
           C       0.25      0.50      0.33         6
           D       0.40      0.15      0.22        13
           

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

### PART 3

In [10]:
#Training and Testing Splits for their score percentages through linear regression.
y = df['Total Score'] / df['Max Points']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#Running linear regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

linearregression = LinearRegression()
linearregression.fit(X_train, y_train)
y_pred = linearregression.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE:", mse)
print("R^2:", r2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print()

MSE: 0.02399933044557033
R^2: -0.05766774536386654
(150, 25) (38, 25) (150,) (38,)
