In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
import os

## Define "Adjusted R^2" function
The following adjusted R^2 is based on this formula to adjust for different number of predictors.
![image.png](attachment:image.png)

where N is the population size and k is the number of predictors.

In [2]:
def adjusted_r2(r2, N, k):
    return (1 - (1 - r2) * ((N - 1) / (N - k - 1)))

## Student ID One-Hot Encoding, Model Training, and Model Evaluation

In [3]:
data = pd.read_csv("allStudentProgramsDataWith0Score.csv")

In [4]:
data = data.drop(data.columns[0], axis=1)

In [5]:
data

Unnamed: 0,Year,Semester,Quiz #,Student ID,Coding Problem,Score,Maximum,Distance,Percent,Distance Z-score,Distance Min-Max Scaled
0,2017,fall,quiz06,s159c1ea3,AllCharsExcept,3.0,5,44.719697,0.334183,-0.331719,0.089071
1,2017,fall,quiz06,s1a2bc1e2,AllCharsExcept,5.0,5,37.507576,0.280288,-0.688284,0.015336
2,2017,fall,quiz06,s1a748834,AllCharsExcept,5.0,5,37.681818,0.281590,-0.679670,0.017117
3,2017,fall,quiz06,s204527a1,AllCharsExcept,5.0,5,48.295455,0.360904,-0.154934,0.125629
4,2017,fall,quiz06,s20ffbd50,AllCharsExcept,5.0,5,133.818182,1.000000,4.073288,1.000000
...,...,...,...,...,...,...,...,...,...,...,...
19279,2018,fall,quiz11,sf1f60f2f,WriteAndGrade,50.0,50,200.698745,0.335913,-0.738330,0.048055
19280,2018,fall,quiz11,sf39b2f6f,WriteAndGrade,50.0,50,221.213389,0.370248,-0.504492,0.097274
19281,2018,fall,quiz11,sf80872c6,WriteAndGrade,49.0,50,357.129707,0.597734,1.044761,0.423366
19282,2018,fall,quiz11,sfce0219,WriteAndGrade,37.0,50,209.790795,0.351130,-0.634693,0.069868


In [6]:
student_encoder = OneHotEncoder()

In [7]:
student_encoder_df = pd.DataFrame(student_encoder.fit_transform(data[['Student ID']]).toarray())
data = data.sort_values(by=['Student ID'])
final_student_data = data.join(student_encoder_df)

In [8]:
final_student_data

Unnamed: 0,Year,Semester,Quiz #,Student ID,Coding Problem,Score,Maximum,Distance,Percent,Distance Z-score,...,857,858,859,860,861,862,863,864,865,866
7619,2018,fall,quiz04,s1041b5c9,MovieRating,3.0,5,49.690789,0.514755,-0.354310,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14227,2018,fall,quiz03,s1041b5c9,ShrinkingHexagons,10.0,10,44.625850,0.224812,-0.489107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2582,2018,fall,quiz10,s1041b5c9,DiceGame,0.0,25,111.008403,0.437794,0.434700,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4654,2018,fall,quiz03,s1041b5c9,GrowingStairs,10.0,10,78.805907,0.612662,0.247331,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10909,2018,fall,quiz09,s1041b5c9,RecursiveHasChar,0.0,10,48.960317,0.461751,0.485357,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7901,2017,fall,quiz06,sffae1858,NiceVacation,0.0,5,90.771242,0.353187,0.505440,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
16061,2017,fall,quiz01,sffae1858,SquareDiag1,10.0,10,80.565217,0.179280,-0.696855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6260,2017,fall,quiz06,sffae1858,LeastShortest2,3.0,5,5.395604,0.111401,-0.074835,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8404,2017,fall,quiz06,sffae1858,PickAndCount,5.0,5,50.854545,0.593213,0.299089,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
final_student_data2 = final_student_data.drop(final_student_data.iloc[:,0:9], axis=1)

In [10]:
final_student_data2 = final_student_data2.drop(["Distance Min-Max Scaled"], axis=1)

In [11]:
final_student_data2

Unnamed: 0,Distance Z-score,0,1,2,3,4,5,6,7,8,...,857,858,859,860,861,862,863,864,865,866
7619,-0.354310,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14227,-0.489107,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2582,0.434700,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4654,0.247331,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10909,0.485357,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7901,0.505440,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
16061,-0.696855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6260,-0.074835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8404,0.299089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
#final_student_data2.to_csv("student_model_data.csv", index=False)

In [13]:
X1 = final_student_data.iloc[:, 11:]
y1 = final_student_data.loc[:, ["Distance Z-score"]]
student_model = LinearRegression().fit(X1, y1)
student_model.coef_

array([[-5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+11,
        -5.10531127e+11, -5.10531127e+11, -5.10531127e+1

In [15]:
student_model.intercept_

array([5.10531127e+11])

In [16]:
student_model.score(X1, y1)

0.29252426787191366

In [17]:
adjusted_r2(student_model.score(X1, y1), X1.shape[0], X1.shape[1])

0.25921728156896773

## Coding Problem One-Hot Encoding, Model Training, and Model Evaluation

In [18]:
problem_data = pd.read_csv("allStudentProgramsDataWith0Score.csv")

In [19]:
problem_data = problem_data.drop(problem_data.columns[0], axis=1)

In [20]:
problem_data

Unnamed: 0,Year,Semester,Quiz #,Student ID,Coding Problem,Score,Maximum,Distance,Percent,Distance Z-score,Distance Min-Max Scaled
0,2017,fall,quiz06,s159c1ea3,AllCharsExcept,3.0,5,44.719697,0.334183,-0.331719,0.089071
1,2017,fall,quiz06,s1a2bc1e2,AllCharsExcept,5.0,5,37.507576,0.280288,-0.688284,0.015336
2,2017,fall,quiz06,s1a748834,AllCharsExcept,5.0,5,37.681818,0.281590,-0.679670,0.017117
3,2017,fall,quiz06,s204527a1,AllCharsExcept,5.0,5,48.295455,0.360904,-0.154934,0.125629
4,2017,fall,quiz06,s20ffbd50,AllCharsExcept,5.0,5,133.818182,1.000000,4.073288,1.000000
...,...,...,...,...,...,...,...,...,...,...,...
19279,2018,fall,quiz11,sf1f60f2f,WriteAndGrade,50.0,50,200.698745,0.335913,-0.738330,0.048055
19280,2018,fall,quiz11,sf39b2f6f,WriteAndGrade,50.0,50,221.213389,0.370248,-0.504492,0.097274
19281,2018,fall,quiz11,sf80872c6,WriteAndGrade,49.0,50,357.129707,0.597734,1.044761,0.423366
19282,2018,fall,quiz11,sfce0219,WriteAndGrade,37.0,50,209.790795,0.351130,-0.634693,0.069868


In [21]:
problem_encoder = OneHotEncoder()
problem_encoder_df = pd.DataFrame(problem_encoder.fit_transform(problem_data[['Coding Problem']]).toarray())
problem_data = problem_data.sort_values(by=['Coding Problem'])
final_problem_data = problem_data.join(problem_encoder_df)

In [22]:
final_problem_data2 = final_problem_data.drop(final_problem_data.iloc[:,0:9], axis=1)
final_problem_data2 = final_problem_data2.drop(["Distance Min-Max Scaled"], axis=1)
final_problem_data2

Unnamed: 0,Distance Z-score,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100
0,-0.331719,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,-0.620117,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,-0.537343,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,-0.647834,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,-0.654950,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19127,3.176534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19128,-0.890136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19129,-0.452602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19162,-0.585570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [23]:
#final_problem_data2.to_csv("coding_problem_model_data.csv", index=False)

In [24]:
X2 = final_problem_data.iloc[:, 11:]
y2 = final_problem_data.loc[:, ["Distance Z-score"]]
problem_model = LinearRegression().fit(X2, y2)
problem_model.coef_

array([[-1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+11,
        -1.92188159e+11, -1.92188159e+11, -1.92188159e+1

In [26]:
problem_model.intercept_

array([1.92188159e+11])

In [27]:
problem_model.score(X2, y2)

-2.045648055393201e-06

In [28]:
adjusted_r2(problem_model.score(X2, y2), X2.shape[0], X2.shape[1])

-0.005267409354157593

## Calculating BIC Values for Models

In [30]:
X1 = sm.add_constant(X1)
student = sm.OLS(y1, X1).fit()
print(student.bic)

56606.72465370617


In [31]:
X2 = sm.add_constant(X2)
problem = sm.OLS(y2, X2).fit()
print(problem.bic)

55722.1914812059
