In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

In [50]:
df_LogProblem=pd.read_csv(r'/home/dorra/LogProblemP.csv')
df_InfoUser=pd.read_csv(r'/home/dorra/InfoUserP.csv')
df_InfoContent=pd.read_csv(r'/home/dorra/InfoContentP.csv')

In [7]:
df_LogProblem.columns
df_InfoContent.columns
df_InfoUser.columns


Index(['uuid', 'gender', 'points', 'badges_cnt', 'first_login_date_TW',
       'user_grade', 'user_city', 'has_teacher_cnt', 'is_self_coach',
       'has_student_cnt', 'belongs_to_class_cnt', 'has_class_cnt'],
      dtype='object')

MERGING THE THREE FEATURES INTO LOGPROBLEM FILE : 'has_teacher_cnt', 'points', 'total_sec_taken'

In [51]:
df_LogProblem = df_LogProblem.merge(df_InfoUser[['uuid', 'has_teacher_cnt', 'points']], on='uuid', how='left')
#df_LogProblem = df_LogProblem.merge(df_InfoContent[['ucid', 'difficulty']], on='ucid', how='left')


In [29]:
#df_LogProblem = pd.get_dummies(df_LogProblem, columns=['difficulty'], prefix='difficulty')

ADDING 'AAA' TO LOGPROBLEM FILE :

In [52]:

student_grouped = df_LogProblem.groupby('uuid').agg({
    'is_correct': 'sum',
    'total_attempt_cnt': 'sum',
    'has_teacher_cnt': 'max',   
    'points': 'sum',
    #'difficulty_easy': 'max',  # One-hot encoding for difficulty
    #'difficulty_normal': 'max',
    #'difficulty_hard': 'max'
})
student_grouped['AAA'] = student_grouped['is_correct'] / student_grouped['total_attempt_cnt']

student_grouped.reset_index(inplace=True)

df_LogProblem = df_LogProblem.merge(student_grouped[['uuid', 'AAA']], on='uuid', how='left')


In [53]:

x = df_LogProblem[['total_sec_taken','has_teacher_cnt','points']].values
y = df_LogProblem['AAA'].values


In [54]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [55]:
non_nan_indices = ~np.isnan(y)
x = x[non_nan_indices]
y = y[non_nan_indices]

In [56]:
batch_size = 1000
def model(x, y, learning_rate, iterations, batch_size):
    m, n = x.shape
    theta = np.zeros((n, 1))

    for iteration in range(iterations):
        # Shuffle and split the data into mini-batches
        indices = np.arange(m)
        np.random.shuffle(indices)
        for i in range(0, m, batch_size):
            batch_indices = indices[i:i + batch_size]
            x_batch = x[batch_indices]
            y_batch = y[batch_indices]
            y_batch = y_batch.reshape(-1, 1)

            gradient_threshold = 1.0

            # Compute the gradient and update theta for this mini-batch
            y_pred = np.dot(x_batch, theta)
            gradient = (1 / batch_size) * np.dot(x_batch.T, y_pred - y_batch)
            gradient = np.clip(gradient, -gradient_threshold, gradient_threshold)
            theta = theta - learning_rate * gradient

    return theta

In [57]:
learning_rate = 0.00000005
iterations = 100


In [58]:
theta = model(x, y, learning_rate, iterations, batch_size)


In [59]:
print("Learned theta:", theta)


Learned theta: [[1.01787854e-03]
 [8.50831095e-04]
 [1.06886752e-06]]


The first value corresponds to the coefficient for the total_sec_taken feature.

The second value corresponds to the coefficient for the has_teacher_cnt feature.

The third value corresponds to the coefficient for the points feature.

In [60]:
X_train, X_temp, y_train, y_temp = train_test_split(x, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [62]:
y_pred = np.dot(X_test, theta)  
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("MSE:", mse)
print("R-squared:", r2)


MAE: 0.27765737238332305
MSE: 0.12483867749192065
R-squared: -2.0351630257924342


MAE represents the average absolute error between the model's predictions and the actual values. Lower MAE values indicate better performance, and 0 would mean a perfect match.

MSE measures the average squared difference between predictions and actual values. Like MAE, lower MSE values indicate better performance. MSE is more sensitive to outliers than MAE.

R-squared (R²) is a measure of how well the model explains the variance in the data. An R² value close to 1 indicates that the model is a good fit for the data, while a value close to 0 suggests that the model is no better than predicting the mean of the target variable. A negative R² indicates that the model is performing worse than a horizontal line (predicting the mean) which is the case here.

POLYNOMIAL REGRESSION :

In [63]:
degree = 2  

poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

poly_reg = LinearRegression()
poly_reg.fit(X_train_poly, y_train)

y_pred = poly_reg.predict(X_test_poly)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("MSE:", mse)
print("R-squared:", r2)


MAE: 0.16570360778291787
MSE: 0.03916904946960977
R-squared: 0.04818024941517707


In [71]:
new_data = np.array([[5, 0, 1200000]])

new_data_poly = poly.transform(new_data)  

# Make predictions for the new AAA
new_AAA_prediction = poly_reg.predict(new_data_poly) 

print("Predicted AAA:", new_AAA_prediction[0])

Predicted AAA: 0.5149770301407934
