In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [29]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
import numpy as np

In [30]:
df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DataSet_FYP/student_study_schedule_modified.csv')

In [31]:
df.head()

Unnamed: 0,ID No,Student No,Age,Sex,Stream,Subject,Difficulty Level,Previous Test Date,Previous Mark,Latest Test Date,Latest Mark,Date Count,Study Time
0,1,1,20,M,Physical science,Mathematics,3,16/03/2022,51,13/05/2022,60,58,80
1,2,1,20,M,Physical science,Physics,3,11/04/2022,38,19/04/2022,35,8,90
2,3,1,20,M,Physical science,ICT,5,10/06/2020,39,31/08/2020,39,82,160
3,4,2,22,M,Physical science,Mathematics,5,22/09/2023,20,06/10/2023,24,14,160
4,5,2,22,M,Physical science,Physics,2,07/11/2020,59,16/11/2020,51,9,70


In [32]:
df.nunique()

ID No                 303066
Student No            101022
Age                        7
Sex                        2
Stream                     3
Subject                    9
Difficulty Level           5
Previous Test Date      1461
Previous Mark             89
Latest Test Date        1550
Latest Mark              109
Date Count                90
Study Time                17
dtype: int64

In [33]:
# Check unique rows based on all columns
unique_rows = df.drop_duplicates()
print(f"Unique rows based on all columns: {len(unique_rows)}")

# Check unique rows based on a subset of columns, for example, 'Student No' and 'Subject'
unique_student_subject_combinations = df.drop_duplicates(subset=['Student No', 'Subject'])
print(f"Unique student-subject combinations: {len(unique_student_subject_combinations)}")


Unique rows based on all columns: 303066
Unique student-subject combinations: 303066


In [34]:
# manual geneder define
sex_mapping = {'M':0, 'F':1}
df['Sex'] = df['Sex'].map(sex_mapping)

In [35]:
# Assuming 'Stream' and 'Subject' as categorical features and others as numerical for simplicity
categorical_features = ['Stream', 'Subject']
# One-hot encoding for categorical features
df = pd.get_dummies(df, columns=categorical_features)

In [36]:
y = df['Study Time'].values

In [37]:
df.head(10)

Unnamed: 0,ID No,Student No,Age,Sex,Difficulty Level,Previous Test Date,Previous Mark,Latest Test Date,Latest Mark,Date Count,...,Stream_Physical science,Subject_Accounting,Subject_Agriculture,Subject_Biology,Subject_Business Studies,Subject_Chemistry,Subject_Economics,Subject_ICT,Subject_Mathematics,Subject_Physics
0,1,1,20,0,3,16/03/2022,51,13/05/2022,60,58,...,True,False,False,False,False,False,False,False,True,False
1,2,1,20,0,3,11/04/2022,38,19/04/2022,35,8,...,True,False,False,False,False,False,False,False,False,True
2,3,1,20,0,5,10/06/2020,39,31/08/2020,39,82,...,True,False,False,False,False,False,False,True,False,False
3,4,2,22,0,5,22/09/2023,20,06/10/2023,24,14,...,True,False,False,False,False,False,False,False,True,False
4,5,2,22,0,2,07/11/2020,59,16/11/2020,51,9,...,True,False,False,False,False,False,False,False,False,True
5,6,2,22,0,2,19/03/2020,55,17/05/2020,57,59,...,True,False,False,False,False,False,False,True,False,False
6,7,3,18,1,5,15/09/2020,61,23/11/2020,71,69,...,False,False,False,True,False,False,False,False,False,False
7,8,3,18,1,5,09/12/2022,69,27/12/2022,60,18,...,False,False,False,False,False,False,False,False,False,True
8,9,3,18,1,4,13/01/2023,41,20/03/2023,33,66,...,False,False,False,False,False,True,False,False,False,False
9,10,4,22,1,2,15/09/2020,76,11/11/2020,74,57,...,True,False,False,False,False,False,False,False,True,False


In [38]:
# Normalize numerical variables
numerical_features = ['Age', 'Difficulty Level', 'Previous Mark', 'Date Count']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [39]:
df.head()

Unnamed: 0,ID No,Student No,Age,Sex,Difficulty Level,Previous Test Date,Previous Mark,Latest Test Date,Latest Mark,Date Count,...,Stream_Physical science,Subject_Accounting,Subject_Agriculture,Subject_Biology,Subject_Business Studies,Subject_Chemistry,Subject_Economics,Subject_ICT,Subject_Mathematics,Subject_Physics
0,1,1,0.498595,0,-0.000887,16/03/2022,-0.243683,13/05/2022,60,0.47985,...,True,False,False,False,False,False,False,False,True,False
1,2,1,0.498595,0,-0.000887,11/04/2022,-0.836622,19/04/2022,35,-1.442412,...,True,False,False,False,False,False,False,False,False,True
2,3,1,0.498595,0,1.414378,10/06/2020,-0.791011,31/08/2020,39,1.402536,...,True,False,False,False,False,False,False,True,False,False
3,4,2,1.498347,0,1.414378,22/09/2023,-1.657614,06/10/2023,24,-1.21174,...,True,False,False,False,False,False,False,False,True,False
4,5,2,1.498347,0,-0.70852,07/11/2020,0.121203,16/11/2020,51,-1.403966,...,True,False,False,False,False,False,False,False,False,True


In [40]:
# Prepare features and target
X = df.drop(['ID No', 'Student No','Study Time', 'Previous Test Date', 'Latest Test Date'], axis=1).values
# Reshape target to be in the shape (n_samples, 1) for compatibility with Keras
y = y.reshape(-1, 1)

In [41]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
X_train = X_train.astype('float32')
y_train = y_train.astype('float32')
X_test = X_test.astype('float32')
y_test = y_test.astype('float32')

In [43]:
# Normalize target variable
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1))

In [44]:
# Adjust these parameters as needed
n_timesteps = 1 # Number of time steps in your data; adjusted based on your data's shape
n_features = X_train.shape[1] # Number of features

In [46]:
# Constructing the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(n_timesteps, n_features), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(50, activation='relu', return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mean_squared_error')

In [47]:
# Reshaping input to be 3D [samples, timesteps, features] for LSTM
X_train_reshaped = X_train.reshape((X_train.shape[0], n_timesteps, n_features))
X_test_reshaped = X_test.reshape((X_test.shape[0], n_timesteps, n_features))

In [48]:
# Check for NaN or inf values
if np.any(np.isnan(X_train_reshaped)) or np.any(np.isnan(y_train)):
    raise ValueError('The training data contains NaN values.')

if np.any(np.isinf(X_train_reshaped)) or np.any(np.isinf(y_train)):
    raise ValueError('The training data contains infinite values.')

In [49]:
# Train the model
history = model.fit(X_train_reshaped, y_train, epochs=20, validation_split=0.2, batch_size=64, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [54]:
# Evaluate the model
loss = model.evaluate(X_test_reshaped, y_test_scaled, verbose=1)
print(f'Test loss: {loss}')
lstm_predictions_scaled = model.predict(X_test_reshaped)

Test loss: 13411.1728515625


In [55]:
# lstm_predictions = model.predict(X_test_reshaped)
# print(f'Prediction: {lstm_predictions}')

In [56]:
# Inverse transform the predictions and actual values to their original scale
actual_study_time = scaler_y.inverse_transform(y_test_scaled)
predicted_study_time = scaler_y.inverse_transform(lstm_predictions_scaled)

In [57]:
# Calculate error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(actual_study_time, predicted_study_time)
mse = mean_squared_error(actual_study_time, predicted_study_time)
rmse = mean_squared_error(actual_study_time, predicted_study_time, squared=False)
r2 = r2_score(actual_study_time, predicted_study_time)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")


Mean Absolute Error (MAE): 4502.48876953125
Mean Squared Error (MSE): 21715238.0
Root Mean Squared Error (RMSE): 4659.9609375
R² Score: -13485.17311428645
