In [None]:
# Mount Google Drive under /content/drive; the dataset CSV read later in this
# notebook lives beneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install KerasTuner, which provides the RandomSearch tuner imported below.
# NOTE(review): the version is not pinned, so a later run may pick up a different release.
!pip install keras-tuner

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.regularizers import l1_l2
from keras_tuner.tuners import RandomSearch
import numpy as np
import joblib

In [None]:
# Load the study-schedule dataset from the mounted Google Drive
DATASET_CSV = '/content/drive/MyDrive/Colab Notebooks/DataSet_FYP/student_study_schedule_modified.csv'
df = pd.read_csv(DATASET_CSV)

In [None]:
# Preview the first rows of the freshly loaded data
df.head()

Unnamed: 0,ID No,Student No,Age,Sex,Stream,Subject,Difficulty Level,Previous Test Date,Previous Mark,Latest Test Date,Latest Mark,Date Count,Study Time
0,1,1,20,M,Physical science,Mathematics,3,16/03/2022,51,13/05/2022,60,58,80
1,2,1,20,M,Physical science,Physics,3,11/04/2022,38,19/04/2022,35,8,90
2,3,1,20,M,Physical science,ICT,5,10/06/2020,39,31/08/2020,39,82,160
3,4,2,22,M,Physical science,Mathematics,5,22/09/2023,20,06/10/2023,24,14,160
4,5,2,22,M,Physical science,Physics,2,07/11/2020,59,16/11/2020,51,9,70


In [None]:
# One-hot encode the 'Sex' column.
# The guard makes this cell safe to re-run: after the first run 'Sex' has been
# replaced by its indicator columns, and calling get_dummies on it again would fail.
if 'Sex' in df.columns:
    df = pd.get_dummies(df, columns=['Sex'])

In [None]:
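# Disabled: one-hot encoding of the 'Stream' and 'Subject' categorical columns.
# Both columns are dropped from the feature matrix below instead of being encoded.
# df = pd.get_dummies(df, columns=['Stream', 'Subject'])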

In [None]:
# Preview the frame again to check the result of encoding 'Sex'
df.head()

Unnamed: 0,ID No,Student No,Age,Stream,Subject,Difficulty Level,Previous Test Date,Previous Mark,Latest Test Date,Latest Mark,Date Count,Study Time,Sex_F,Sex_M
0,1,1,20,Physical science,Mathematics,3,16/03/2022,51,13/05/2022,60,58,80,False,True
1,2,1,20,Physical science,Physics,3,11/04/2022,38,19/04/2022,35,8,90,False,True
2,3,1,20,Physical science,ICT,5,10/06/2020,39,31/08/2020,39,82,160,False,True
3,4,2,22,Physical science,Mathematics,5,22/09/2023,20,06/10/2023,24,14,160,False,True
4,5,2,22,Physical science,Physics,2,07/11/2020,59,16/11/2020,51,9,70,False,True


In [None]:
# Split the frame into the regression target and the model inputs
TARGET_COLUMN = 'Study Time'
# Identifier, raw-date and un-encoded categorical columns are left out of the features
EXCLUDED_COLUMNS = ['Previous Test Date', 'Latest Test Date', 'ID No', 'Student No', 'Stream', 'Subject']

y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN, *EXCLUDED_COLUMNS])


In [None]:
# Scale the features and target with min-max scaling.
#
# The scalers are fitted on the training rows only, so the min/max of the
# held-out test rows do not leak into preprocessing. train_test_split chooses
# rows from the sample count, test_size and random_state alone, so calling it
# here with the same test_size=0.2 / random_state=42 as the split cell selects
# exactly the rows that later become X_train / y_train.
# Keep these two values in sync with the split cell.
# Because the scalers never see the test rows, scaled test values may fall
# slightly outside [0, 1].
train_rows, _ = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)

scaler_X = MinMaxScaler()
scaler_X.fit(X.iloc[train_rows])
X_scaled = scaler_X.transform(X)

# MinMaxScaler expects 2-D input, hence the single-column reshape of the target
y_column = y.values.reshape(-1, 1)
scaler_y = MinMaxScaler()
scaler_y.fit(y_column[train_rows])
y_scaled = scaler_y.transform(y_column)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
TEST_FRACTION = 0.2
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# The LSTM layers take 3-D input shaped [samples, time steps, features].
# Each row becomes a sequence with a single time step by inserting a new middle axis.
X_train_reshaped = X_train[:, np.newaxis, :]
X_test_reshaped = X_test[:, np.newaxis, :]

In [None]:
# Define the model builder function for keras-tuner
def build_model(hp):
    """Build and compile a two-layer LSTM regressor for one tuner trial.

    Args:
        hp: keras-tuner hyperparameter container. Four values are sampled
            from it: ``units`` (32-512, step 32), ``l1`` and ``l2``
            (1e-5 to 1e-2, log-sampled) and ``dropout`` (0-0.5, step 0.1).

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, mean-squared-error
        loss) whose last layer is a single-unit ``Dense`` output.

    Note:
        The input shape is read from the notebook-level ``X_train_reshaped``
        array, so that variable must exist before this function is called.
    """
    # Sample each hyperparameter once and reuse it: both LSTM layers share
    # `units` and both Dropout layers share `dropout`.
    units = hp.Int('units', min_value=32, max_value=512, step=32)
    l1_strength = hp.Float('l1', 1e-5, 1e-2, sampling='log')
    l2_strength = hp.Float('l2', 1e-5, 1e-2, sampling='log')
    dropout_rate = hp.Float('dropout', 0, 0.5, step=0.1)

    model = Sequential()
    # First LSTM returns the full sequence so it can feed the second LSTM;
    # only this layer's kernel is L1/L2-regularized.
    model.add(LSTM(units=units,
                   activation='relu',
                   input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]),
                   return_sequences=True,
                   kernel_regularizer=l1_l2(l1=l1_strength, l2=l2_strength)))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(units=units, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [None]:
# Initialize keras tuner: random search over build_model's hyperparameters,
# ranking trials by validation loss.
# A fixed seed makes the sampled hyperparameter combinations repeatable.
# NOTE(review): trial results are written under my_dir/study_time_prediction;
# presumably an existing directory from an earlier run is reused — confirm, or
# pass overwrite=True to always start a fresh search.
tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=3,
    executions_per_trial=1,
    seed=42,
    directory='my_dir',
    project_name='study_time_prediction'
)

In [None]:
# Run the hyperparameter search; each trial trains briefly and is scored on a
# 20% validation slice of the training data.
SEARCH_FIT_OPTIONS = dict(epochs=3, validation_split=0.2, batch_size=64)
tuner.search(X_train_reshaped, y_train, **SEARCH_FIT_OPTIONS)

Trial 3 Complete [00h 02m 25s]
val_loss: 0.030886976048350334

Best val_loss So Far: 0.030886976048350334
Total elapsed time: 00h 15m 21s


In [None]:
# Get the best hyperparameters and build the final model
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
model = build_model(best_hps)

# Stop once val_loss has not improved for 10 epochs. restore_best_weights rolls
# the model back to the best epoch; without it the model would keep the weights
# from the last epoch trained, i.e. after 10 epochs without improvement.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

model.fit(X_train_reshaped, y_train, epochs=50, batch_size=64, validation_split=0.2, verbose=1,
          callbacks=[early_stopping])

In [None]:
# Evaluate on the held-out set, reporting MSE in the target's original units.
# `lstm_predictions` holds predictions that are already inverse-transformed.
lstm_predictions = scaler_y.inverse_transform(model.predict(X_test_reshaped))
y_test_original = scaler_y.inverse_transform(y_test)

lstm_mse = mean_squared_error(y_test_original, lstm_predictions)
print(f"LSTM MSE: {lstm_mse}")

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Bring the true targets back to the original scale.
# `lstm_predictions` was already inverse-transformed in the evaluation cell, so
# it is used as-is: passing it through scaler_y.inverse_transform a second time
# would rescale the predictions again and distort every metric below.
actual_study_time = scaler_y.inverse_transform(y_test)
predicted_study_time = lstm_predictions

# Calculate the regression metrics
mae = mean_absolute_error(actual_study_time, predicted_study_time)
mse = mean_squared_error(actual_study_time, predicted_study_time)
rmse = np.sqrt(mse)
r2 = r2_score(actual_study_time, predicted_study_time)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")


In [None]:
# Save the model and scalers
# The model is written twice, once as a .keras file and once as an .h5 file.
model.save('best_lstm_model.keras')
model.save('best_lstm_model.h5')
# Both fitted scalers are persisted alongside the model; presumably they are
# needed to scale new inputs and to inverse-transform predictions when the
# saved model is reused.
joblib.dump(scaler_X, 'scaler_X.pkl')
joblib.dump(scaler_y, 'scaler_y.pkl')

  saving_api.save_model(


['scaler_y.pkl']