In [None]:
# Mount Google Drive into the Colab runtime; every dataset path used below
# lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import math
from sklearn.preprocessing import normalize, OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from pickle import encode_long
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# Single source of truth for the dataset folder. The path used to be pasted into
# every read_excel call, which is how a one-character typo in one copy ends up
# as a FileNotFoundError at run time.
DATA_DIR = '/content/drive/MyDrive/Nhóm 5 - DS317.P11/Đồ án môn học/Dataset/Processed/Processed Data With Pre-Processed Data'

# These workbooks are read with their first column as the DataFrame index.
processed_student = pd.read_excel(f'{DATA_DIR}/processed_student.xlsx', index_col=0)
processed_course = pd.read_excel(f'{DATA_DIR}/processed_course.xlsx', index_col=0)
processed_score = pd.read_excel(f'{DATA_DIR}/processed_score.xlsx', index_col=0)
subject_popularity = pd.read_excel(f'{DATA_DIR}/subject_popularity.xlsx', index_col=0)
group_course = pd.read_excel(f'{DATA_DIR}/group_course.xlsx', index_col=0)

# This workbook is read WITHOUT index_col, unlike the others (kept as in the original).
# NOTE(review): group_sum_course is not used by any code visible in this notebook —
# the preprocessor is given `group_course` instead. Confirm which one is intended.
group_sum_course = pd.read_excel(f'{DATA_DIR}/group_sum_course.xlsx')

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Nhóm 5 - DS317.P11/Đồ án mô học/Dataset/Processed/Processed Data With Pre-Processed Data/processed_student.xlsx'

In [None]:
class DataPreprocessor:
    """Builds the model feature tables and answers per-student / per-curriculum look-ups.

    The input frames are kept by reference and are never modified:
      * ``student`` - read columns: ``mssv``, ``khoa``, ``namhoc_batdau``
      * ``score`` - read columns: ``mssv``, ``mamh``, ``diem``, ``sohocky``, ``namhoc``
      * ``course`` - read columns: ``mamh``, ``nhomloaimh``
      * ``subject`` - read columns: ``mamh``, ``dophobien``
      * ``group_course`` - read columns: ``khoa``, ``namhoc``, ``sohocky``,
        ``nhomloaimh``, ``somonhoc``
    """

    def __init__(self, student_df, score_df, course_df, subject_popularity_df, group_sum_course_df):
        self.student = student_df
        self.score = score_df
        self.course = course_df
        self.subject = subject_popularity_df
        self.group_course = group_sum_course_df
        self.ordinal_encoder = OrdinalEncoder()
        # {encoded group code (float) -> original ``nhomloaimh`` label}; filled by prepare_data().
        self.nhomloaimh_mapping = None

    def prepare_data(self):
        """Build student features, course features and the labelled interaction table.

        Side effects: sets ``self.student_features``, ``self.course_features``,
        ``self.nhomloaimh_mapping`` and (re)fits ``self.ordinal_encoder``.
        The method is idempotent: it always starts from the raw input frames and
        does not write into them.

        Returns:
            tuple: ``(student_features, course_features, interactions)`` where
            ``interactions`` has columns ``mssv``, ``mamh``, ``diem`` and a binary
            ``label`` (1 when ``diem >= 5``, else 0).
        """
        # --- Student features: one-hot encode the faculty ('khoa') ---
        student_features = self.student[['mssv', 'khoa', 'namhoc_batdau']].copy()
        one_hot_encoder = OneHotEncoder()
        khoa_one_hot = one_hot_encoder.fit_transform(student_features[['khoa']]).toarray()
        self.student_features = pd.concat(
            [
                student_features[['mssv', 'namhoc_batdau']],
                # Reuse the student index: concat(axis=1) aligns on index labels, so a
                # default RangeIndex here would produce NaN rows whenever the student
                # frame is not indexed 0..n-1.
                pd.DataFrame(khoa_one_hot, index=student_features.index),
            ],
            axis=1,
        )

        # --- Course features: scaled popularity + encoded course group ---
        subject_features = self.subject[['mamh', 'dophobien']].copy()
        scaler = MinMaxScaler()
        subject_features['dophobien_scaled'] = scaler.fit_transform(
            subject_features[['dophobien']]
        ).ravel()

        # Encode on a copy so the caller's course frame keeps its original labels.
        # Overwriting it in place made a second call encode already-encoded values
        # and turned the mapping into code -> code.
        course_groups = self.course[['mamh', 'nhomloaimh']].copy()
        encoded_groups = self.ordinal_encoder.fit_transform(course_groups[['nhomloaimh']]).ravel()
        self.nhomloaimh_mapping = {
            float(code): label
            for code, label in zip(encoded_groups, course_groups['nhomloaimh'])
        }
        course_groups['nhomloaimh'] = encoded_groups

        self.course_features = pd.merge(subject_features, course_groups, on='mamh', how='left')

        # --- Interactions: pass/fail label from the grade ---
        interactions = self.score[['mssv', 'mamh', 'diem']].copy()
        interactions['label'] = (interactions['diem'] >= 5).astype(int)  # 1 if passed, 0 if failed
        return self.student_features, self.course_features, interactions

    def get_group_course(self, faculty: str, year: int, term: int) -> pd.DataFrame:
        """Return how many courses of each group are taken by ``faculty`` in ``term``.

        Rows are selected with a three-step fallback:
          1. rows of exactly ``year``;
          2. if none, rows of any earlier year;
          3. if still none, rows of every year.
        (Previously step 1 ignored ``year``, which made steps 2 and 3 unreachable.)

        Returns:
            pd.DataFrame: columns ``nhomloaimh`` and ``somonhoc``; ``somonhoc`` is the
            mean over the selected rows of each group, rounded up to an integer.
        """
        groups = self.group_course
        same_faculty_term = (groups['khoa'] == faculty) & (groups['sohocky'] == term)

        selected = groups.loc[same_faculty_term & (groups['namhoc'] == year)]
        if selected.empty:
            selected = groups.loc[same_faculty_term & (groups['namhoc'] < year)]
        if selected.empty:
            selected = groups.loc[same_faculty_term]

        result = selected[['nhomloaimh', 'somonhoc']].groupby('nhomloaimh').mean()
        result['somonhoc'] = result['somonhoc'].apply(math.ceil)
        return result.reset_index()

    def get_faculty(self, mssv: str) -> str:
        """Return the faculty (``khoa``) of the first student row matching ``mssv``.

        Raises:
            IndexError: if no student row has this ``mssv``.
        """
        faculties = self.student.loc[self.student['mssv'] == mssv, 'khoa']
        if faculties.empty:
            raise IndexError(f"No student found with mssv={mssv!r}")
        return str(faculties.iloc[0])

    def get_year(self, mssv: str, term: int) -> int:
        """Return ``namhoc`` of the first score row of ``mssv`` in ``term``.

        Raises:
            IndexError: if the student has no score row for that term.
        """
        years = self.score.loc[
            (self.score['mssv'] == mssv) & (self.score['sohocky'] == term), 'namhoc'
        ]
        if years.empty:
            raise IndexError(f"No score rows for mssv={mssv!r} in term {term}")
        return int(years.iloc[0])

    def get_highest_term(self, mssv: str) -> int:
        """Return the largest ``sohocky`` among the score rows of ``mssv``."""
        return int(self.score[self.score['mssv'] == mssv]['sohocky'].max())

class RecommendationSystem:
    """Recommends not-yet-taken courses per course group, ranked by the MLP's output."""

    def __init__(self, mlp_model, preprocessor):
        # mlp_model must expose `.predict(X)` and `.model.input_shape`;
        # preprocessor must already have had `prepare_data()` called.
        self.mlp_model = mlp_model
        self.preprocessor = preprocessor

    def recommend_by_group(self, mssv, term):
        """Recommend courses for student ``mssv`` for term ``term``.

        For every course group listed in the curriculum table of the student's
        faculty, the courses of that group the student has not taken *before*
        ``term`` are scored, and the top ``somonhoc`` of them are kept.

        Args:
            mssv: student id.
            term: term to recommend for. If it exceeds the student's highest
                recorded term, the highest recorded term is used for the
                year / curriculum look-up.

        Returns:
            pd.Series: recommended course ids (``mamh``), concatenated group by
            group in curriculum order. Empty when nothing can be recommended.
        """
        pre = self.preprocessor

        # Faculty, and the term/year used to look up the curriculum.
        faculty = pre.get_faculty(mssv)
        lookup_term = min(term, pre.get_highest_term(mssv))
        year = pre.get_year(mssv, lookup_term)
        group_course = pre.get_group_course(faculty, year, lookup_term)

        # Translate curriculum group labels into the codes used in course_features.
        # A freshly fitted encoder must NOT be used here: it would number only the
        # groups present in this curriculum slice, and those numbers generally
        # differ from the preprocessor's encoding.
        label_to_code = {label: code for code, label in pre.nhomloaimh_mapping.items()}

        # Only terms strictly before the requested one count as history. Using the
        # full record would exclude exactly the courses the student went on to take
        # in `term`, so an evaluation against that term could never score a hit.
        history = pre.score[(pre.score['mssv'] == mssv) & (pre.score['sohocky'] < term)]
        learned_courses = history['mamh'].unique()
        avg_past_score = history['diem'].mean()
        if pd.isna(avg_past_score):
            # No history yet: the term is a per-student constant and does not
            # affect the ranking, but NaN would make argsort meaningless.
            avg_past_score = 0.0

        # Loop-invariant pieces of the model input.
        student_feature = pre.student_features[pre.student_features['mssv'] == mssv]
        student_vector = student_feature.drop(columns=['mssv']).values
        expected_input_shape = self.mlp_model.model.input_shape[1]

        recommendations_by_group = {}
        for group_label, quota in zip(group_course['nhomloaimh'], group_course['somonhoc']):
            group_id = label_to_code.get(group_label)
            if group_id is None or pd.isna(group_id):
                # Group unknown to the course table: nothing to pick from.
                continue

            # Courses of this group that the student has not taken yet.
            courses_in_group = pre.course_features[pre.course_features['nhomloaimh'] == group_id]
            courses_not_learned = courses_in_group[
                ~courses_in_group['mamh'].isin(learned_courses)
            ].drop_duplicates(subset='mamh')
            if courses_not_learned.empty:
                continue

            # One input row per candidate: [student features | course features].
            course_ids = courses_not_learned['mamh'].values
            course_matrix = courses_not_learned.drop(columns=['mamh']).values
            student_course_vectors = np.hstack(
                [np.repeat(student_vector, len(course_matrix), axis=0), course_matrix]
            ).astype(np.float32)

            # Best-effort guard kept from the original: drop surplus trailing
            # columns if the input is wider than the model expects.
            if student_course_vectors.shape[1] != expected_input_shape:
                student_course_vectors = student_course_vectors[:, :expected_input_shape]

            scores = np.asarray(self.mlp_model.predict(student_course_vectors)).flatten()

            # Re-rank with course popularity and the student's average past grade
            # (weights are tunable).
            popularity_scores = courses_not_learned['dophobien_scaled'].values
            final_scores = 0.6 * scores + 0.2 * popularity_scores + 0.2 * avg_past_score

            # Keep as many courses as the curriculum prescribes for this group.
            top_n = int(quota)
            top_indices = np.argsort(final_scores)[::-1][:top_n]
            recommendations_by_group[group_label] = course_ids[top_indices]

        recommendations = [
            course for courses in recommendations_by_group.values() for course in courses
        ]
        # An explicit dtype avoids pandas' ambiguous default for an empty Series.
        return pd.Series(recommendations, dtype=object if not recommendations else None)

class MLPModel:
    """Small binary classifier (128 -> 64 -> 1, sigmoid output) wrapped around Keras.

    Compiled with Adam, binary cross-entropy loss and the accuracy metric.
    """

    def __init__(self, input_size, learning_rate=0.00001):
        """
        Args:
            input_size: number of features in one input row.
            learning_rate: Adam learning rate.
        """
        self.model = Sequential([
            # Explicit Input layer instead of the deprecated `input_dim=` argument;
            # the model is built immediately, so `self.model.input_shape` is available.
            tf.keras.Input(shape=(input_size,)),
            Dense(128, activation='relu'),
            Dropout(0.2),
            Dense(64, activation='relu'),
            Dropout(0.2),
            Dense(1, activation='sigmoid')
        ])
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        self.model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    def train(self, X_train, y_train, X_val, y_val, epochs=10, batch_size=32):
        """Fit the model and return the Keras ``History`` object.

        Calling ``train`` again continues from the current weights; it does not
        re-initialise the model.
        """
        return self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs, batch_size=batch_size, verbose=2
        )

    def predict(self, X):
        """Return the model output for ``X`` (one sigmoid unit per row)."""
        # verbose=0: predict is called once per course group per student, and the
        # default progress bar would flood the notebook output.
        return self.model.predict(X, verbose=0)

In [None]:
# Build the preprocessor and derive the three modelling tables.
# NOTE(review): the fifth constructor parameter is named `group_sum_course_df`, but
# `group_course` is what gets passed here — confirm this is the intended frame.
preprocessor = DataPreprocessor(
    processed_student,
    processed_score,
    processed_course,
    subject_popularity,
    group_course,
)
student_features, course_features, interactions = preprocessor.prepare_data()

# Attach the student's features, then the course's features, to every interaction.
merged_data = pd.merge(interactions, student_features, on='mssv')
merged_data = pd.merge(merged_data, course_features, on='mamh')

# Target = pass/fail label; inputs = every remaining column except ids and raw grade.
y = merged_data['label'].values
X = merged_data.drop(columns=['mssv', 'mamh', 'diem', 'label']).values

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

def evaluate(mssv: str, term: int, recommended_courses: "pd.Series | list[str]", score_df=None):
    """Score a recommendation list against the courses a student actually took.

    Args:
        mssv: student id.
        term: term (``sohocky``) whose enrolments are the ground truth.
        recommended_courses: recommended course ids; any sequence or pandas
            Series. Duplicates are counted once.
        score_df: frame with ``mssv``, ``sohocky`` and ``mamh`` columns. Defaults
            to the notebook-level ``processed_score``.

    Returns:
        tuple: ``(precision, recall, f1_score)``; all zero when nothing was
        recommended. Recall is 0 when the student has no course in ``term``.
    """
    if len(recommended_courses) == 0:
        return 0, 0, 0

    scores = processed_score if score_df is None else score_df
    actual_courses = scores.loc[
        (scores['mssv'] == mssv) & (scores['sohocky'] == term), 'mamh'
    ].unique()

    # Normalise to a de-duplicated Series: plain lists have no `.isin`, and a
    # course recommended twice must not be counted as two hits (recall > 1).
    recommended = pd.Series(list(recommended_courses)).drop_duplicates()

    true_positive_count = int(recommended.isin(actual_courses).sum())
    precision = true_positive_count / len(recommended)
    recall = true_positive_count / len(actual_courses) if len(actual_courses) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score


In [None]:
import os
import random
from tqdm import tqdm

random.seed(42)

# Students whose starting year is 2014.
student_2014 = processed_student[processed_student['namhoc_batdau'] == 2014]['mssv'].unique()

# Keep only those with score rows in every evaluated term.
terms = [2, 3, 4, 5]
filtered_scores = processed_score[processed_score['sohocky'].isin(terms) & processed_score['mssv'].isin(student_2014)]
student_with_all_terms = filtered_scores.groupby('mssv').filter(lambda x: set(x['sohocky']) == set(terms))['mssv'].unique()
student_with_all_terms = list(student_with_all_terms)

# Sort before sampling: iteration order of a set of strings changes between
# interpreter runs, so seeding `random` alone does not make the sample reproducible.
qualified_students = sorted(set(student_2014) & set(student_with_all_terms))
if not qualified_students:
    raise ValueError("No 2014 student has score rows in all of terms %s" % terms)
# random.sample raises if asked for more items than are available.
selected_students = random.sample(qualified_students, min(100, len(qualified_students)))

# Hyper-parameter grid (values unchanged; note that 10e-8 == 1e-7, ..., 10e2 == 1000).
lr_list = [10e-8, 10e-7, 10e-6, 10e-5, 10e-4, 10e-3, 10e-2, 10e-1, 10e1, 10e2]
epoch_range = 20
batch_size_list = [1, 2, 4, 8, 16, 32, 64, 128, 256]

gridsearch_results = []
for lr in tqdm(lr_list):
    for batch_size in batch_size_list:
        # Release the graph/state of previously built models; otherwise memory
        # grows with every model created in this loop.
        tf.keras.backend.clear_session()
        mlp = MLPModel(input_size=X_train.shape[1], learning_rate=lr)
        recommendation_system = RecommendationSystem(mlp, preprocessor)

        # Train one epoch at a time and evaluate after each, instead of retraining
        # from scratch for every epoch count (1 + 2 + ... + 20 = 210 epochs per
        # configuration). Keras `fit` continues from the current weights.
        for epoch in range(1, 1 + epoch_range):
            mlp.train(X_train, y_train, X_val, y_val, epochs=1, batch_size=batch_size)

            evaluation_results = []
            for term in terms:
                for mssv in selected_students:
                    recommended = recommendation_system.recommend_by_group(mssv, term)
                    precision, recall, f1_score = evaluate(mssv, term, recommended)
                    evaluation_results.append({
                        'mssv': mssv,
                        'term': term,
                        'precision': precision,
                        'recall': recall,
                        'f1_score': f1_score
                    })

            evaluation_df = pd.DataFrame(evaluation_results)
            gridsearch_results.append({
                'learning rate': lr,
                'epoch': epoch,
                'batch size': batch_size,
                'mean_precision': evaluation_df['precision'].mean(),
                'mean_recall': evaluation_df['recall'].mean(),
                'mean_f1_score': evaluation_df['f1_score'].mean()
            })

# Same row order as a (learning rate, epoch, batch size) nested loop.
gridsearch_df = (
    pd.DataFrame(gridsearch_results)
    .sort_values(['learning rate', 'epoch', 'batch size'], kind='stable')
    .reset_index(drop=True)
)

# The folder is 'Đồ án môn học' (as in the dataset paths); the previous spelling
# 'Đồ án mô học' does not exist on the drive and would make the final write fail
# after the whole search has run.
RESULTS_PATH = '/content/drive/MyDrive/Nhóm 5 - DS317.P11/Đồ án môn học/Gridsearch Finetune/Finetune With Pre-Processed Data/Baseline 5/gridsearch_results.xlsx'
os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)
gridsearch_df.to_excel(RESULTS_PATH)