In [1]:
from pathlib import Path

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:

def drop_missing_descriptions(df):
    """
    Remove, in place, every row whose 'All Module Details' value is missing.

    The dataframe passed in is modified directly and that same object is
    returned. Surviving rows keep their original index labels, so the index
    may contain gaps afterwards.
    """
    description_column = 'All Module Details'
    df.dropna(axis='index', subset=[description_column], inplace=True)
    return df
    
def preprocess_descriptions(df):
    """
    Replace each module description with its processed form.

    Every value in the 'All Module Details' column is passed through
    process_text and the column is overwritten with the results. The
    dataframe is modified directly and returned.
    """
    description_column = 'All Module Details'
    processed = df[description_column].apply(process_text)
    df[description_column] = processed
    return df


def generate_tfidf_matrix(df):
    """
    Build the tf-idf matrix for the module descriptions.

    A fresh TfidfVectorizer with default settings is fitted on the
    'All Module Details' column; the transformed matrix (one row per
    dataframe row) is returned. The fitted vectorizer itself is discarded.
    """
    descriptions = df['All Module Details']
    return TfidfVectorizer().fit_transform(descriptions)


def calculate_cosine_similarity(X):
    """
    Return the pairwise cosine similarity between the rows of X.

    X is the tf-idf matrix; entry [i][j] of the result is the similarity
    between row i and row j of X.
    """
    return cosine_similarity(X)


def get_label_name(index, df):
    """
    Build a human-readable label for one module.

    `index` is looked up as a row label (via df.loc), and the result has the
    form "<University> | <Module Code>: <Module Title>".
    """
    module_code, module_title, university = (
        df.loc[index, column]
        for column in ('Module Code', 'Module Title', "University")
    )
    return f"{university} | {module_code}: {module_title}"


def find_similar_modules(df, threshold=0.5):
    """
    Print every pair of modules whose descriptions are more similar than `threshold`.

    Rows without a description are ignored. The remaining descriptions are
    preprocessed, turned into tf-idf vectors and compared pairwise with
    cosine similarity. Each pair (i, j) with i < j whose score is strictly
    greater than `threshold` is printed as two module labels followed by the
    score.

    Parameters:
        df: dataframe with at least the columns 'All Module Details',
            'University', 'Module Code' and 'Module Title'. It is not
            modified; all work happens on a copy.
        threshold: minimum (exclusive) cosine similarity for a pair to be
            reported.

    Returns:
        None. Results are written to standard output.
    """
    # Work on a copy so the caller's dataframe is neither shrunk nor has its
    # description column overwritten by the preprocessing step.
    working = drop_missing_descriptions(df.copy())

    # Dropping rows leaves gaps in the index, while the similarity matrix is
    # addressed by position. get_label_name looks rows up by label, so the
    # labels must be renumbered 0..n-1 to line up with matrix positions.
    working = working.reset_index(drop=True)

    n_modules = len(working)
    if n_modules < 2:
        # Fewer than two described modules: there is no pair to compare.
        return

    working = preprocess_descriptions(working)
    X = generate_tfidf_matrix(working)
    cosine_sim = calculate_cosine_similarity(X)

    # Only the upper triangle (j > i) is visited so each pair is reported once
    # and a module is never compared with itself.
    for i in range(n_modules):
        for j in range(i + 1, n_modules):
            score = cosine_sim[i][j]
            if score > threshold:
                module1 = get_label_name(i, working)
                module2 = get_label_name(j, working)
                print(f"{module1}\n{module2}\nSimilarity Score: {score}\n")
                
                
def merge_data(
    selected_columns,
    data_dir="/Users/micolechan/Desktop/dsa3101/project/dsa3101-2220-12-ds/Backend/Data/university_courses_graph",
    filenames=(
        "NTU_course_info.xlsx",
        "SIT_Module_Info.csv",
        "nus_dsa_mods.csv",
        "SMU_course_info.csv",
        "SUSS_course_info.csv",
        "SUTD_course_info.csv",
    ),
):
    """
    Load the course files, keep only the desired columns and stack them.

    Parameters:
        selected_columns: list of column names to keep from every file. Each
            file must contain all of them, otherwise the column selection
            raises KeyError.
        data_dir: directory that holds the course files. Defaults to the
            original developer's location; pass your own path to run the
            code on another machine.
        filenames: names of the files inside `data_dir`, in the order their
            rows should appear in the result. Files ending in .xlsx or .xls
            are read with pandas.read_excel, everything else with
            pandas.read_csv.

    Returns:
        A single dataframe containing the selected columns of every file,
        concatenated row-wise, with a fresh 0..n-1 index.
    """
    base_dir = Path(data_dir)
    excel_suffixes = {".xlsx", ".xls"}

    # Read every file before selecting columns so that a missing file is
    # reported before any missing-column problem.
    loaded_frames = []
    for filename in filenames:
        path = base_dir / filename
        if path.suffix.lower() in excel_suffixes:
            loaded_frames.append(pd.read_excel(path))
        else:
            loaded_frames.append(pd.read_csv(path))

    selected_frames = [frame.loc[:, selected_columns] for frame in loaded_frames]

    # Each file brings its own 0-based index; drop those and renumber so row
    # labels are unique in the merged dataframe.
    merged_df = pd.concat(selected_frames, axis=0)
    return merged_df.reset_index(drop=True)

# Columns pulled from each university's course file before merging.
selected_columns = [
    "University",
    "Module Title",
    "Module Code",
    "All Module Details",
    "Module Level",
]

# Merge the course files, then print every pair of similar modules.
df = merge_data(selected_columns)
find_similar_modules(df)



NTU | CZ1007: Data Structures& Algorithms
NTU | CZ2001: Algorithm Design and Analysis
Similarity Score: 0.6579124926342328

NTU | CZ1007: Data Structures& Algorithms
SIT | ICT1008: Data Structures and Algorithms
Similarity Score: 0.5344445948406812

NTU | CZ1007: Data Structures& Algorithms
NUS | CS2040: Data Structures and Algorithms
Similarity Score: 0.6345751342186826

NTU | CZ1007: Data Structures& Algorithms
SUTD | 50.003: Elements of Software Construction
Similarity Score: 0.5011072387455398

NTU | CZ2001: Algorithm Design and Analysis
SUTD | 50.003: Elements of Software Construction
Similarity Score: 0.5650395497547952

NTU | CZ2006: Software Engineering
SIT | ICT2101: Introduction to Software Engineering
Similarity Score: 0.5250684435691017

NTU | CZ2006: Software Engineering
SUTD | 50.002: Computation Structures
Similarity Score: 0.5156929319818436

NTU | MH2500: Probability and Introduction to Statistics
NUS | ST2131: Probability
Similarity Score: 0.6800391319593884

NTU | MH