In [6]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

This code is designed to find similar university course modules using cosine similarity. The code first reads in course information from multiple universities, including NTU, SIT, NUS, SMU, SUSS, and SUTD, and selects the desired columns. Then, the code preprocesses the module descriptions by removing stop words and stemming the text. The preprocessed descriptions are used to generate a matrix of tf-idf scores, which is used to calculate the cosine similarity matrix for the tf-idf matrix. The find_similar_modules function then iterates over each pair of modules and calculates their cosine similarity scores. If the score is above a certain threshold, which is set to 0.5 by default, the function outputs the module codes, titles, and universities of the two similar modules along with their similarity score. This output helps identify similar modules across different universities, which may be useful for students interested in exploring different course options.

In [21]:

def drop_missing_descriptions(df):
    """
    Drop rows with missing module descriptions.
    """
    df.dropna(subset=['All Module Details'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

def process_text(text):
    """
    Tokenize, remove stop words, stem, and lemmatize the text.
    """
    # Tokenize the text into words
    tokens = word_tokenize(text.lower())
    
    # Remove stop words from the tokens
    stop_words = set(stopwords.words('english'))

    # add more stop words that are not useful for your analysis
    additional_stop_words = set(['module', 'modules', 'course', 'courses', 'level', "aims", "aim", "covers", "essential", "equip", "students", "provide", "understanding","develop"])
    stop_words = stop_words | additional_stop_words

    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Perform stemming and lemmatization on each token
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
    
    # Join the stemmed and lemmatized tokens back into a single text string
    processed_text = ' '.join(lemmatized_tokens)
    
    return processed_text

def preprocess_descriptions(df):
    """
    Apply the text processing function to the module descriptions.
    """
    df['All Module Details'] = df['All Module Details'].apply(process_text)
    return df


def generate_tfidf_matrix(df):
    """
    Apply the vectorizer to the preprocessed descriptions to generate a matrix of tf-idf scores.
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df['All Module Details'])
    return X


def calculate_cosine_similarity(X):
    """
    Calculate the cosine similarity matrix for the tf-idf matrix.
    """
    cosine_sim = cosine_similarity(X)
    return cosine_sim


def get_label_name(index, df):
    """
    Get the label name of a module given its index and dataframe.
    """
    module_code = df.loc[index, 'Module Code']
    module_title = df.loc[index, 'Module Title']
    university = df.loc[index, "University"]
    return f"{university} | {module_code}: {module_title}"


def find_similar_modules(df, threshold=0.5):
    """
    Find similar modules in the dataframe using cosine similarity.
    """
    df = drop_missing_descriptions(df)
    df = preprocess_descriptions(df)
    X = generate_tfidf_matrix(df)
    cosine_sim = calculate_cosine_similarity(X)

    for i in range(len(df)):
        for j in range(i+1, len(df)):
            if cosine_sim[i][j] > threshold:
                module1 = get_label_name(i, df)
                module2 = get_label_name(j, df)
                print(f"{module1}\n{module2}\nSimilarity Score: {cosine_sim[i][j]}\n")
                
                
def merge_data(selected_columns):
    """
    Load dataframes from files and select only desired columns.
    """
    ntu_df = pd.read_excel("/Users/micolechan/Desktop/dsa3101/project/dsa3101-2220-12-ds/Backend/Data/university_courses_graph/NTU_course_info.xlsx")
    sit_df = pd.read_csv("/Users/micolechan/Desktop/dsa3101/project/dsa3101-2220-12-ds/Backend/Data/university_courses_graph/SIT_Module_Info.csv")
    nus_df = pd.read_csv("/Users/micolechan/Desktop/dsa3101/project/dsa3101-2220-12-ds/Backend/Data/university_courses_graph/nus_dsa_mods.csv")
    smu_df = pd.read_csv("/Users/micolechan/Desktop/dsa3101/project/dsa3101-2220-12-ds/Backend/Data/university_courses_graph/SMU_course_info.csv")
    suss_df = pd.read_csv("/Users/micolechan/Desktop/dsa3101/project/dsa3101-2220-12-ds/Backend/Data/university_courses_graph/SUSS_course_info.csv")
    sutd_df = pd.read_csv("/Users/micolechan/Desktop/dsa3101/project/dsa3101-2220-12-ds/Backend/Data/university_courses_graph/SUTD_course_info.csv")

    ntu_selected = ntu_df.loc[:, selected_columns]
    sit_selected = sit_df.loc[:, selected_columns]
    nus_selected = nus_df.loc[:, selected_columns]
    smu_selected = smu_df.loc[:, selected_columns]
    suss_selected = suss_df.loc[:, selected_columns]
    sutd_selected = sutd_df.loc[:, selected_columns]

    # Concatenate the selected dataframes
    merged_df = pd.concat([ntu_selected, sit_selected, nus_selected, smu_selected, suss_selected, sutd_selected], axis=0)

    # Reset the index of the merged dataframe
    df = merged_df.reset_index(drop=True)
    return df

selected_columns = ["University", "Module Title", "Module Code", "All Module Details", "Module Level"]
df = merge_data(selected_columns)
find_similar_modules(df)

NTU | CZ1007: Data Structures& Algorithms
NTU | CZ2001: Algorithm Design and Analysis
Similarity Score: 0.6579124926342328

NTU | CZ1007: Data Structures& Algorithms
SIT | ICT1008: Data Structures and Algorithms
Similarity Score: 0.5344445948406812

NTU | CZ1007: Data Structures& Algorithms
NUS | CS2040: Data Structures and Algorithms
Similarity Score: 0.6345751342186826

NTU | CZ1007: Data Structures& Algorithms
SUTD | 50.004: Algorithms
Similarity Score: 0.5011072387455398

NTU | CZ2001: Algorithm Design and Analysis
SUTD | 50.004: Algorithms
Similarity Score: 0.5650395497547952

NTU | CZ2006: Software Engineering
SIT | ICT2101: Introduction to Software Engineering
Similarity Score: 0.5250684435691017

NTU | CZ2006: Software Engineering
SUTD | 50.003: Elements of Software Construction
Similarity Score: 0.5156929319818436

NTU | MH2500: Probability and Introduction to Statistics
NUS | ST2131: Probability
Similarity Score: 0.6800391319593884

NTU | MH2500: Probability and Introduction 

In [25]:
def find_similar_modules_gephi(df, threshold=0.5):
    """
    Find similar modules in the dataframe using cosine similarity and output a dataframe with nodes and edges.csv for Gephi.
    """
    df = drop_missing_descriptions(df)
    df = preprocess_descriptions(df)
    X = generate_tfidf_matrix(df)
    cosine_sim = calculate_cosine_similarity(X)

    nodes = []
    edges = []

    for i in range(len(df)):
        node_id = i
        node_label = f"{df.loc[i, 'University']} | {df.loc[i, 'Module Code']}: {df.loc[i, 'Module Title']}"
        nodes.append((node_id, node_label))

    for i in range(len(df)):
        for j in range(i+1, len(df)):
            if cosine_sim[i][j] > threshold:
                node1_id = i
                node2_id = j
                weight = cosine_sim[i][j]
                edges.append((node1_id, node2_id, weight))

    nodes_df = pd.DataFrame(nodes, columns=['Id', 'Label'])
    edges_df = pd.DataFrame(edges, columns=['Source', 'Target', 'Weight'])

    nodes_df['Type'] = 'Module'
    edges_df['Type'] = 'Undirected'

    nodes_df.to_csv('/Users/micolechan/Desktop/dsa3101/project/dsa3101-2220-12-ds/Backend/Code/Graph/Similarity/similarity_nodes.csv', index=False)
    edges_df.to_csv('/Users/micolechan/Desktop/dsa3101/project/dsa3101-2220-12-ds/Backend/Code/Graph/Similarity/similarity_edges.csv', index=False)

find_similar_modules_gephi(df, 0.2)

In [18]:
df = drop_missing_descriptions(df)
df = preprocess_descriptions(df)
df

Unnamed: 0,University,Module Title,Module Code,All Module Details,Module Level
0,NTU,Introduction to Computational Thinking & Progr...,CZ1003,comput think ( ct ) process anali problem desi...,1.0
1,NTU,Calculus I and II,MH1805,"subject knowledg , logic reason analyt skill a...",1.0
2,NTU,Discrete Mathematics,MH1812,serv introduct variou topic discret mathemat ....,1.0
3,NTU,Inquiry and Communication in an Interdisciplin...,CC0001,write tool think . process organi thought word...,1.0
4,NTU,Navigating the Digital World,CC0002,"digitali becom new normal daili life , seek di...",1.0
...,...,...,...,...,...
173,SUTD,Parallel Computing on Multicore Architectures,50.049,henc core knowledg multicor processor architec...,
174,SUTD,Discrete Mathematics and Algorithm Design,50.05,in-depth comput scienc requir strong mathemat ...,
175,SUTD,Programming Language Concepts,50.051,", learn data repr , program compil . first par...",
176,SUTD,Extended Reality,50.052,extend realiti ( xr ) encapsul variou immer te...,
