In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [3]:


def drop_missing_descriptions(df):
    """
    Remove, in place, every row whose 'All Module Details' value is missing.

    The frame passed in is modified directly (rows dropped, index renumbered
    from 0) and that same object is returned so calls can be chained.
    """
    description_column = 'All Module Details'
    df.dropna(subset=[description_column], inplace=True)
    # Renumber the rows so the index is contiguous again after the removals.
    df.reset_index(inplace=True, drop=True)
    return df

# Domain-specific filler words that appear in most module descriptions and
# are filtered out in addition to NLTK's English stop words.
_ADDITIONAL_STOP_WORDS = frozenset({
    'module', 'modules', 'course', 'courses', 'level', 'aims', 'aim',
    'covers', 'essential', 'equip', 'students', 'provide', 'understanding',
    'develop',
})

# Lazily built (stop_words, stemmer, lemmatizer) triple shared by all calls.
_TEXT_RESOURCES = None


def _text_resources():
    """Build the stop-word set, stemmer and lemmatizer once and reuse them."""
    global _TEXT_RESOURCES
    if _TEXT_RESOURCES is None:
        stop_words = frozenset(stopwords.words('english')) | _ADDITIONAL_STOP_WORDS
        _TEXT_RESOURCES = (stop_words, PorterStemmer(), WordNetLemmatizer())
    return _TEXT_RESOURCES


def process_text(text):
    """
    Tokenize, remove stop words, stem, and lemmatize the text.

    The text is lower-cased and tokenized, tokens found in the stop-word set
    (NLTK English stop words plus the extra words above) are dropped, and each
    remaining token is stemmed and then lemmatized. The resulting tokens are
    joined with single spaces and returned as one string.

    The stop-word set, stemmer and lemmatizer are created on the first call
    and reused afterwards: this function is applied once per row, and
    rebuilding them every time repeats identical work.
    """
    # Tokenize first so a missing tokenizer resource is reported before any
    # other NLTK resource is touched (same failure order as before).
    tokens = word_tokenize(text.lower())

    stop_words, stemmer, lemmatizer = _text_resources()

    # NOTE(review): the lemmatizer is fed stems rather than the original
    # words; this order is kept so the output stays the same as before.
    processed_tokens = [
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in tokens
        if token not in stop_words
    ]

    return ' '.join(processed_tokens)

def preprocess_descriptions(df):
    """
    Replace each module description with its processed form.

    Every value in the 'All Module Details' column is passed through
    process_text and the column is overwritten on the given frame, which is
    then returned.
    """
    description_column = 'All Module Details'
    cleaned = df[description_column].apply(process_text)
    df[description_column] = cleaned
    return df

def generate_document_term_matrix(df):
    """
    Generate a document-term matrix for the preprocessed descriptions.

    A CountVectorizer with default settings is fitted on the
    'All Module Details' column.

    Returns a tuple (X, feature_names): X is the matrix returned by
    fit_transform, and feature_names is a list of the vocabulary terms
    indexed by the matrix's column position.
    """
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['All Module Details'])

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        # The replacement returns an array; convert to keep returning a list.
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()
    return X, feature_names

def apply_lda(X, feature_names, num_topics=5, num_top_words=10):
    """
    Apply LDA to the document-term matrix and print the top terms per topic.

    Parameters:
        X: document-term matrix to fit the model on.
        feature_names: sequence mapping a column index of X to its term.
        num_topics: number of topics (LDA components) to fit.
        num_top_words: how many of the highest-weighted terms to print for
            each topic (default 10, the previously hard-coded value).

    Raises:
        ValueError: if num_top_words is negative.

    Nothing is returned; the topics are written to standard output.
    """
    if num_top_words < 0:
        raise ValueError("num_top_words must be non-negative")

    # Fixed random_state keeps the printed topics reproducible between runs.
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=0)
    lda.fit(X)
    for topic_idx, topic in enumerate(lda.components_):
        print(f"Topic {topic_idx}:")
        # argsort is ascending, so reverse it to get the heaviest terms first.
        top_feature_indices = topic.argsort()[::-1][:num_top_words]
        for feature_index in top_feature_indices:
            print(f"\t{feature_names[feature_index]}")
        print("")

def find_topics(df, num_topics=3):
    """
    Run the whole topic-discovery pipeline on a dataframe.

    Rows without a description are dropped, the remaining descriptions are
    preprocessed, a document-term matrix is built from them, and LDA with
    num_topics topics is applied to that matrix.
    """
    cleaned = preprocess_descriptions(drop_missing_descriptions(df))
    term_matrix, vocabulary = generate_document_term_matrix(cleaned)
    apply_lda(term_matrix, vocabulary, num_topics)

def merge_data(selected_columns):
    """
    Read the course files of all universities and stack the chosen columns.

    Each source file is loaded with the reader matching its format, reduced
    to selected_columns, and the pieces are concatenated row-wise in the
    order NTU, SIT, NUS, SMU, SUSS, SUTD. The returned frame has a fresh
    0-based index.
    """
    # (reader, path) for every source, in the order they are concatenated.
    sources = (
        (pd.read_excel, "../../Backend/Data/university_courses_graph/NTU_course_info.xlsx"),
        (pd.read_csv, "../../Backend/Data/university_courses_graph/SIT_Module_Info.csv"),
        (pd.read_csv, "../../Backend/Data/university_courses_graph/nus_dsa_mods.csv"),
        (pd.read_csv, "../../Backend/Data/university_courses_graph/SMU_course_info.csv"),
        (pd.read_csv, "../../Backend/Data/university_courses_graph/SUSS_course_info.csv"),
        (pd.read_csv, "../../Backend/Data/university_courses_graph/SUTD_course_info.csv"),
    )

    # Load every file before touching any columns, then trim each frame.
    loaded_frames = [reader(path) for reader, path in sources]
    trimmed_frames = [frame.loc[:, selected_columns] for frame in loaded_frames]

    # Stack the pieces and discard the per-file indices.
    return pd.concat(trimmed_frames, axis=0).reset_index(drop=True)

# Columns pulled from each university's course file for the analysis.
selected_columns = [
    "University",
    "Module Title",
    "Module Code",
    "All Module Details",
    "Module Level",
]

# Build the combined course table, then print the topics found in it.
df = merge_data(selected_columns)
find_topics(df)




Topic 0:
	algorithm
	statist
	problem
	distribut
	method
	comput
	analysi
	search
	use
	model

Topic 1:
	learn
	comput
	problem
	data
	program
	machin
	basic
	design
	use
	model

Topic 2:
	data
	system
	use
	design
	softwar
	learn
	applic
	network
	model
	problem

