In [37]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the module-information CSV
def read_dataframe(filepath):
    """Read a CSV file and return its contents as a pandas DataFrame.

    Args:
        filepath: Location of the CSV; passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(filepath)

# Discard every row that has no module description
def drop_missing_descriptions(df):
    """Remove rows whose 'Module Description ' value is missing.

    The frame is modified in place and the same object is returned, so the
    caller's DataFrame loses those rows as well. Surviving rows keep their
    original index labels (the index is not renumbered).

    Args:
        df: DataFrame containing a 'Module Description ' column (note the
            trailing space in the column name).

    Returns:
        The same DataFrame object, now without the rows that lacked a
        description.
    """
    description_column = 'Module Description '
    df.dropna(axis=0, subset=[description_column], inplace=True)
    return df

# Resources shared by every process_text call; filled on first use by
# _get_text_tools so that importing this code never touches the NLTK corpora.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the (stop_words, stemmer, lemmatizer) triple, building it once."""
    if not _TEXT_TOOLS:
        # Build everything in locals first so a failure (e.g. the stop-word
        # corpus is unavailable) does not leave a half-filled cache behind.
        stop_words = set(stopwords.words('english'))

        # add more stop words that are not useful for your analysis
        stop_words |= {'module', 'modules', 'course', 'courses', 'level'}

        _TEXT_TOOLS.update(
            stop_words=frozenset(stop_words),
            stemmer=PorterStemmer(),
            lemmatizer=WordNetLemmatizer(),
        )
    return (
        _TEXT_TOOLS['stop_words'],
        _TEXT_TOOLS['stemmer'],
        _TEXT_TOOLS['lemmatizer'],
    )


# Process the text by tokenizing, removing stop words, stemming, and lemmatizing
def process_text(text):
    """Normalise one description into a space-separated string of tokens.

    The text is lower-cased and tokenized, English stop words plus a few
    domain words ('module', 'modules', 'course', 'courses', 'level') are
    removed, and each remaining token is stemmed and then lemmatized.
    Punctuation tokens emitted by the tokenizer are not removed here.

    The stop-word set, stemmer and lemmatizer are created on the first call
    and reused afterwards: this function runs once per DataFrame row, and
    rebuilding them every time re-reads the stop-word list for each row.

    Args:
        text: The raw description; must be a ``str`` (``.lower()`` is called
            on it), so missing values have to be dropped beforehand.

    Returns:
        The processed tokens joined by single spaces.
    """
    stop_words, stemmer, lemmatizer = _get_text_tools()

    # Tokenize the lower-cased text and drop the stop words
    kept_tokens = [
        token for token in word_tokenize(text.lower())
        if token not in stop_words
    ]

    # Stem first, then lemmatize the stem (same order as before).
    # NOTE(review): stems are often not dictionary words, so the lemmatizer
    # presumably leaves most of them unchanged - confirm both steps are wanted.
    normalised_tokens = [
        lemmatizer.lemmatize(stemmer.stem(token)) for token in kept_tokens
    ]

    # Join the stemmed and lemmatized tokens back into a single text string
    return ' '.join(normalised_tokens)

# Run process_text over every module description
def preprocess_descriptions(df):
    """Replace each module description with its processed form.

    The 'Module Description ' column is overwritten on the DataFrame that was
    passed in, so the raw descriptions are no longer available afterwards.

    Args:
        df: DataFrame with a 'Module Description ' column of strings.

    Returns:
        The same DataFrame object with the column rewritten.
    """
    description_column = 'Module Description '
    cleaned = df[description_column].apply(process_text)
    df[description_column] = cleaned
    return df

# Turn the preprocessed descriptions into a matrix of tf-idf scores
def generate_tfidf_matrix(df):
    """Fit a default TfidfVectorizer on the descriptions and transform them.

    The vectorizer itself is discarded, so the learned vocabulary cannot be
    recovered from the return value.

    Args:
        df: DataFrame whose 'Module Description ' column holds the text to
            vectorize, one document per row.

    Returns:
        The result of ``TfidfVectorizer().fit_transform`` on that column,
        with one row per row of ``df``.
    """
    descriptions = df['Module Description ']
    return TfidfVectorizer().fit_transform(descriptions)

# Compare every row of the tf-idf matrix with every other row
def calculate_cosine_similarity(X):
    """Return the pairwise cosine similarities between the rows of ``X``.

    Args:
        X: Feature matrix with one row per document, e.g. the output of
            ``generate_tfidf_matrix``.

    Returns:
        Whatever ``cosine_similarity(X)`` yields: entry ``[i][j]`` is the
        similarity between row ``i`` and row ``j`` of ``X``.
    """
    return cosine_similarity(X)

def get_label_name(index, df, university="SIT"):
    """Build the display label '<university> | <code>: <title>' for one module.

    Args:
        index: Index *label* of the row (looked up with ``df.loc``), not its
            position. The two only coincide when ``df`` has a default
            0..n-1 index.
        df: DataFrame with 'Module Code' and 'Module Title' columns.
        university: Prefix placed in front of the module code. Defaults to
            "SIT", the value that used to be hard-coded.

    Returns:
        The formatted label string.

    Raises:
        KeyError: If ``index`` is not a label in ``df.index`` or one of the
            two columns is missing.
    """
    module_code = df.loc[index, 'Module Code']
    module_title = df.loc[index, 'Module Title']
    return f"{university} | {module_code}: {module_title}"


# SIT
# Load the SIT module list, vectorize the descriptions and print every pair of
# modules whose descriptions are more similar than the threshold.
df = read_dataframe("/Users/micolechan/Desktop/dsa3101/project/dsa3101-2220-12-ds/Data/university_courses/SIT_Module_Info.csv")
# Dropping rows leaves gaps in the index labels. Renumber them so that the
# positional i/j used for cosine_sim below are also valid labels for the
# label-based lookup in get_label_name; otherwise a dropped row causes a
# KeyError or labels the wrong module.
df = drop_missing_descriptions(df).reset_index(drop=True)
df = preprocess_descriptions(df)
X = generate_tfidf_matrix(df)
cosine_sim = calculate_cosine_similarity(X)

# set the threshold value
threshold = 0.2

# Visit each unordered pair once (j > i); this also skips the diagonal, where
# every module is compared with itself.
for i in range(len(df)):
    for j in range(i + 1, len(df)):
        score = cosine_sim[i][j]
        if score > threshold:
            module1 = get_label_name(i, df)
            module2 = get_label_name(j, df)
            print(f"{module1}\n{module2}\nSimilarity Score: {score}\n")


SIT | CSC1002: Mathematics 1
SIT | CSC1006: Mathematics 2
Similarity Score: 0.3727960633771082

SIT | CSC1009: Object-Oriented Programming
SIT | ICT2101: Introduction to Software Engineering
Similarity Score: 0.2512909036495344

SIT | DCM1112: Data Engineering and Visualization
SIT | CSC3005: Data Analytics 
Similarity Score: 0.21089145769291798

SIT | ICT2101: Introduction to Software Engineering
SIT | CSC2003: Embedded Systems Programming
Similarity Score: 0.20728276147501

SIT | ICT2101: Introduction to Software Engineering
SIT | CSC2008: Database Systems
Similarity Score: 0.26345279078708356

SIT | ICT2101: Introduction to Software Engineering
SIT | ICT3211: Integrative Team Project
Similarity Score: 0.3281455452189659

SIT | ICT2101: Introduction to Software Engineering
SIT | ICT4001: Capstone Project
Similarity Score: 0.22261002524153858

SIT | ICT3211: Integrative Team Project
SIT | ICT4001: Capstone Project
Similarity Score: 0.27875147560513236



Unnamed: 0,Module Code,Module Title,Module Category,Module Level,Credit,Department,Module Description,Topics,Prerequisites,Preclusion,Link,Remark
0,CSC1002,Mathematics 1,Mathematics,1.0,6,CSC,first mathemat equip student core mathemat kno...,"Basic logic, functions, relations and sets, se...",,,https://www.singaporetech.edu.sg/modules/mathe...,
1,CSC1001,Introduction to Computer Systems,Computer Science,1.0,6,CSC,intend introductori provid overview differ tau...,This module covers wide variety of topics rang...,,,https://www.singaporetech.edu.sg/modules/intro...,
2,ICT1002,Programming Fundamentals,Computer Science,1.0,6,ICT,foundat provid essenti follow term ict programm .,The topics covered in this module include Intr...,,,https://www.singaporetech.edu.sg/modules/progr...,
3,CSC1006,Mathematics 2,Mathematics,1.0,6,CSC,second mathemat equip student core mathemat kn...,"Topics covered include descriptive statistics,...",CSC1002 or equivalent,,https://www.singaporetech.edu.sg/modules/mathe...,
4,CSC3009,Machine Learning,Data Science,3.0,6,CSC,present student introduct gener theori learn d...,"The main machine learning methods: regression,...",,,https://www.singaporetech.edu.sg/modules/machi...,
5,ICT1008,Data Structures and Algorithms,Computer Science,1.0,6,ICT,introduc fundament concept data structur compl...,"Topics include recursion, fundamental data str...",,,https://www.singaporetech.edu.sg/modules/data-...,
6,ICT1004,Web Systems and Technologies,Computer Science,1.0,6,ICT,applic increasingli web-bas . cover essenti we...,"Topics include HTML, CSS, JavaScript for clien...",,,https://www.singaporetech.edu.sg/modules/web-s...,
7,CSC1009,Object-Oriented Programming,Computer Science,1.0,6,CSC,aim introductori enabl student learn basic lan...,The module gives coverage of fundamental algor...,,,https://www.singaporetech.edu.sg/modules/objec...,
8,DCM1112,Data Engineering and Visualization,Data Analytics,1.0,6,DCM,cover fundament visualis quantit data . whilst...,Fundamentals of cisualising quantitative data.,,,https://www.singaporetech.edu.sg/modules/data-...,
9,PHE2019,Ethics and Professional Conducts,Other,2.0,6,PHE,cover ethic engin societi . student case studi...,Engineers in society; Roles and responsibiliti...,,,https://www.singaporetech.edu.sg/modules/ethics,
