In [1]:
import os
import nltk
import pandas as pd 
import re

from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Fetch the NLTK resources this script relies on: the English stop-word list,
# the WordNet database, and the averaged-perceptron tagger used by nltk.pos_tag.
for _nltk_resource in ('stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)

# Function originally from: https://www.programcreek.com/python/?CodeExample=get%20wordnet%20pos
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag (upper-cased) decides the outcome.

    Args:
        word: The token to tag.

    Returns:
        ``wordnet.ADJ`` for "J", ``wordnet.VERB`` for "V", ``wordnet.ADV`` for
        "R", and ``wordnet.NOUN`` for "N" or any other first letter.
    """
    _, pos_tag = nltk.pos_tag([word])[0]
    first_letter = pos_tag[0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both fall back to noun.
    return wordnet.NOUN

def load_text_documents(folder_path):
    """Load and lemmatize every ``.txt`` file found under *folder_path*.

    The folder is walked recursively with ``os.walk``. Each file is read as
    UTF-8, split on whitespace, and every token is passed through
    ``simple_stemmer`` and then ``WordNetLemmatizer.lemmatize`` using the
    part of speech reported by ``get_wordnet_pos`` for the *original* token.
    The processed tokens are re-joined with single spaces.

    Args:
        folder_path: Directory to search for ``.txt`` files.

    Returns:
        A ``(document_texts, document_labels)`` tuple of two parallel lists:
        the lemmatized text of each file, and each file's name without its
        ``.txt`` extension.
    """
    # Stemming rule carried over from Week 3: drop an "s" that sits at a word
    # boundary so singular and plural forms are counted as the same word.
    # NOTE: the rule is deliberately crude and also trims non-plurals
    # (e.g. "this" -> "thi"); downstream stop-word lists account for that.
    # The pattern is compiled once here instead of being looked up per word.
    stemming_rules = [(re.compile(r's\b'), '')]

    def simple_stemmer(word):
        for pattern, replacement in stemming_rules:
            word = pattern.sub(replacement, word)
        return word

    # Built once for the whole run rather than once per file.
    lemmatizer = WordNetLemmatizer()

    # POS tagging a single token is by far the slowest step, and the result
    # for a given token never changes, so remember each token's final lemma.
    lemma_cache = {}

    def lemmatize_token(word):
        try:
            return lemma_cache[word]
        except KeyError:
            lemma = lemmatizer.lemmatize(simple_stemmer(word), get_wordnet_pos(word))
            lemma_cache[word] = lemma
            return lemma

    document_texts = []
    document_labels = []

    for root, _, files in os.walk(folder_path):
        for filename in files:
            if not filename.endswith(".txt"):
                continue

            with open(os.path.join(root, filename), 'r', encoding='utf-8') as f:
                text = f.read()

            document_texts.append(" ".join(lemmatize_token(word) for word in text.split()))
            # Label is the file name with the 4-character ".txt" suffix removed.
            document_labels.append(filename[:-4])

    return document_texts, document_labels

# lsa_analysis performs LSA topic modeling on a single folder of documents, so
# that each color's folder can be analysed separately.
def lsa_analysis(folder_path, num_topics=5, num_terms=10, random_state=None):
    """Run LSA on the documents in *folder_path* and print the top terms per topic.

    Documents are loaded with ``load_text_documents``, stripped of digit
    characters, vectorized with TF-IDF (unigrams only, with NLTK English stop
    words plus a hand-tuned extra list removed) and decomposed with
    ``TruncatedSVD``. For every topic the ``num_terms`` highest-weighted terms
    are printed in ascending order of weight, rounded to two decimals.

    Args:
        folder_path: Directory containing the ``.txt`` documents to analyse.
        num_topics: Number of SVD components (topics) to extract.
        num_terms: Number of top-weighted terms to print for each topic.
        random_state: Forwarded to ``TruncatedSVD``; pass an int to get the
            same topics on every run. The default ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        None. Results are written to standard output.
    """
    document_texts, _ = load_text_documents(folder_path)
    if not document_texts:
        # The vectorizer raises on an empty corpus; report and skip instead.
        print(f"No .txt documents found in {folder_path}; skipping LSA analysis.")
        return

    # wikipedia_text_stop_words was adjusted iteratively to eliminate
    # high-frequency words that offer little value in topic analysis. It also
    # lists truncated forms such as 'ha', 'wa', 'thi' produced by the stemmer.
    english_stop_words = stopwords.words('english')
    wikipedia_text_stop_words = [
        'refer', 'may', 'often', 'also', 'refers', 'one', 'use', 'set', 'thus', 'include', 'game', 'flag', 'ir', 'could', 'century', 'example', 'match', 'large', 'call',
        'make', 'th', 'bc', 'nm', 'many', 'cause', 'well', 'form', 'first', 'well', 'light', 'name', 'team', 'colour', 'become', 'since',
        'ipl', 'two', 'afl', 'final', 'primary', 'led', 'arc', 'leds', 'around', 'de', 'time', 'people', 'year', 'ex', 'range', 'play', 'sir', 'state', 'century',
        'ft', 'found', 'von', 'ha', 'wa', 'thi', 'hi', 'early', 'day', 'new', 'la', 'au', 'work', 'ga', 'color', 'art', 'history', 'world', 'nm', 'bce', 'th', 'red', 'orange', 'yellow', 'green', 'cyan', 'blue', 'purple', 'black', 'white', 'pink',
    ]
    stop_words = english_stop_words + wikipedia_text_stop_words

    # Earlier runs surfaced many numbers as terms, so digit characters are removed.
    # Reference: https://stackoverflow.com/questions/12851791/removing-numbers-from-string
    document_texts = [''.join(ch for ch in text if not ch.isdigit()) for text in document_texts]

    # Use TfidfVectorizer to get TF-IDF weighted bag-of-words features per document.
    vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 1))
    tf_idf = vectorizer.fit_transform(document_texts)
    vocab = vectorizer.get_feature_names_out()

    # One label per topic in the LSA model.
    labels = [f'topic{i}' for i in range(num_topics)]

    # n_iter can be changed: higher numbers take longer but may (or may not)
    # give better results. The sparse TF-IDF matrix is passed directly, since
    # TruncatedSVD accepts sparse input and no dense copy is needed.
    svd = TruncatedSVD(n_components=num_topics, n_iter=100, random_state=random_state)
    svd.fit(tf_idf)

    # Rows are vocabulary terms, columns are topics.
    topic_weights = pd.DataFrame(svd.components_.T, index=vocab, columns=labels)

    for topic_index, label in enumerate(labels):
        print(f"___topic {topic_index}___")
        # Ascending sort, then keep the last num_terms rows: the heaviest
        # terms, with the single heaviest printed last.
        top_terms = topic_weights[label].sort_values().iloc[-num_terms:]
        for word, weight in top_terms.items():
            print(f"{word}: {weight:.2f}")
    
# Colors to analyze; each name is also the sub-folder joined onto data_folder.
colors = [
    "red", "orange", "yellow", "green", "cyan",
    "blue", "purple", "black", "white", "pink",
]
# Root directory containing the data.
data_folder = "/Users/yinshuodi/Desktop/mini-project-23006440/Webcrawling/code&data/data/my-data"

# Run the LSA analysis once per color.
for color in colors:
    print(f"LSA Analysis for {color} documents:")
    # Folder for the current color: <data_folder>/<color>
    folder_path = os.path.join(data_folder, color)
    lsa_analysis(folder_path)
    # Separate consecutive reports with blank lines for readability.
    print("\n")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yinshuodi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yinshuodi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/yinshuodi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


LSA Analysis for red documents:
___topic 0___
maya: 0.09
culture: 0.10
sport: 0.10
renaissance: 0.10
laser: 0.11
emotion: 0.12
scarlet: 0.13
war: 0.14
china: 0.15
roman: 0.15
___topic 1___
behavior: 0.08
rgb: 0.08
brand: 0.08
emotional: 0.10
commentator: 0.10
athlete: 0.15
sport: 0.19
psychology: 0.20
anger: 0.28
emotion: 0.39
___topic 2___
dwarf: 0.13
spot: 0.14
sunrise: 0.15
cmyk: 0.15
wavelength: 0.15
mar: 0.15
opsin: 0.16
ink: 0.17
laser: 0.24
rgb: 0.24
___topic 3___
pigment: 0.09
vermilion: 0.09
rgb: 0.09
football: 0.11
commentator: 0.12
race: 0.12
sport: 0.13
ferrari: 0.13
laser: 0.16
scarlet: 0.32
___topic 4___
ink: 0.09
painter: 0.10
rgb: 0.11
painting: 0.11
anger: 0.12
vermilion: 0.14
artist: 0.16
emotion: 0.16
renaissance: 0.20
scarlet: 0.20


LSA Analysis for orange documents:
___topic 0___
produce: 0.09
spice: 0.10
orpiment: 0.10
egyptian: 0.11
pumpkin: 0.11
india: 0.11
national: 0.13
carrot: 0.13
rgb: 0.14
pigment: 0.15
___topic 1___
mixture: 0.09
combination: 0.10
boldfac