In [2]:
import os
import nltk
import pandas as pd
import re

from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Fetch the NLTK resources this cell relies on: the English stop-word list,
# the WordNet database, and the averaged-perceptron tagger model
# (presumably what `nltk.pos_tag` loads; verify against the installed NLTK).
for _nltk_package in ('stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_package)

# Function originally from: https://www.programcreek.com/python/?CodeExample=get%20wordnet%20pos
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single *word*.

    The word is tagged on its own (as a one-element sentence) with
    `nltk.pos_tag`, and only the first letter of the resulting tag is used
    to choose between adjective, noun, verb and adverb.

    Args:
        word: The token to classify.

    Returns:
        One of `wordnet.ADJ`, `wordnet.NOUN`, `wordnet.VERB` or
        `wordnet.ADV`; `wordnet.NOUN` when the tag's first letter is not
        one of J, N, V or R.
    """
    tagged_tokens = nltk.pos_tag([word])
    full_tag = tagged_tokens[0][1]
    tag_initial = full_tag[0].upper()

    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }

    # Anything that is not an adjective, noun, verb or adverb is treated as a noun.
    if tag_initial in pos_by_initial:
        return pos_by_initial[tag_initial]
    return wordnet.NOUN

def load_text_documents(folder_path):
    """Load and lemmatise every ``.txt`` file found under *folder_path*.

    The directory tree is walked recursively. Each file is read as UTF-8,
    split on whitespace, and every token is passed through `simple_stemmer`
    and then `WordNetLemmatizer.lemmatize`, using the part of speech that
    `get_wordnet_pos` reports for the original (un-stemmed) token. The
    processed tokens are re-joined with single spaces.

    Directories and files are visited in sorted order so that the returned
    lists have the same order on every run, regardless of the order in which
    the filesystem happens to list entries.

    Args:
        folder_path: Root directory to search for ``.txt`` files.

    Returns:
        A tuple ``(document_texts, document_labels)`` of two parallel lists:
        the processed text of each document, and the file name of each
        document without its ``.txt`` suffix. Both lists are empty when no
        ``.txt`` file is found.
    """
    suffix = ".txt"

    # Built once per call instead of once per file: none of these depend on
    # the file being processed.
    trailing_s = re.compile(r's\b')
    lemmatizer = WordNetLemmatizer()

    def simple_stemmer(word):
        # Stemming rule from Week 3, meant to merge singular and plural forms.
        # NOTE(review): it removes every "s" that sits at a word boundary, so
        # non-plurals are truncated too (e.g. "this" -> "thi"); confirm this
        # is acceptable before relying on the stems.
        return trailing_s.sub('', word)

    document_texts = []
    document_labels = []

    for root, dirs, files in os.walk(folder_path):
        # Sorting `dirs` in place makes os.walk descend in a fixed order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(suffix):
                continue

            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                text = f.read()

            lemmatized_text = " ".join(
                lemmatizer.lemmatize(simple_stemmer(word), get_wordnet_pos(word))
                for word in text.split()
            )
            document_texts.append(lemmatized_text)
            # `file` is already a bare file name, so only the suffix is dropped.
            document_labels.append(file[:-len(suffix)])

    return document_texts, document_labels

# an lda_analysis function is defined to perform LDA topic modeling analysis on folders of different colors separately
def lda_analysis(folder_path, num_topics=5, num_terms=10):
    """Fit an LDA topic model on the documents in *folder_path* and print it.

    The documents are loaded with `load_text_documents`, digits are removed,
    and a unigram bag-of-words matrix is built with `CountVectorizer` using
    NLTK's English stop words plus a hand-tuned list of corpus-specific
    words. A `LatentDirichletAllocation` model (batch learning,
    ``random_state=123``) is fitted on that matrix. For each topic, the
    *num_terms* highest-weighted terms are printed in ascending order of
    weight, each with its weight rounded to two decimal places.

    Args:
        folder_path: Directory whose ``.txt`` files form the corpus.
        num_topics: Number of LDA topics to fit.
        num_terms: Number of top terms to print per topic.

    Returns:
        None. Results are written to standard output. If the folder yields
        no documents, a notice is printed and nothing is fitted.
    """
    document_texts, _ = load_text_documents(folder_path)
    if not document_texts:
        # Without this guard the vectorizer fails on an empty corpus.
        print(f"No .txt documents found in {folder_path}; skipping LDA analysis.")
        return

    # Adjusted iteratively: high-frequency words that offer little value in
    # topic analysis. Several entries are truncated forms (e.g. 'thi', 'wa').
    english_stop_words = stopwords.words('english')
    wikipedia_text_stop_words = [
        'refer', 'may', 'often', 'also', 'refers', 'one', 'use', 'set', 'thus', 'include',
        'game', 'flag', 'ir', 'could', 'century', 'example', 'match', 'large', 'call',
        'make', 'th', 'bc', 'nm', 'many', 'cause', 'well', 'form', 'first', 'well',
        'light', 'name', 'team', 'colour', 'become', 'since',
        'ipl', 'two', 'afl', 'final', 'primary', 'led', 'arc', 'leds', 'around', 'de',
        'time', 'people', 'year', 'ex', 'range', 'play', 'sir', 'state', 'century',
        'ft', 'found', 'von', 'ha', 'wa', 'thi', 'hi', 'early', 'day', 'new', 'la', 'au',
        'work', 'ga', 'color', 'art', 'history', 'world', 'nm', 'bce', 'th',
        'red', 'orange', 'yellow', 'green', 'cyan', 'blue', 'purple', 'black', 'white', 'pink',
    ]
    stop_words = english_stop_words + wikipedia_text_stop_words

    # Earlier runs produced many numeric "terms", so every digit character is
    # stripped. Ref: https://stackoverflow.com/questions/12851791/removing-numbers-from-string
    document_texts = [
        ''.join(ch for ch in text if not ch.isdigit()) for text in document_texts
    ]

    # Bag-of-words (unigram count) features for each document.
    vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=(1, 1))
    bag_of_words = vectorizer.fit_transform(document_texts)
    vocab = vectorizer.get_feature_names_out()

    # One column label per topic in the LDA model.
    labels = ['topic{}'.format(i) for i in range(num_topics)]

    # Only the fitted topic-term weights are needed, so the per-document
    # topic distribution is not computed.
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=123, learning_method='batch')
    lda.fit(bag_of_words)

    # Rows are vocabulary terms, columns are topics.
    topic_weights = pd.DataFrame(lda.components_.T, index=vocab, columns=labels)

    for i, label in enumerate(labels):
        print("___topic " + str(i) + "___")
        # Ascending sort, then keep the last `num_terms` rows: the strongest
        # terms, printed weakest-first. `tail(0)` yields nothing, whereas a
        # `[-0:]` slice would yield every term.
        top_terms = topic_weights[label].sort_values().tail(num_terms)
        for word, weight in top_terms.items():
            print(f"{word}: {weight:.2f}")
    
# Colours to analyse; each name is also the sub-folder of `data_folder`
# that holds that colour's documents.
colors = [
    "red", "orange", "yellow", "green", "cyan",
    "blue", "purple", "black", "white", "pink",
]

# Root directory containing one sub-folder per colour.
data_folder = "/Users/yinshuodi/Desktop/mini-project-23006440/Webcrawling/code&data/data/my-data"

# Run a separate LDA analysis for every colour folder.
for color in colors:
    print(f"LDA Analysis for {color} documents:")
    # Path to the folder holding the current colour's documents.
    folder_path = os.path.join(data_folder, color)
    lda_analysis(folder_path)
    # Blank lines between reports for readability.
    print("\n")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yinshuodi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yinshuodi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/yinshuodi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


LDA Analysis for red documents:
___topic 0___
person: 65.63
expression: 67.78
rome: 70.99
emotional: 72.18
social: 82.93
theory: 90.72
roman: 129.40
war: 175.00
anger: 314.19
emotion: 346.10
___topic 1___
goal: 52.25
psychologist: 52.46
research: 55.62
chariot: 71.67
performance: 93.83
ferrari: 114.20
psychology: 152.14
race: 165.48
athlete: 177.20
sport: 309.06
___topic 2___
area: 68.55
system: 77.41
classic: 80.12
power: 87.50
high: 93.99
period: 108.20
chinese: 146.52
maya: 302.20
laser: 305.20
china: 381.92
___topic 3___
vermilion: 61.20
national: 64.19
surface: 67.61
association: 69.13
player: 73.39
planet: 74.92
scarlet: 91.61
earth: 94.22
football: 114.20
mar: 230.10
___topic 4___
body: 58.27
star: 62.20
thermodynamic: 77.20
culture: 83.73
dwarf: 84.20
temperature: 92.04
system: 93.12
transfer: 101.20
energy: 118.43
heat: 206.06


LDA Analysis for orange documents:
___topic 0___
show: 41.66
theory: 42.12
cup: 44.20
pigment: 45.17
produce: 46.17
study: 47.20
different: 48.54
pump