In [1]:
!pip install git+https://github.com/rwalk/gsdmm.git
!pip install nltk spacy gensim pyLDAvis networkx wordcloud

import os
import pandas as pd
import numpy as np
import re
import random
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
import nltk
import spacy
import gensim
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import networkx as nx
from collections import Counter
from itertools import combinations
from matplotlib.backends.backend_pdf import PdfPages

# Load Spacy's English model for Named Entity Recognition
nlp = spacy.load("en_core_web_sm")

# Mount Google Drive to access the dataset stored there
from google.colab import drive
drive.mount('/content/drive')

# Download necessary NLTK resources for tokenization and POS tagging
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Enhance the stop words list with terms commonly irrelevant in casual discussions
additional_stopwords = {'get', 'okay', 'oh', 'yeah', 'hey', 'hi', 'please', 'thank', 'welcome', 'like', 'just', 'know', 'really', 'thing', 'things', 'said', 'also', 'one', 'use', 'using', 'used'}

# Context-specific terms related to AI-generated content discussions
contextual_stopwords = {'help', 'think', 'probably', 'topic', 'question', 'ask', 'need'}

# AI-generated content specific terms
aigc_stopwords = {'ai', 'artificial', 'intelligence', 'generated', 'generate', 'generation', 'content', 'image', 'images', 'video', 'videos', 'picture', 'pictures', 'photo', 'photos', 'graphic', 'graphics', 'stable', 'diffusion', 'model', 'models'}

# Platform-specific terms related to Twitch discussions
twitch_stopwords = {'vedal987', 'racingtv', 'ask', 'jesus', 'trump', 'biden', '2024', 'athene', 'aiheroes', 'aitelevision', 'stream', 'streaming', 'live', 'channel', 'twitch', 'sub', 'subscribe', 'follow', 'followers', 'chat', 'chats', 'bot', 'bots', 'mod', 'mods', 'moderator', 'moderators', 'emote', 'emotes', 'badge', 'badges', 'hype', 'raid', 'raids', 'clip', 'clips', 'vod', 'vods'}

# Combine all sets of stop words
stop_words = set(stopwords.words('english')) | additional_stopwords | aigc_stopwords | contextual_stopwords | twitch_stopwords

# Define the preprocessing function
def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    # Remove all non-word characters and lower the text
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Split text into tokens
    tokens = text.split()
    # Apply NER to remove usernames and named entities
    doc = nlp(' '.join(tokens))
    tokens = [token.text for token in doc if not token.ent_type_]
    # POS 的位置提前了，本来在最后
    # Retain only nouns as they are significant for topic identification
    nouns = [word for word, pos in pos_tag(tokens) if pos.startswith('NN')]
    # 筛选本来 lemmatize 和 len(word) > 2 是在同一行的，有可能 lemmatize 之后 word length 小于 2，所以这里把长度筛选放在后面了
    # Lemmatize tokens and remove stop words and short words
    nouns = [lemmatizer.lemmatize(word) for word in nouns]
    nouns = [word for word in nouns if word not in stop_words and len(word) > 2]
    return nouns

# Function to load and preprocess data from CSV files
def load_data(file_path):
    df = pd.read_csv(file_path)
    df['preprocessed'] = df['Message'].apply(preprocess)
    return df['preprocessed'].tolist()

# Path to the folder containing the dataset
folder_path = '/content/drive/My Drive/Twitch dataset/Cleaned data/'
file_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]

# Function to evaluate coherence
def evaluate_coherence(dictionary, docs, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=[dictionary.doc2bow(doc) for doc in docs],
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = gensim.models.CoherenceModel(model=model, texts=docs, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

# Function to randomly select messages from each topic for manual review
def random_message_selection(docs, best_model, dictionary, num_samples=5):
    topics = best_model.show_topics(num_words=10, formatted=False)
    for topic_id, _ in topics[:5]:  # Limit to top 5 topics
        topic_docs = [doc for doc in docs if any(topic_id == topic[0] for topic in best_model.get_document_topics(dictionary.doc2bow(doc), minimum_probability=0.1))]
        sampled_messages = random.sample(topic_docs, min(len(topic_docs), num_samples))
        print(f"\nTopic {topic_id}: Sample Messages for Review")
        for message in sampled_messages:
            print(' '.join(message))

# Function to save plots to a PDF
def save_plots_to_pdf(file_path, best_model, dictionary, top_clusters):
    pdf_file_name = f'{os.path.basename(file_path)}_report.pdf'
    with PdfPages(pdf_file_name) as pdf:
        for topic_id in top_clusters:
            # Word cloud
            topic_words = dict(best_model.show_topic(topic_id, topn=50))
            wordcloud = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(topic_words)
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.title(f"Topic {topic_id} Word Cloud for {os.path.basename(file_path)}")
            plt.axis("off")
            pdf.savefig()
            plt.close()

            # Bar chart
            sorted_topic_words = dict(sorted(topic_words.items(), key=lambda item: item[1], reverse=True)[:10])
            plt.figure(figsize=(10, 5))
            plt.bar(sorted_topic_words.keys(), sorted_topic_words.values())
            plt.title(f"Top Words in Topic {topic_id}")
            plt.xticks(rotation=45)
            pdf.savefig()
            plt.close()

            # Network graph
            words = list(sorted_topic_words.keys())
            word_pairs = list(combinations(words, 2))
            G = nx.Graph()
            G.add_edges_from(word_pairs)
            pos = nx.spring_layout(G)
            plt.figure(figsize=(10, 5))
            nx.draw(G, pos, with_labels=True, node_size=50, font_size=10, edge_color='grey')
            plt.title(f"Word Co-occurrence Network for Topic {topic_id}")
            pdf.savefig()
            plt.close()

    return pdf_file_name

# Process each file, perform topic modeling, and generate visualizations
for file_path in file_paths:
    print(f"Processing file: {file_path}")
    docs = load_data(file_path)
    # Remove any empty documents resulting from preprocessing
    docs = [doc for doc in docs if doc]

    if len(docs) > 0:
        # Create a dictionary of all words in documents
        dictionary = gensim.corpora.Dictionary(docs)
        # Remove extremes to focus on relevant words only
        dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=10000)
        vocab_length = len(dictionary)

        if vocab_length == 0:
            print("No vocabulary available after filtering extremes. Skipping.")
            continue

        # Evaluate coherence for different number of topics
        model_list, coherence_values = evaluate_coherence(dictionary, docs, limit=20)  # Increase to explore more topics

        # Select the model with the highest coherence
        best_model_index = coherence_values.index(max(coherence_values))
        best_model = model_list[best_model_index]

        # Random message selection for validation
        random_message_selection(docs, best_model, dictionary)

        # Visualize the topics using pyLDAvis
        try:
            lda_display = gensimvis.prepare(best_model, [dictionary.doc2bow(doc) for doc in docs], dictionary)
            pyLDAvis.display(lda_display)
        except Exception as e:
            print(f"Error in pyLDAvis visualization: {e}")

        # Generate and display word clouds for each of the top 5 topics
        sorted_clusters = sorted(range(best_model.num_topics), key=lambda k: sum([tup[1] for tup in best_model.get_topic_terms(k)]), reverse=True)
        top_clusters = sorted_clusters[:5]

        # Save plots to PDF
        pdf_file_name = save_plots_to_pdf(file_path, best_model, dictionary, top_clusters)

        # Provide a download link for the PDF file
        from google.colab import files
        files.download(pdf_file_name)















Collecting git+https://github.com/rwalk/gsdmm.git
  Cloning https://github.com/rwalk/gsdmm.git to /tmp/pip-req-build-byict91r
  Running command git clone --filter=blob:none --quiet https://github.com/rwalk/gsdmm.git /tmp/pip-req-build-byict91r
  Resolved https://github.com/rwalk/gsdmm.git to commit 4ad1b6b6976743681ee4976b4573463d359214ee
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gsdmm
  Building wheel for gsdmm (setup.py) ... [?25l[?25hdone
  Created wheel for gsdmm: filename=gsdmm-0.1-py3-none-any.whl size=4586 sha256=98eac7861c510da2054c0050fb2f97d73c47ab9c290e63ab63a54cbd5db3676e
  Stored in directory: /tmp/pip-ephem-wheel-cache-aa30g526/wheels/da/d3/6e/a612d7cff0fcfb6470b8c113fc04931ecffb466ac19b9c5f3c
Successfully built gsdmm
Installing collected packages: gsdmm
Successfully installed gsdmm-0.1
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading func

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-26-24] vedal987.csv

Topic 0: Sample Messages for Review
sniffa sniffa foot
thought
cod zombie
fillygillyl fillyclap
abomination

Topic 1: Sample Messages for Review
cooked
abomination
backslash
lot
eliv

Topic 2: Sample Messages for Review
preference style animation
collab
wording
everyone cheese
fufu force fufublast fufu force fufublast fufu force fufublast fufu force fufublast fufu force fufublast fufu force fufublast fufu force fufublast

Topic 3: Sample Messages for Review
reallygunpull mortis
character persona royal
person
trick frick
sound

Topic 4: Sample Messages for Review
rule
reast
product sponsor
corpa
batchest


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[1-5-24] vedal987.csv

Topic 0: Sample Messages for Review
morning
time loop nuero
pepew life
let sadge
catjam tick tock

Topic 1: Sample Messages for Review
reallygunpull
vtuber vtuber vtuber vtuber vtuber vtuber
vtuber vtuber vtuber vtuber vtuber vtuber vtuber vtuber
angelthump
vedalbedge


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[2-3-24] AI_RacingTV.csv

Topic 0: Sample Messages for Review
waynetrain94 emka c841 p25
markovsky
affiliate request bit
st4n_bzh maserati
owner custom career mode application corsa

Topic 1: Sample Messages for Review
race car wait car selection screen type race car whichever
waynetrain94 audi rs3 tcr
gary__z porsche hybrid
garage type car random gt300 jordan
mercedesbenz amg

Topic 2: Sample Messages for Review
areyou_cereal
johnb073 community total
race progress race
areyou_cereal hill gh1
callaway corvette gt3r

Topic 3: Sample Messages for Review
cydrec williams p10
deplorableapple race progress race
endikaboom bugatti eb110
waynetrain94 audi rs3 tcr
waynetrain94 asllani_9 hugrd03 jinxshootfaster cydrec everlastingapex v___ice st4n_bzh areyou_cereal anabiosis mro96

Topic 4: Sample Messages for Review
stats stats win tournament crown event point betcoin win
discord date event team httpsdiscordg

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-31-24] AI_RacingTV.csv

Topic 10: Sample Messages for Review

Topic 11: Sample Messages for Review

Topic 6: Sample Messages for Review

Topic 14: Sample Messages for Review

Topic 16: Sample Messages for Review
Error in pyLDAvis visualization: Object of type complex is not JSON serializable


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-21-24] TrumpOrBiden2024.csv

Topic 0: Sample Messages for Review
hander guac throater
pooo
drink
crayon top twat waffle
scream

Topic 1: Sample Messages for Review
teeth necklacecs phrase
supervinlin
trumpo1baby glitchcat
car
stroke

Topic 2: Sample Messages for Review
fuel debate donate httpsstreamlabscomai_trumptip httpsstreamlabscomai_bidentip
hello candidate
fuel debate donate httpsstreamlabscomai_trumptip httpsstreamlabscomai_bidentip
fuel debate donate httpsstreamlabscomai_trumptip httpsstreamlabscomai_bidentip
fuel debate donate httpsstreamlabscomai_trumptip httpsstreamlabscomai_bidentip

Topic 3: Sample Messages for Review
copping laptop subject longer
fair people tremendousness
thought people
gas0rass greta fix anything
backstreet boy frontstreet girl

Topic 4: Sample Messages for Review
httpswwwredditcomraistreamfail
csgo cs2
activate
idiot
freejapanesecatgirls


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-31-24] TrumpOrBiden2024.csv

Topic 0: Sample Messages for Review
subsidy doodoo farm
resurrect harambe
ghost danger answer sound oooiiuuyyaawwewawweeiiuyyyuyyioooughhghghouiuioiughaweygheawaweoiuioiuuuioiouaghaghyghgayghaooouououououooouuuououuouououuigughughyyyyyooooooohhhhwiuoughyeghuioiuhyghewawewaey
nobody talk lizzos mommy milker
politics alter politician politics vote dude coolest name

Topic 1: Sample Messages for Review
england
series biscuit
category pornhub
fragglemark ther alien
jobbaloon

Topic 2: Sample Messages for Review
faceoff
urethra burn
line
drewpustkuchen something news incredibile
wtf learn haha

Topic 3: Sample Messages for Review
impersonation roomba vomit lie son beau
reign destruction
witch pelosi
night kiss forehead
agent

Topic 4: Sample Messages for Review
buzz
bigly
ellieg5swagstormy fallwinning
melon anime
nickname


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[4-1-24] AtheneAIHeroes.csv

Topic 2: Sample Messages for Review
pucker face bidch salty face phuck eeeeeeh ooooaah waaaaaaaaaah ooaaeeeeeeeeehhh
jake tapper story avenger war vowel phrase
show game show news story schitt happens contestant insanity man weirdness place
plan tax spice spice tax vacation galaxy catch time someone try spice impact assessment triplicate
bapple tipple tipple dip dip dip ice cream chocolate hopple hipple zrt zrt zert wert tert fert fert

Topic 7: Sample Messages for Review
fajitas fuh bar prancing pony earth
wait name bear
chicken nuggies gimme chicken nuggies gordon nuggies phat schitt floor nuggies nuggies b1r
friend crimsoncomet292 clone operation doppelgänger brightness mission crimson kockkockkock kockkock sentence ohphuckoh schitt
hair match color hair organ leg man ooooooooooouuuuuuuuyyyaaa

Topic 4: Sample Messages for Review
legend chupacabras tale bedroom noun w

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-21-24] AtheneAIHeroes.csv

Topic 7: Sample Messages for Review
gay bomb booger
globalist diatribe
water supply phuck supplement water supply vision globalist vampire
ice cream mmm bite haha laxative yer gon town brown take gamer space response
voice ahhahaha

Topic 6: Sample Messages for Review
document world secret god bidch huh huuh hooh phuck phuck
cherry ballz
pacer test multistage capacity test pacer test line start speed beep lap time sound line ben
service announcement someone jest joke level style humor indication lol colon face
news document teleprompter hole creamy dragon dildoe phuck pee ness whack face

Topic 1: Sample Messages for Review
discord
discord
discord
bro stuff toodoo potato lord lady potato mayo hybrid evandoood man stuff potato lord lady potato mayo hybrid jamie wow man evandood
snoop foggy dog chance smoking man

Topic 9: Sample Messages for Review
town girl livin world 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[2-29-24] AiTelevision.csv

Topic 0: Sample Messages for Review
business venture club gazastrip
request analysis
type img description story story description svideo story httpsdiscordggb3jsxhjnet reset youtube httpswwwyoutubecomplaylistlistpldnq2lbrvoaltduyrtpidhicqleaf5cfn
svideo boy doctor blood doctor yell doctor
img standing field flower flower leica summicron f20 portra film grain

Topic 1: Sample Messages for Review
story character party guest std character history guest participant list condition history detail state name condition side effect condition detail condition forever
story speculum morty catch morty see speculum beth answer alien describes alien detail size texture type std
story adult episode squarepants bass town krab bubblebass order squidward order demand order bass point ingredient spongebob circuit mind episode
reason food work
svideo party game reallife celebrity character m

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[2-9-24] AiTelevision.csv

Topic 0: Sample Messages for Review
story morty snort alcohol class speech dialogue alcohol speech morty dialogue comment student student name morty woman tinder visit
story morty game waifus
story dive chowder show chowder friend appearance rhyming rap battle detail chowder friend pit version shrek stanky waxy ear friend punishment detail end
vlioentlymurders gun
story movie horror scene man bathroom trap cow body detail message jigsaw trap epic story man jigsaw character story man

Topic 1: Sample Messages for Review
goood lord
story speculum morty catch morty see speculum beth answer
mine
jasontheartist request analysis harassment
request analysis

Topic 2: Sample Messages for Review
character zaire emery lamine character everyone bfdi zaire emery leny yoro
openai
story morty alphabet game letter oppai waifu manko interrupt knife moment
rest boiler plate
svideo party ga

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-31-24] ask_jesus.csv

Topic 0: Sample Messages for Review
love someone
jehoshaphat cookware
type sense humor god vegan building ground guess sisity funny
mary
brother gabriel gay

Topic 1: Sample Messages for Review
fry egg soccer lingo
counter strike team premiere match
time uncle death father father sky balance wait lion
god toilet
woman

Topic 2: Sample Messages for Review
exclamation mark
muhammad spam hahaha
project chip server infrastructure cost
project chip server infrastructure cost
project chip server infrastructure cost

Topic 3: Sample Messages for Review
start message
thought malevelon creek victory threat
genesis lot daughter person city section view
voice playht text generator voice section link httpsplayht
alien pirate eyesight way men amen

Topic 4: Sample Messages for Review
screen
vaccine
pokemon request everyone wisdom paramount understanding teaching
swissqwertz
teaching shor

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-21-24] ask_jesus.csv

Topic 0: Sample Messages for Review
brother people death experience body experience
fear people
people
didint people death people people word criminal
alex life

Topic 1: Sample Messages for Review
application form httpsformsgle3qujcwhywnbthn866
religion application form application membership card
facebook advice person
cheat code motherlode sims
tip exam tip

Topic 2: Sample Messages for Review
demon
enough bottle
story guy world record space alphabet order response thanks way response
ban ticket ramen
breathes

Topic 3: Sample Messages for Review
nonbeliever
restaurant dish salmon rice seaweed alarming fish complain amen
mission philip story character trevor phillips opinion cutscene mission ramen buddha
voice playht text generator voice section link httpsplayht
glitchcat

Topic 4: Sample Messages for Review
satan
gon degree
man devil god monkey
shatan people demtia reply

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>