In [1]:
!pip install git+https://github.com/rwalk/gsdmm.git
!pip install nltk spacy gensim pyLDAvis networkx wordcloud

import os
import pandas as pd
import numpy as np
import re
import random
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
import nltk
import spacy
import gensim
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import networkx as nx
from collections import Counter
from itertools import combinations
from matplotlib.backends.backend_pdf import PdfPages

# Load Spacy's English model for Named Entity Recognition
nlp = spacy.load("en_core_web_sm")

# Mount Google Drive to access the dataset stored there
from google.colab import drive
drive.mount('/content/drive')

# Download necessary NLTK resources for tokenization and POS tagging
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

general_stopwords = {
    'would', 'could', 'should', 'might', 'must', 'will', 'shall', 'can', 'may', 'wouldn', 'couldn', 'shouldn',
    'mightn', 'mustn', 'won', 'shan', 'can', 'mayn', 'just', 'don', 'didn', 'doesn', 'aren', 'isn', 'wasn',
    'weren', 'hasn', 'haven', 'hadn', 'does', 'did', 'don', 'does', 'did', 'don', 'now', 'then', 'once',
    'after', 'before', 'since', 'during', 'while', 'until', 'ago', 'yet', 'still', 'even', 'ever', 'always',
    'never', 'sometimes', 'often', 'usually', 'again', 'too', 'also', 'only', 'really', 'very', 'much',
    'more', 'most', 'many', 'several', 'few', 'some', 'any', 'each', 'every', 'all', 'both', 'either',
    'neither', 'anyone', 'everyone', 'someone', 'nobody', 'noone', 'nothing', 'anything', 'something',
    'everything', 'another', 'such', 'one', 'two', 'three', 'first', 'second', 'third', 'next', 'last',
    'same', 'other', 'different', 'new', 'old', 'young', 'long', 'short', 'high', 'low', 'large', 'small'
}

# Combine all sets of stop words
stop_words = set(stopwords.words('english')) | general_stopwords

# Define the preprocessing function
def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    # Remove all non-word characters and lower the text
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Split text into tokens
    tokens = text.split()
    # Lemmatize tokens and remove stop words and short words
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 2]
    # Apply NER to remove usernames and named entities
    doc = nlp(' '.join(tokens))
    tokens = [token.text for token in doc if not token.ent_type_]
    # Retain only nouns as they are significant for topic identification
    nouns = [word for word, pos in pos_tag(tokens) if pos.startswith('NN')]
    return nouns

# Function to load and preprocess data from CSV files
def load_data(file_path):
    df = pd.read_csv(file_path)
    df['preprocessed'] = df['Message'].apply(preprocess)
    return df['preprocessed'].tolist()

# Path to the folder containing the dataset
folder_path = '/content/drive/My Drive/Twitch dataset/Cleaned data/'
file_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]

# Function to evaluate coherence
def evaluate_coherence(dictionary, docs, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=[dictionary.doc2bow(doc) for doc in docs],
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = gensim.models.CoherenceModel(model=model, texts=docs, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

# Function to randomly select messages from each topic for manual review
def random_message_selection(docs, best_model, dictionary, num_samples=5):
    topics = best_model.show_topics(num_words=10, formatted=False)
    for topic_id, _ in topics[:5]:  # Limit to top 5 topics
        topic_docs = [doc for doc in docs if any(topic_id == topic[0] for topic in best_model.get_document_topics(dictionary.doc2bow(doc), minimum_probability=0.1))]
        sampled_messages = random.sample(topic_docs, min(len(topic_docs), num_samples))
        print(f"\nTopic {topic_id}: Sample Messages for Review")
        for message in sampled_messages:
            print(' '.join(message))

# Function to save plots to a PDF
def save_plots_to_pdf(file_path, best_model, dictionary, top_clusters):
    pdf_file_name = f'{os.path.basename(file_path)}_report.pdf'
    with PdfPages(pdf_file_name) as pdf:
        for topic_id in top_clusters:
            # Word cloud
            topic_words = dict(best_model.show_topic(topic_id, topn=50))
            wordcloud = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(topic_words)
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.title(f"Topic {topic_id} Word Cloud for {os.path.basename(file_path)}")
            plt.axis("off")
            pdf.savefig()
            plt.close()

            # Bar chart
            sorted_topic_words = dict(sorted(topic_words.items(), key=lambda item: item[1], reverse=True)[:10])
            plt.figure(figsize=(10, 5))
            plt.bar(sorted_topic_words.keys(), sorted_topic_words.values())
            plt.title(f"Top Words in Topic {topic_id}")
            plt.xticks(rotation=45)
            pdf.savefig()
            plt.close()

            # Network graph
            words = list(sorted_topic_words.keys())
            word_pairs = list(combinations(words, 2))
            G = nx.Graph()
            G.add_edges_from(word_pairs)
            pos = nx.spring_layout(G)
            plt.figure(figsize=(10, 5))
            nx.draw(G, pos, with_labels=True, node_size=50, font_size=10, edge_color='grey')
            plt.title(f"Word Co-occurrence Network for Topic {topic_id}")
            pdf.savefig()
            plt.close()

    return pdf_file_name

# Process each file, perform topic modeling, and generate visualizations
for file_path in file_paths:
    print(f"Processing file: {file_path}")
    docs = load_data(file_path)
    # Remove any empty documents resulting from preprocessing
    docs = [doc for doc in docs if doc]

    if len(docs) > 0:
        # Create a dictionary of all words in documents
        dictionary = gensim.corpora.Dictionary(docs)
        # Remove extremes to focus on relevant words only
        dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=10000)
        vocab_length = len(dictionary)

        if vocab_length == 0:
            print("No vocabulary available after filtering extremes. Skipping.")
            continue

        # Evaluate coherence for different number of topics
        model_list, coherence_values = evaluate_coherence(dictionary, docs, limit=20)  # Increase to explore more topics

        # Select the model with the highest coherence
        best_model_index = coherence_values.index(max(coherence_values))
        best_model = model_list[best_model_index]

        # Random message selection for validation
        random_message_selection(docs, best_model, dictionary)

        # Visualize the topics using pyLDAvis
        try:
            lda_display = gensimvis.prepare(best_model, [dictionary.doc2bow(doc) for doc in docs], dictionary)
            pyLDAvis.display(lda_display)
        except Exception as e:
            print(f"Error in pyLDAvis visualization: {e}")

        # Generate and display word clouds for each of the top 5 topics
        sorted_clusters = sorted(range(best_model.num_topics), key=lambda k: sum([tup[1] for tup in best_model.get_topic_terms(k)]), reverse=True)
        top_clusters = sorted_clusters[:5]

        # Save plots to PDF
        pdf_file_name = save_plots_to_pdf(file_path, best_model, dictionary, top_clusters)

        # Provide a download link for the PDF file
        from google.colab import files
        files.download(pdf_file_name)















Collecting git+https://github.com/rwalk/gsdmm.git
  Cloning https://github.com/rwalk/gsdmm.git to /tmp/pip-req-build-_8m6w2ff
  Running command git clone --filter=blob:none --quiet https://github.com/rwalk/gsdmm.git /tmp/pip-req-build-_8m6w2ff
  Resolved https://github.com/rwalk/gsdmm.git to commit 4ad1b6b6976743681ee4976b4573463d359214ee
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gsdmm
  Building wheel for gsdmm (setup.py) ... [?25l[?25hdone
  Created wheel for gsdmm: filename=gsdmm-0.1-py3-none-any.whl size=4586 sha256=a167f6a4b062b848b85353d4c917824011f94c89494b59bd2da8049e443324a9
  Stored in directory: /tmp/pip-ephem-wheel-cache-d3x2cegr/wheels/da/d3/6e/a612d7cff0fcfb6470b8c113fc04931ecffb466ac19b9c5f3c
Successfully built gsdmm
Installing collected packages: gsdmm
Successfully installed gsdmm-0.1
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading func

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-26-24] vedal987.csv





Topic 0: Sample Messages for Review
huh right
caught
kekw mean
mood right
thanks

Topic 1: Sample Messages for Review
ctrl
roman empire
clock dummy
monster
modcheck

Topic 2: Sample Messages for Review
crnkek
hallelujah210 maya601 mod power
shame
fuck
prefer love evil

Topic 3: Sample Messages for Review
gura
evilcry edm evilcry edm evilcry edm evilcry edm evilcry edm evilcry edm evilcry edm evilcry edm evilcry edm evilcry edm evilcry edm evilcry edm
thanks vedalheart love
know
cat eat day

Topic 4: Sample Messages for Review
vedal mosquito
fine
wayyy
nope
grass


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[1-5-24] vedal987.csv

Topic 0: Sample Messages for Review
time call narrator vedal
stay catkiss stay catkiss stay catkiss stay catkiss stay catkiss
reallygunpull smadge reallygunpull smadge reallygunpull smadge reallygunpull smadge reallygunpull smadge reallygunpull smadge
bedge stream
vedal icant

Topic 1: Sample Messages for Review
see
rizzdal rizzdal
hear click
dinodance singsnote dinodance singsnote dinodance
watch stream

Topic 2: Sample Messages for Review
wishlist
pointless
trust
copium
sleepy

Topic 3: Sample Messages for Review
manage
fix gigavedal oshi fix gigavedal oshi fix gigavedal oshi fix gigavedal oshi fix gigavedal oshi fix gigavedal oshi fix gigavedal oshi fix gigavedal
corpa
copium
need

Topic 4: Sample Messages for Review
corpa
corpa
chickenpizzasteak sub gellatingel
chickenpizzasteak sub randomevilact
skip skip skip skip skip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[2-3-24] AI_RacingTV.csv

Topic 10: Sample Messages for Review
cyrenevr random car
cyrenevr random car
cyrenevr gt500 super
cyrenevr random car
cyrenevr dodge challenger hellcat

Topic 0: Sample Messages for Review
garage type car fiat acr street car falcon vintage street car
garage type car falcon vintage street car acr street car
falcon
falcon vintage street car
waynetrain94 falcon gtho car

Topic 15: Sample Messages for Review
hugrd03 pantera gts
garage type car fiat acr street car falcon vintage street car
garage type car jordan blind random oreca acura jordan ej10
p201 type garage look
mercury cougar random type garage look

Topic 3: Sample Messages for Review
pot amalgyte win robokop6000sux win
pot
wowkcb mondeo
pot zygote22 win
pot suzannegreenberg win

Topic 4: Sample Messages for Review
waynetrain94 porsche carrera
waynetrain94 f50
waynetrain94 random car
waynetrain94 lm
waynetrain94 car


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-31-24] AI_RacingTV.csv

Topic 9: Sample Messages for Review

Topic 8: Sample Messages for Review

Topic 14: Sample Messages for Review

Topic 5: Sample Messages for Review

Topic 0: Sample Messages for Review
texas_rangerswot driving
st4n_bzh flm09
st4n_bzh lamborghini murcielago
st4n_bzh camaro
st4n_bzh driving
Error in pyLDAvis visualization: Object of type complex is not JSON serializable


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-21-24] TrumpOrBiden2024.csv


  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)



Topic 8: Sample Messages for Review
fun talk poop eating
chess poop wanna play
biden mouth
trump athene sort thing poop poop trump poop bit byte screw nut rainsins
hole mouth rubber dingdong

Topic 4: Sample Messages for Review
president hop gon lot fun
voting election
donald peek gon world peace day
trump gon wall moon
sleepy gon empire propaganda uranus homelander dildo

Topic 10: Sample Messages for Review
trump dinner time checkmate
duck wife past time
trump demand tragedy deal cum riot time cum mother
queeniechaos talk way cat type trunanashabidaprzure
talk person lol

Topic 3: Sample Messages for Review
news flash twitch cumrag
lot question
ferabeast link goodbye stop spamming wordsphrases
trump viewer stop twitch ai nt shit
sleepy stop burn moon

Topic 16: Sample Messages for Review
tender crack addict biden trump maga hat
address room catboy shortage japanesecatgirls
news flash japanesecatgirls twump doangbeetle way
room turn
news flash japanesecatgirls twump doangbeetle way


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-31-24] TrumpOrBiden2024.csv

Topic 0: Sample Messages for Review
trump stuff chimpanzee
opinion allegation diddy
president voice attitude science
voice playht voice generator voice check section stream click link httpsplayht
trump hope biden empire strike maga trump

Topic 10: Sample Messages for Review
mixture pointy pleasure godzilla stool week crack happiness
president know day
explain grab someone cookis caveman speak
guy friend hate
wall pile car

Topic 3: Sample Messages for Review
foot picture
rp4agudtyme ill gatekeeper
show probers le body
look blue
trump kfc bucket body cavity mouth

Topic 4: Sample Messages for Review
subscribe fuel debate unlock httpswwwtwitchtvtrumporbiden2024subscribe
stream httpswwwredditcomraistreamfail
subscribe fuel debate unlock httpswwwtwitchtvtrumporbiden2024subscribe
sorry want make feel
stream httpswwwredditcomraistreamfail

Topic 2: Sample Messages for Revi

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[4-1-24] AtheneAIHeroes.csv

Topic 7: Sample Messages for Review
tell time dimension space box fit pool library toilet
sauce elevator tip lamb sauce safety
joke line b1rs b1rs b1rb1rs b1r b1rs str8 b1rs sideways b1rs b1rs space b1rs stick b1rs b1rs b1rs b1r b1r b1rs
chicken nuggies gimme chicken nuggies chicken nuggies phat floor nuggies nuggies nuggies b1r
athene chat

Topic 8: Sample Messages for Review
doctor giant junk check
story synphage greedythagoon synphage man greedythagoon hello name montoya b1r prepare cry
synphage
pausechamp friend
voice playht voice generator voice check section stream click link httpsplayht

Topic 2: Sample Messages for Review
pant
trainwrecks cart agent lot dude
time laforge cook food try bird meat troi environment commander bit conform commander riker
kid
heard snoop dis track poop fight cheese

Topic 1: Sample Messages for Review
gon phucking gon trash light scene 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-21-24] AtheneAIHeroes.csv

Topic 0: Sample Messages for Review
story
ramp frog throat word ground beef word
join globalists look jawline
opinion movie
ummm humm okay phuck ill thing suck bill thanks album

Topic 1: Sample Messages for Review
lmao
discord
march deed chain ignorance path truth heart justice tall flame path equality voice spread
turner tretment
burr billyy billyboi bald please hamlet backwards language uhohihahuhohihahah

Topic 2: Sample Messages for Review
pitch series doctor character time
case
tyler1 tell living
people street road saggystreet live
speedos seashark

Topic 3: Sample Messages for Review
immortality lolol
butt pucker state plan doodoo ahahahahahaha
world order mean shark pronounciation world orderlolojolorderlor
pokémon hunt plan ruby let scarlet violet
heart infowar alexjones everyones gubbment document take nutslapping supplement bidch shop discount code b1r percen

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[2-29-24] AiTelevision.csv

Topic 0: Sample Messages for Review
emo goth reveals earth
aitelevision request analysis
stash
analysis
request analysis

Topic 1: Sample Messages for Review
story replaces morty awe power show share advice morty cope
video speculum catch question morty beth question question question describes detail length size texture std
toilet song play backgroundmg milk sunsetory milk sunset
cyberpunk light britt style portrait photo cyborg robot chemical laboratory face face star painting artstation focus illustration art otomo katsuhiro shirō masamune oshii mamoru
channel point drunk call police

Topic 2: Sample Messages for Review
request analysis violence
encounter jasontheartist check masterpiece
story episode show hell host chef character tvmoviesnovelsvideogameshistoryanime character description ingredient struggle kitchen judge dish chef episode
request analysis violence
req

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[2-9-24] AiTelevision.csv

Topic 0: Sample Messages for Review
comedy roast stephen repeat joke roaster roaster use language roaster scientist end stephen roast partipants
story morty topic legit analysis question foot morty bringing counter argument counterpoint efficiency end source witness
svideo summer living whisk decides explanationlecture explanationlecture hev suit dangerhazards family member counter argument dialogue
reset story video svideo save imageq queue httpsdiscordggb3jsxhjnet youtube httpswwwyoutubecomplaylistlistpldnq2lbrvoaltduyrtpidhicqleaf5cfn
story img benjaminamoryjasontheartistjasontheartistjasontheartistjasontheartist video queue jasontheartist

Topic 1: Sample Messages for Review
folk context demographic
y mod rule wonit
analysis
analysis violence
request analysis violence

Topic 2: Sample Messages for Review
svideo summer land top sweaty man plummet stanky gaping describe 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-31-24] ask_jesus.csv

Topic 4: Sample Messages for Review
feel beer stream
right
chief sorry bother stream chance wine recipe please water thank love heyguys heyguys
apply moderator stream fill application form httpsformsgle3qujcwhywnbthn866

Topic 12: Sample Messages for Review
stop spamming wordsphrases
holy day reception room mad methane pig factory realitybit tha sanhedrin
death day
_ fur kitty sleepy purr
lunch stop bath room eat wherever eat taco supremes let food peace

Topic 8: Sample Messages for Review
thing world
hate people hate
hair advice look cool
thing
opinion skunk ape please brain cracker

Topic 6: Sample Messages for Review
chat
chat
chat
name soda name end name please wow way understand
night lord hope joy sinner chat

Topic 5: Sample Messages for Review
story adam style
childrens story easter bunny
blasphemy lord savior
i arm space story noah way please use replacement panenk

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-21-24] ask_jesus.csv

Topic 11: Sample Messages for Review
game god name
bit
hope answer mine
pizzeria thing wheelchair please explain work
mean truth hear answer

Topic 5: Sample Messages for Review
czechia thing ask geography teacher year thanks bee
ask riddle sphinx tell thank
church business vendor overcharge jewelry table guard mad
brother people death experience body experience think
room memory need code word ooga booga ooga googa

Topic 3: Sample Messages for Review
friend please space eehhhooflets_gooo brain polyp god rest ramen
explain support nonprofit project athene maker man
hey thanks story life story brain polyp understand hey
eat hamburger man word reply letter word ham rest word gur
jeucy please ghost please space shshshshshshsh response heart kumbaya

Topic 4: Sample Messages for Review
ghost please ghost please space aaghtaaaoogogoadsh response heart ramen jesua
ramen brother
q

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>