In [3]:
!pip install git+https://github.com/rwalk/gsdmm.git
!pip install nltk spacy gensim pyLDAvis networkx
!pip install wordcloud

import os
import pandas as pd
import numpy as np
import re
import random
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
import nltk
import spacy
import gensim
from gsdmm import MovieGroupProcess
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models as gensimvis
import networkx as nx
from collections import Counter
from itertools import combinations

# spaCy's small English pipeline supplies the named-entity tags used in preprocess()
nlp = spacy.load("en_core_web_sm")

# The dataset is read from Google Drive, so the drive is mounted first
from google.colab import drive
drive.mount('/content/drive')

# Fetch the NLTK resources this notebook relies on: the stop-word list,
# WordNet (for the lemmatizer) and the perceptron POS tagger
for nltk_resource in ('stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Filler words that carry little meaning in casual chat
additional_stopwords = {
    'get', 'okay', 'oh', 'yeah', 'hey', 'hi', 'please', 'thank', 'welcome',
    'like', 'just', 'know', 'really', 'thing', 'things', 'said', 'also',
    'one', 'use', 'using', 'used',
}

# Generic conversational terms specific to this discussion context
contextual_stopwords = {
    'help', 'think', 'probably', 'topic', 'question', 'ask', 'need',
}

# Vocabulary about AI-generated content itself
aigc_stopwords = {
    'ai', 'artificial', 'intelligence', 'generated', 'generate', 'generation',
    'content', 'image', 'images', 'video', 'videos', 'picture', 'pictures',
    'photo', 'photos', 'graphic', 'graphics', 'stable', 'diffusion', 'model',
    'models',
}

# Channel names and Twitch platform jargon
twitch_stopwords = {
    'vedal987', 'racingtv', 'ask', 'jesus', 'trump', 'biden', '2024',
    'athene', 'aiheroes', 'aitelevision', 'stream', 'streaming', 'live',
    'channel', 'twitch', 'sub', 'subscribe', 'follow', 'followers', 'chat',
    'chats', 'bot', 'bots', 'mod', 'mods', 'moderator', 'moderators',
    'emote', 'emotes', 'badge', 'badges', 'hype', 'raid', 'raids', 'clip',
    'clips', 'vod', 'vods',
}

# Final stop-word set: NLTK's English list plus every custom set above
stop_words = set(stopwords.words('english')).union(
    additional_stopwords,
    aigc_stopwords,
    contextual_stopwords,
    twitch_stopwords,
)

# Define the preprocessing function
def preprocess(text):
    """Turn one raw chat message into a list of lemmatized noun tokens.

    The message is lowercased and stripped of punctuation, tokens that spaCy
    marks as part of a named entity are dropped, only tokens that NLTK tags
    as nouns are kept, the nouns are lemmatized, and finally stop words and
    tokens of two characters or fewer are removed.

    Args:
        text: The raw message. Any non-string value (for example the NaN that
            pandas.read_csv yields for an empty cell) is treated as an empty
            message.

    Returns:
        A list of lowercase noun lemmas; empty if nothing survives filtering.
    """
    # Non-string cells have no .lower(); treat them as empty instead of crashing.
    if not isinstance(text, str):
        return []

    lemmatizer = WordNetLemmatizer()
    # Lowercase, then remove every character that is neither a word
    # character nor whitespace
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Split text into whitespace-separated tokens
    tokens = text.split()
    if not tokens:
        # Nothing left to tag; skip the spaCy and NLTK passes.
        return []

    # Apply NER to remove usernames and named entities.
    # NOTE(review): NER runs on lowercased, punctuation-free text, which may
    # reduce how many entities spaCy recognizes - confirm this is intended.
    doc = nlp(' '.join(tokens))
    tokens = [token.text for token in doc if not token.ent_type_]

    # POS filtering was moved earlier in the pipeline; it used to be the last step.
    # Retain only nouns as they are significant for topic identification
    nouns = [word for word, pos in pos_tag(tokens) if pos.startswith('NN')]

    # Lemmatization and the length filter used to share one line. Because a
    # lemma can be shorter than the original word, the length check now runs
    # after lemmatization, together with stop-word removal.
    nouns = [lemmatizer.lemmatize(word) for word in nouns]
    nouns = [word for word in nouns if word not in stop_words and len(word) > 2]
    return nouns

# Function to load and preprocess data from CSV files
def load_data(file_path):
    """Read a CSV file and return its messages as preprocessed token lists.

    Args:
        file_path: Path to a CSV file containing a 'Message' column.

    Returns:
        A list with one entry per CSV row, each entry being the result of
        preprocess() for that row's 'Message' value.
    """
    frame = pd.read_csv(file_path)
    token_lists = frame['Message'].apply(preprocess)
    # Mirror the result onto the frame, as a 'preprocessed' column
    frame['preprocessed'] = token_lists
    return token_lists.tolist()

# Folder on the mounted drive that holds the cleaned chat logs
folder_path = '/content/drive/My Drive/Twitch dataset/Cleaned data/'
# Names of the CSV files found directly inside that folder
csv_names = [name for name in os.listdir(folder_path) if name.endswith('.csv')]
# Full path of each CSV file, in the order the directory listing returned them
file_paths = [os.path.join(folder_path, name) for name in csv_names]

# Function to randomly select messages from each topic for manual review
def random_message_selection(docs, cluster_assignments, num_samples=5):
    """Print a random sample of documents from every cluster for manual review.

    Args:
        docs: Sequence of tokenized documents (each a list of strings).
        cluster_assignments: Cluster label for each document, aligned with
            ``docs`` by position.
        num_samples: Maximum number of documents printed per cluster.

    Returns:
        None. For each distinct label a header line is printed, followed by
        the sampled documents with their tokens joined by single spaces.
    """
    for topic_id in set(cluster_assignments):
        # Gather the documents carrying this label, in their original order
        members = []
        for position, tokens in enumerate(docs):
            if cluster_assignments[position] == topic_id:
                members.append(tokens)

        # Never request more documents than the cluster actually holds
        sample_size = min(len(members), num_samples)
        chosen = random.sample(members, sample_size)

        print(f"\nTopic {topic_id}: Sample Messages for Review")
        for tokens in chosen:
            print(' '.join(tokens))

# Function to save plots to a PDF
from matplotlib.backends.backend_pdf import PdfPages

def save_plots_to_pdf(file_path, cluster_word_distribution, top_clusters):
    """Write word-cloud, bar-chart and network pages for each topic to a PDF.

    For every topic in ``top_clusters`` that has at least one word, three
    pages are appended: a word cloud of all its words, a bar chart of its ten
    most frequent words, and a graph linking those top words.

    Args:
        file_path: Path of the dataset file; only its base name is used, for
            the report file name and the word-cloud title.
        cluster_word_distribution: Indexable by topic id; each entry maps a
            word to its count within that topic.
        top_clusters: Iterable of topic ids to include, in page order.

    Returns:
        The name of the PDF file written to the current working directory
        (``<basename of file_path>_report.pdf``).
    """
    source_name = os.path.basename(file_path)
    pdf_file_name = f'{source_name}_report.pdf'
    with PdfPages(pdf_file_name) as pdf:
        for topic_id in top_clusters:
            topic_words = cluster_word_distribution[topic_id]
            # WordCloud.generate_from_frequencies raises ValueError when it is
            # given no words, so topics without any words are skipped.
            if not topic_words:
                continue

            # Word cloud
            wordcloud = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(topic_words)
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.title(f"Topic {topic_id} Word Cloud for {source_name}")
            plt.axis("off")
            pdf.savefig()
            plt.close()

            # Bar chart of the ten most frequent words
            sorted_topic_words = dict(sorted(topic_words.items(), key=lambda item: item[1], reverse=True)[:10])
            plt.figure(figsize=(10, 5))
            plt.bar(sorted_topic_words.keys(), sorted_topic_words.values())
            plt.title(f"Top Words in Topic {topic_id}")
            plt.xticks(rotation=45)
            pdf.savefig()
            plt.close()

            # Network graph.
            # NOTE: every pair of top words is connected, so this is a
            # complete graph over the top words rather than measured
            # co-occurrence.
            words = list(sorted_topic_words.keys())
            G = nx.Graph()
            # Add nodes explicitly so a topic with a single word still shows it
            # (with one word there are no pairs, hence no edges).
            G.add_nodes_from(words)
            G.add_edges_from(combinations(words, 2))
            pos = nx.spring_layout(G)
            plt.figure(figsize=(10, 5))
            nx.draw(G, pos, with_labels=True, node_size=50, font_size=10, edge_color='grey')
            plt.title(f"Word Co-occurrence Network for Topic {topic_id}")
            pdf.savefig()
            plt.close()

    return pdf_file_name

# Colab helper that triggers a browser download; imported once, not per file
from google.colab import files

# Process each file, perform topic modeling, and generate visualizations
for file_path in file_paths:
    print(f"Processing file: {file_path}")
    docs = load_data(file_path)
    # Remove any empty documents resulting from preprocessing
    docs = [doc for doc in docs if doc]
    if not docs:
        continue

    # Create a dictionary of all words in documents
    dictionary = gensim.corpora.Dictionary(docs)
    # Remove extremes to focus on relevant words only
    dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=10000)
    vocab_length = len(dictionary)

    if vocab_length == 0:
        print("No vocabulary available after filtering extremes. Skipping.")
        continue

    # Restrict the documents to the filtered vocabulary. Without this the
    # model would be fitted on the unfiltered tokens while being told the
    # (smaller) filtered vocabulary size, so the two would disagree.
    kept_tokens = dictionary.token2id
    docs = [[token for token in doc if token in kept_tokens] for doc in docs]
    docs = [doc for doc in docs if doc]

    # Initialize and fit GSDMM model
    mgp = MovieGroupProcess(K=20, alpha=0.1, beta=0.3, n_iters=30)
    # fit() returns one cluster label per document
    cluster_assignments = mgp.fit(docs, vocab_length)
    cluster_word_distribution = mgp.cluster_word_distribution

    # Random message selection for validation
    random_message_selection(docs, cluster_assignments)

    # Rank the populated clusters by document count and keep the five largest;
    # clusters without documents have no words to plot.
    doc_counts = mgp.cluster_doc_count
    populated_clusters = [k for k, count in enumerate(doc_counts) if count > 0]
    sorted_clusters = sorted(populated_clusters, key=lambda k: doc_counts[k], reverse=True)
    top_clusters = sorted_clusters[:5]

    # Save plots to PDF
    pdf_file_name = save_plots_to_pdf(file_path, cluster_word_distribution, top_clusters)

    # Provide a download link for the PDF file
    files.download(pdf_file_name)













  and should_run_async(code)


Collecting git+https://github.com/rwalk/gsdmm.git
  Cloning https://github.com/rwalk/gsdmm.git to /tmp/pip-req-build-8ztbwyx1
  Running command git clone --filter=blob:none --quiet https://github.com/rwalk/gsdmm.git /tmp/pip-req-build-8ztbwyx1
  Resolved https://github.com/rwalk/gsdmm.git to commit 4ad1b6b6976743681ee4976b4573463d359214ee
  Preparing metadata (setup.py) ... [?25l[?25hdone
Mounted at /content/drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-26-24] vedal987.csv
In stage 0: transferred 6582 clusters with 20 clusters populated
In stage 1: transferred 4940 clusters with 20 clusters populated
In stage 2: transferred 3990 clusters with 20 clusters populated
In stage 3: transferred 3658 clusters with 20 clusters populated
In stage 4: transferred 3439 clusters with 20 clusters populated
In stage 5: transferred 3375 clusters with 20 clusters populated
In stage 6: transferred 3296 clusters with 20 clusters populated
In stage 7: transferred 3095 clusters with 20 clusters populated
In stage 8: transferred 2973 clusters with 20 clusters populated
In stage 9: transferred 2954 clusters with 20 clusters populated
In stage 10: transferred 2927 clusters with 20 clusters populated
In stage 11: transferred 2895 clusters with 20 clusters populated
In stage 12: transferred 2838 clusters with 20 clusters populated
In stage 13: transferred 2804 clusters wi

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[1-5-24] vedal987.csv
In stage 0: transferred 608 clusters with 20 clusters populated
In stage 1: transferred 417 clusters with 20 clusters populated
In stage 2: transferred 385 clusters with 20 clusters populated
In stage 3: transferred 354 clusters with 20 clusters populated
In stage 4: transferred 369 clusters with 20 clusters populated
In stage 5: transferred 352 clusters with 20 clusters populated
In stage 6: transferred 354 clusters with 20 clusters populated
In stage 7: transferred 345 clusters with 20 clusters populated
In stage 8: transferred 344 clusters with 20 clusters populated
In stage 9: transferred 359 clusters with 20 clusters populated
In stage 10: transferred 368 clusters with 20 clusters populated
In stage 11: transferred 378 clusters with 20 clusters populated
In stage 12: transferred 391 clusters with 20 clusters populated
In stage 13: transferred 353 clusters with 20 clusters 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[2-3-24] AI_RacingTV.csv
In stage 0: transferred 2022 clusters with 20 clusters populated
In stage 1: transferred 1390 clusters with 20 clusters populated
In stage 2: transferred 993 clusters with 20 clusters populated
In stage 3: transferred 821 clusters with 20 clusters populated
In stage 4: transferred 738 clusters with 20 clusters populated
In stage 5: transferred 658 clusters with 20 clusters populated
In stage 6: transferred 615 clusters with 20 clusters populated
In stage 7: transferred 570 clusters with 20 clusters populated
In stage 8: transferred 574 clusters with 20 clusters populated
In stage 9: transferred 543 clusters with 20 clusters populated
In stage 10: transferred 528 clusters with 20 clusters populated
In stage 11: transferred 508 clusters with 20 clusters populated
In stage 12: transferred 505 clusters with 20 clusters populated
In stage 13: transferred 503 clusters with 20 clus

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-31-24] AI_RacingTV.csv
In stage 0: transferred 2716 clusters with 20 clusters populated
In stage 1: transferred 1857 clusters with 20 clusters populated
In stage 2: transferred 1395 clusters with 20 clusters populated
In stage 3: transferred 1210 clusters with 20 clusters populated
In stage 4: transferred 1155 clusters with 20 clusters populated
In stage 5: transferred 1101 clusters with 20 clusters populated
In stage 6: transferred 1060 clusters with 20 clusters populated
In stage 7: transferred 1011 clusters with 20 clusters populated
In stage 8: transferred 1007 clusters with 20 clusters populated
In stage 9: transferred 987 clusters with 20 clusters populated
In stage 10: transferred 959 clusters with 20 clusters populated
In stage 11: transferred 977 clusters with 20 clusters populated
In stage 12: transferred 976 clusters with 20 clusters populated
In stage 13: transferred 990 clusters with

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-21-24] TrumpOrBiden2024.csv
In stage 0: transferred 4209 clusters with 20 clusters populated
In stage 1: transferred 3173 clusters with 20 clusters populated
In stage 2: transferred 2716 clusters with 20 clusters populated
In stage 3: transferred 2531 clusters with 20 clusters populated
In stage 4: transferred 2494 clusters with 20 clusters populated
In stage 5: transferred 2414 clusters with 20 clusters populated
In stage 6: transferred 2369 clusters with 20 clusters populated
In stage 7: transferred 2342 clusters with 20 clusters populated
In stage 8: transferred 2267 clusters with 20 clusters populated
In stage 9: transferred 2224 clusters with 20 clusters populated
In stage 10: transferred 2242 clusters with 20 clusters populated
In stage 11: transferred 2271 clusters with 20 clusters populated
In stage 12: transferred 2291 clusters with 20 clusters populated
In stage 13: transferred 2255 clu

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-31-24] TrumpOrBiden2024.csv
In stage 0: transferred 5435 clusters with 20 clusters populated
In stage 1: transferred 4035 clusters with 20 clusters populated
In stage 2: transferred 3611 clusters with 20 clusters populated
In stage 3: transferred 3381 clusters with 20 clusters populated
In stage 4: transferred 3315 clusters with 20 clusters populated
In stage 5: transferred 3308 clusters with 20 clusters populated
In stage 6: transferred 3248 clusters with 20 clusters populated
In stage 7: transferred 3179 clusters with 20 clusters populated
In stage 8: transferred 3133 clusters with 20 clusters populated
In stage 9: transferred 3117 clusters with 20 clusters populated
In stage 10: transferred 3125 clusters with 20 clusters populated
In stage 11: transferred 3046 clusters with 20 clusters populated
In stage 12: transferred 3062 clusters with 20 clusters populated
In stage 13: transferred 2984 clu

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[4-1-24] AtheneAIHeroes.csv
In stage 0: transferred 3954 clusters with 20 clusters populated
In stage 1: transferred 2649 clusters with 20 clusters populated
In stage 2: transferred 2261 clusters with 20 clusters populated
In stage 3: transferred 2165 clusters with 20 clusters populated
In stage 4: transferred 2152 clusters with 20 clusters populated
In stage 5: transferred 2068 clusters with 20 clusters populated
In stage 6: transferred 1987 clusters with 20 clusters populated
In stage 7: transferred 1970 clusters with 20 clusters populated
In stage 8: transferred 1967 clusters with 20 clusters populated
In stage 9: transferred 1969 clusters with 20 clusters populated
In stage 10: transferred 1929 clusters with 20 clusters populated
In stage 11: transferred 1887 clusters with 20 clusters populated
In stage 12: transferred 1869 clusters with 20 clusters populated
In stage 13: transferred 1871 cluste

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-21-24] AtheneAIHeroes.csv
In stage 0: transferred 3425 clusters with 20 clusters populated
In stage 1: transferred 2197 clusters with 20 clusters populated
In stage 2: transferred 1871 clusters with 20 clusters populated
In stage 3: transferred 1784 clusters with 20 clusters populated
In stage 4: transferred 1680 clusters with 20 clusters populated
In stage 5: transferred 1661 clusters with 20 clusters populated
In stage 6: transferred 1669 clusters with 20 clusters populated
In stage 7: transferred 1590 clusters with 20 clusters populated
In stage 8: transferred 1562 clusters with 20 clusters populated
In stage 9: transferred 1552 clusters with 20 clusters populated
In stage 10: transferred 1541 clusters with 20 clusters populated
In stage 11: transferred 1543 clusters with 20 clusters populated
In stage 12: transferred 1535 clusters with 20 clusters populated
In stage 13: transferred 1454 clust

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[2-29-24] AiTelevision.csv
In stage 0: transferred 615 clusters with 20 clusters populated
In stage 1: transferred 234 clusters with 20 clusters populated
In stage 2: transferred 127 clusters with 20 clusters populated
In stage 3: transferred 123 clusters with 20 clusters populated
In stage 4: transferred 121 clusters with 20 clusters populated
In stage 5: transferred 101 clusters with 20 clusters populated
In stage 6: transferred 102 clusters with 20 clusters populated
In stage 7: transferred 102 clusters with 20 clusters populated
In stage 8: transferred 109 clusters with 20 clusters populated
In stage 9: transferred 112 clusters with 20 clusters populated
In stage 10: transferred 112 clusters with 20 clusters populated
In stage 11: transferred 108 clusters with 20 clusters populated
In stage 12: transferred 103 clusters with 20 clusters populated
In stage 13: transferred 96 clusters with 20 clust

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[2-9-24] AiTelevision.csv
In stage 0: transferred 2028 clusters with 20 clusters populated
In stage 1: transferred 835 clusters with 20 clusters populated
In stage 2: transferred 466 clusters with 20 clusters populated
In stage 3: transferred 441 clusters with 20 clusters populated
In stage 4: transferred 442 clusters with 20 clusters populated
In stage 5: transferred 440 clusters with 20 clusters populated
In stage 6: transferred 434 clusters with 20 clusters populated
In stage 7: transferred 420 clusters with 20 clusters populated
In stage 8: transferred 430 clusters with 20 clusters populated
In stage 9: transferred 418 clusters with 20 clusters populated
In stage 10: transferred 412 clusters with 20 clusters populated
In stage 11: transferred 404 clusters with 20 clusters populated
In stage 12: transferred 419 clusters with 20 clusters populated
In stage 13: transferred 422 clusters with 20 clus

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-31-24] ask_jesus.csv
In stage 0: transferred 3249 clusters with 20 clusters populated
In stage 1: transferred 2221 clusters with 20 clusters populated
In stage 2: transferred 1862 clusters with 20 clusters populated
In stage 3: transferred 1773 clusters with 20 clusters populated
In stage 4: transferred 1686 clusters with 20 clusters populated
In stage 5: transferred 1684 clusters with 20 clusters populated
In stage 6: transferred 1639 clusters with 20 clusters populated
In stage 7: transferred 1668 clusters with 20 clusters populated
In stage 8: transferred 1619 clusters with 20 clusters populated
In stage 9: transferred 1659 clusters with 20 clusters populated
In stage 10: transferred 1635 clusters with 20 clusters populated
In stage 11: transferred 1641 clusters with 20 clusters populated
In stage 12: transferred 1644 clusters with 20 clusters populated
In stage 13: transferred 1634 clusters w

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-21-24] ask_jesus.csv
In stage 0: transferred 3440 clusters with 20 clusters populated
In stage 1: transferred 2468 clusters with 20 clusters populated
In stage 2: transferred 2151 clusters with 20 clusters populated
In stage 3: transferred 2020 clusters with 20 clusters populated
In stage 4: transferred 1950 clusters with 20 clusters populated
In stage 5: transferred 1947 clusters with 20 clusters populated
In stage 6: transferred 1938 clusters with 20 clusters populated
In stage 7: transferred 1932 clusters with 20 clusters populated
In stage 8: transferred 1916 clusters with 20 clusters populated
In stage 9: transferred 1943 clusters with 20 clusters populated
In stage 10: transferred 1913 clusters with 20 clusters populated
In stage 11: transferred 1887 clusters with 20 clusters populated
In stage 12: transferred 1876 clusters with 20 clusters populated
In stage 13: transferred 1901 clusters w

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>