In [1]:
!pip install git+https://github.com/rwalk/gsdmm.git
!pip install nltk spacy gensim pyLDAvis networkx
!pip install wordcloud

import os
import pandas as pd
import numpy as np
import re
import random
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
import nltk
import spacy
import gensim
from gsdmm import MovieGroupProcess
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models as gensimvis
import networkx as nx
from collections import Counter
from itertools import combinations

# Load Spacy's English model for Named Entity Recognition
nlp = spacy.load("en_core_web_sm")

# Mount Google Drive to access the dataset stored there
from google.colab import drive
drive.mount('/content/drive')

# Download necessary NLTK resources for tokenization and POS tagging
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

general_stopwords = {
    'would', 'could', 'should', 'might', 'must', 'will', 'shall', 'can', 'may', 'wouldn', 'couldn', 'shouldn',
    'mightn', 'mustn', 'won', 'shan', 'can', 'mayn', 'just', 'don', 'didn', 'doesn', 'aren', 'isn', 'wasn',
    'weren', 'hasn', 'haven', 'hadn', 'does', 'did', 'don', 'does', 'did', 'don', 'now', 'then', 'once',
    'after', 'before', 'since', 'during', 'while', 'until', 'ago', 'yet', 'still', 'even', 'ever', 'always',
    'never', 'sometimes', 'often', 'usually', 'again', 'too', 'also', 'only', 'really', 'very', 'much',
    'more', 'most', 'many', 'several', 'few', 'some', 'any', 'each', 'every', 'all', 'both', 'either',
    'neither', 'anyone', 'everyone', 'someone', 'nobody', 'noone', 'nothing', 'anything', 'something',
    'everything', 'another', 'such', 'one', 'two', 'three', 'first', 'second', 'third', 'next', 'last',
    'same', 'other', 'different', 'new', 'old', 'young', 'long', 'short', 'high', 'low', 'large', 'small'
}

# Combine all sets of stop words
stop_words = set(stopwords.words('english')) | general_stopwords

# Define the preprocessing function
def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    # Remove all non-word characters and lower the text
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Split text into tokens
    tokens = text.split()
    # Apply NER to remove usernames and named entities
    doc = nlp(' '.join(tokens))
    tokens = [token.text for token in doc if not token.ent_type_]
    # POS 的位置提前了，本来在最后
    # Retain only nouns as they are significant for topic identification
    nouns = [word for word, pos in pos_tag(tokens) if pos.startswith('NN')]
    # 筛选本来 lemmatize 和 len(word) > 2 是在同一行的，有可能 lemmatize 之后 word length 小于 2，所以这里把长度筛选放在后面了
    # Lemmatize tokens and remove stop words and short words
    nouns = [lemmatizer.lemmatize(word) for word in nouns]
    nouns = [word for word in nouns if word not in stop_words and len(word) > 2]
    return nouns

# Function to load and preprocess data from CSV files
def load_data(file_path):
    df = pd.read_csv(file_path)
    df['preprocessed'] = df['Message'].apply(preprocess)
    return df['preprocessed'].tolist()

# Path to the folder containing the dataset
folder_path = '/content/drive/My Drive/Twitch dataset/Cleaned data/'
file_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]

# Function to randomly select messages from each topic for manual review
def random_message_selection(docs, cluster_assignments, num_samples=5):
    topics = set(cluster_assignments)
    for topic_id in topics:
        topic_docs = [doc for i, doc in enumerate(docs) if cluster_assignments[i] == topic_id]
        sampled_messages = random.sample(topic_docs, min(len(topic_docs), num_samples))
        print(f"\nTopic {topic_id}: Sample Messages for Review")
        for message in sampled_messages:
            print(' '.join(message))

# Function to save plots to a PDF
from matplotlib.backends.backend_pdf import PdfPages

def save_plots_to_pdf(file_path, cluster_word_distribution, top_clusters):
    pdf_file_name = f'{os.path.basename(file_path)}_report.pdf'
    with PdfPages(pdf_file_name) as pdf:
        for topic_id in top_clusters:
            # Word cloud
            topic_words = cluster_word_distribution[topic_id]
            wordcloud = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(topic_words)
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.title(f"Topic {topic_id} Word Cloud for {os.path.basename(file_path)}")
            plt.axis("off")
            pdf.savefig()
            plt.close()

            # Bar chart
            sorted_topic_words = dict(sorted(topic_words.items(), key=lambda item: item[1], reverse=True)[:10])
            plt.figure(figsize=(10, 5))
            plt.bar(sorted_topic_words.keys(), sorted_topic_words.values())
            plt.title(f"Top Words in Topic {topic_id}")
            plt.xticks(rotation=45)
            pdf.savefig()
            plt.close()

            # Network graph
            words = list(sorted_topic_words.keys())
            word_pairs = list(combinations(words, 2))
            G = nx.Graph()
            G.add_edges_from(word_pairs)
            pos = nx.spring_layout(G)
            plt.figure(figsize=(10, 5))
            nx.draw(G, pos, with_labels=True, node_size=50, font_size=10, edge_color='grey')
            plt.title(f"Word Co-occurrence Network for Topic {topic_id}")
            pdf.savefig()
            plt.close()

    return pdf_file_name

# Process each file, perform topic modeling, and generate visualizations
for file_path in file_paths:
    print(f"Processing file: {file_path}")
    docs = load_data(file_path)
    # Remove any empty documents resulting from preprocessing
    docs = [doc for doc in docs if doc]

    if len(docs) > 0:
        # Create a dictionary of all words in documents
        dictionary = gensim.corpora.Dictionary(docs)
        # Remove extremes to focus on relevant words only
        dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=10000)
        vocab_length = len(dictionary)

        if vocab_length == 0:
            print("No vocabulary available after filtering extremes. Skipping.")
            continue

        # Initialize and fit GSDMM model
        mgp = MovieGroupProcess(K=20, alpha=0.1, beta=0.3, n_iters=30)
        y = mgp.fit(docs, vocab_length)

        # Get cluster assignments and cluster word distributions
        cluster_assignments = y
        cluster_word_distribution = mgp.cluster_word_distribution

        # Random message selection for validation
        random_message_selection(docs, cluster_assignments)

        # Generate and display word clouds for each of the top 5 topics
        sorted_clusters = sorted(range(len(mgp.cluster_doc_count)), key=lambda k: mgp.cluster_doc_count[k], reverse=True)
        top_clusters = sorted_clusters[:5]

        # Save plots to PDF
        pdf_file_name = save_plots_to_pdf(file_path, cluster_word_distribution, top_clusters)

        # Provide a download link for the PDF file
        from google.colab import files
        files.download(pdf_file_name)















Collecting git+https://github.com/rwalk/gsdmm.git
  Cloning https://github.com/rwalk/gsdmm.git to /tmp/pip-req-build-07g2f95x
  Running command git clone --filter=blob:none --quiet https://github.com/rwalk/gsdmm.git /tmp/pip-req-build-07g2f95x
  Resolved https://github.com/rwalk/gsdmm.git to commit 4ad1b6b6976743681ee4976b4573463d359214ee
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gsdmm
  Building wheel for gsdmm (setup.py) ... [?25l[?25hdone
  Created wheel for gsdmm: filename=gsdmm-0.1-py3-none-any.whl size=4586 sha256=b3c4f59d7ab360b8f7dbecba4dc2239cf144868022fafb4ccf144456d9645916
  Stored in directory: /tmp/pip-ephem-wheel-cache-yacxqtuz/wheels/da/d3/6e/a612d7cff0fcfb6470b8c113fc04931ecffb466ac19b9c5f3c
Successfully built gsdmm
Installing collected packages: gsdmm
Successfully installed gsdmm-0.1
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading func

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-26-24] vedal987.csv
In stage 0: transferred 6735 clusters with 20 clusters populated
In stage 1: transferred 4976 clusters with 20 clusters populated
In stage 2: transferred 4186 clusters with 20 clusters populated
In stage 3: transferred 3846 clusters with 20 clusters populated
In stage 4: transferred 3575 clusters with 20 clusters populated
In stage 5: transferred 3447 clusters with 20 clusters populated
In stage 6: transferred 3306 clusters with 20 clusters populated
In stage 7: transferred 3176 clusters with 20 clusters populated
In stage 8: transferred 3176 clusters with 20 clusters populated
In stage 9: transferred 3106 clusters with 20 clusters populated
In stage 10: transferred 3043 clusters with 20 clusters populated
In stage 11: transferred 3042 clusters with 20 clusters populated
In stage 12: transferred 3042 clusters with 20 clusters populated
In stage 13: transferred 3030 clusters wi

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[1-5-24] vedal987.csv
In stage 0: transferred 637 clusters with 20 clusters populated
In stage 1: transferred 406 clusters with 20 clusters populated
In stage 2: transferred 368 clusters with 20 clusters populated
In stage 3: transferred 356 clusters with 20 clusters populated
In stage 4: transferred 355 clusters with 20 clusters populated
In stage 5: transferred 343 clusters with 20 clusters populated
In stage 6: transferred 357 clusters with 20 clusters populated
In stage 7: transferred 372 clusters with 20 clusters populated
In stage 8: transferred 360 clusters with 20 clusters populated
In stage 9: transferred 370 clusters with 20 clusters populated
In stage 10: transferred 360 clusters with 20 clusters populated
In stage 11: transferred 372 clusters with 20 clusters populated
In stage 12: transferred 365 clusters with 20 clusters populated
In stage 13: transferred 362 clusters with 20 clusters 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[2-3-24] AI_RacingTV.csv
In stage 0: transferred 2033 clusters with 20 clusters populated
In stage 1: transferred 1489 clusters with 20 clusters populated
In stage 2: transferred 985 clusters with 20 clusters populated
In stage 3: transferred 676 clusters with 20 clusters populated
In stage 4: transferred 596 clusters with 20 clusters populated
In stage 5: transferred 548 clusters with 20 clusters populated
In stage 6: transferred 537 clusters with 20 clusters populated
In stage 7: transferred 550 clusters with 20 clusters populated
In stage 8: transferred 543 clusters with 20 clusters populated
In stage 9: transferred 553 clusters with 20 clusters populated
In stage 10: transferred 535 clusters with 20 clusters populated
In stage 11: transferred 511 clusters with 20 clusters populated
In stage 12: transferred 495 clusters with 20 clusters populated
In stage 13: transferred 514 clusters with 20 clus

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-31-24] AI_RacingTV.csv
In stage 0: transferred 2732 clusters with 20 clusters populated
In stage 1: transferred 1798 clusters with 20 clusters populated
In stage 2: transferred 1319 clusters with 20 clusters populated
In stage 3: transferred 1138 clusters with 20 clusters populated
In stage 4: transferred 1045 clusters with 20 clusters populated
In stage 5: transferred 1008 clusters with 20 clusters populated
In stage 6: transferred 977 clusters with 20 clusters populated
In stage 7: transferred 960 clusters with 20 clusters populated
In stage 8: transferred 908 clusters with 20 clusters populated
In stage 9: transferred 908 clusters with 20 clusters populated
In stage 10: transferred 900 clusters with 20 clusters populated
In stage 11: transferred 903 clusters with 20 clusters populated
In stage 12: transferred 913 clusters with 20 clusters populated
In stage 13: transferred 906 clusters with 20

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-21-24] TrumpOrBiden2024.csv
In stage 0: transferred 4639 clusters with 20 clusters populated
In stage 1: transferred 3287 clusters with 20 clusters populated
In stage 2: transferred 2601 clusters with 20 clusters populated
In stage 3: transferred 2444 clusters with 20 clusters populated
In stage 4: transferred 2388 clusters with 20 clusters populated
In stage 5: transferred 2359 clusters with 20 clusters populated
In stage 6: transferred 2341 clusters with 20 clusters populated
In stage 7: transferred 2286 clusters with 20 clusters populated
In stage 8: transferred 2292 clusters with 20 clusters populated
In stage 9: transferred 2310 clusters with 20 clusters populated
In stage 10: transferred 2271 clusters with 20 clusters populated
In stage 11: transferred 2294 clusters with 20 clusters populated
In stage 12: transferred 2285 clusters with 20 clusters populated
In stage 13: transferred 2248 clu

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-31-24] TrumpOrBiden2024.csv
In stage 0: transferred 5918 clusters with 20 clusters populated
In stage 1: transferred 4266 clusters with 20 clusters populated
In stage 2: transferred 3507 clusters with 20 clusters populated
In stage 3: transferred 3272 clusters with 20 clusters populated
In stage 4: transferred 3185 clusters with 20 clusters populated
In stage 5: transferred 3188 clusters with 20 clusters populated
In stage 6: transferred 3175 clusters with 20 clusters populated
In stage 7: transferred 3110 clusters with 20 clusters populated
In stage 8: transferred 3090 clusters with 20 clusters populated
In stage 9: transferred 3078 clusters with 20 clusters populated
In stage 10: transferred 3044 clusters with 20 clusters populated
In stage 11: transferred 3061 clusters with 20 clusters populated
In stage 12: transferred 3033 clusters with 20 clusters populated
In stage 13: transferred 3054 clu

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[4-1-24] AtheneAIHeroes.csv
In stage 0: transferred 4038 clusters with 20 clusters populated
In stage 1: transferred 2732 clusters with 20 clusters populated
In stage 2: transferred 2300 clusters with 20 clusters populated
In stage 3: transferred 2183 clusters with 20 clusters populated
In stage 4: transferred 2091 clusters with 20 clusters populated
In stage 5: transferred 2033 clusters with 20 clusters populated
In stage 6: transferred 1988 clusters with 20 clusters populated
In stage 7: transferred 1999 clusters with 20 clusters populated
In stage 8: transferred 1977 clusters with 20 clusters populated
In stage 9: transferred 1985 clusters with 20 clusters populated
In stage 10: transferred 1962 clusters with 20 clusters populated
In stage 11: transferred 1925 clusters with 20 clusters populated
In stage 12: transferred 1939 clusters with 20 clusters populated
In stage 13: transferred 1927 cluste

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-21-24] AtheneAIHeroes.csv
In stage 0: transferred 3434 clusters with 20 clusters populated
In stage 1: transferred 2240 clusters with 20 clusters populated
In stage 2: transferred 1881 clusters with 20 clusters populated
In stage 3: transferred 1762 clusters with 20 clusters populated
In stage 4: transferred 1660 clusters with 20 clusters populated
In stage 5: transferred 1662 clusters with 20 clusters populated
In stage 6: transferred 1627 clusters with 20 clusters populated
In stage 7: transferred 1621 clusters with 20 clusters populated
In stage 8: transferred 1582 clusters with 20 clusters populated
In stage 9: transferred 1496 clusters with 20 clusters populated
In stage 10: transferred 1515 clusters with 20 clusters populated
In stage 11: transferred 1482 clusters with 20 clusters populated
In stage 12: transferred 1467 clusters with 20 clusters populated
In stage 13: transferred 1452 clust

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[2-29-24] AiTelevision.csv
In stage 0: transferred 622 clusters with 20 clusters populated
In stage 1: transferred 209 clusters with 20 clusters populated
In stage 2: transferred 119 clusters with 20 clusters populated
In stage 3: transferred 109 clusters with 20 clusters populated
In stage 4: transferred 110 clusters with 20 clusters populated
In stage 5: transferred 118 clusters with 20 clusters populated
In stage 6: transferred 110 clusters with 20 clusters populated
In stage 7: transferred 112 clusters with 20 clusters populated
In stage 8: transferred 101 clusters with 20 clusters populated
In stage 9: transferred 108 clusters with 20 clusters populated
In stage 10: transferred 107 clusters with 20 clusters populated
In stage 11: transferred 105 clusters with 20 clusters populated
In stage 12: transferred 109 clusters with 20 clusters populated
In stage 13: transferred 95 clusters with 20 clust

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[2-9-24] AiTelevision.csv
In stage 0: transferred 2033 clusters with 20 clusters populated
In stage 1: transferred 750 clusters with 20 clusters populated
In stage 2: transferred 452 clusters with 20 clusters populated
In stage 3: transferred 418 clusters with 20 clusters populated
In stage 4: transferred 427 clusters with 20 clusters populated
In stage 5: transferred 416 clusters with 20 clusters populated
In stage 6: transferred 418 clusters with 20 clusters populated
In stage 7: transferred 436 clusters with 20 clusters populated
In stage 8: transferred 438 clusters with 20 clusters populated
In stage 9: transferred 431 clusters with 20 clusters populated
In stage 10: transferred 425 clusters with 20 clusters populated
In stage 11: transferred 410 clusters with 20 clusters populated
In stage 12: transferred 423 clusters with 20 clusters populated
In stage 13: transferred 423 clusters with 20 clus

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-31-24] ask_jesus.csv
In stage 0: transferred 3459 clusters with 20 clusters populated
In stage 1: transferred 2452 clusters with 20 clusters populated
In stage 2: transferred 1981 clusters with 20 clusters populated
In stage 3: transferred 1884 clusters with 20 clusters populated
In stage 4: transferred 1806 clusters with 20 clusters populated
In stage 5: transferred 1778 clusters with 20 clusters populated
In stage 6: transferred 1781 clusters with 20 clusters populated
In stage 7: transferred 1761 clusters with 20 clusters populated
In stage 8: transferred 1724 clusters with 20 clusters populated
In stage 9: transferred 1740 clusters with 20 clusters populated
In stage 10: transferred 1736 clusters with 20 clusters populated
In stage 11: transferred 1680 clusters with 20 clusters populated
In stage 12: transferred 1649 clusters with 20 clusters populated
In stage 13: transferred 1659 clusters w

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Twitch dataset/Cleaned data/cleaned_anonymized_[3-21-24] ask_jesus.csv
In stage 0: transferred 3627 clusters with 20 clusters populated
In stage 1: transferred 2583 clusters with 20 clusters populated
In stage 2: transferred 2229 clusters with 20 clusters populated
In stage 3: transferred 2110 clusters with 20 clusters populated
In stage 4: transferred 2039 clusters with 20 clusters populated
In stage 5: transferred 2001 clusters with 20 clusters populated
In stage 6: transferred 1963 clusters with 20 clusters populated
In stage 7: transferred 1901 clusters with 20 clusters populated
In stage 8: transferred 1857 clusters with 20 clusters populated
In stage 9: transferred 1872 clusters with 20 clusters populated
In stage 10: transferred 1918 clusters with 20 clusters populated
In stage 11: transferred 1911 clusters with 20 clusters populated
In stage 12: transferred 1887 clusters with 20 clusters populated
In stage 13: transferred 1886 clusters w

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>