In [1]:
!pip install git+https://github.com/rwalk/gsdmm.git
!pip install nltk spacy gensim pyLDAvis networkx
!pip install wordcloud

import os
import pandas as pd
import numpy as np
import re
import random
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
import nltk
import spacy
import gensim
from gsdmm import MovieGroupProcess
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models as gensimvis
import networkx as nx
from collections import Counter
from itertools import combinations

# Load Spacy's English model for Named Entity Recognition
nlp = spacy.load("en_core_web_sm")

# Mount Google Drive to access the dataset stored there
from google.colab import drive
drive.mount('/content/drive')

# Download necessary NLTK resources for tokenization and POS tagging
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

general_stopwords = {
    'would', 'could', 'should', 'might', 'must', 'will', 'shall', 'can', 'may', 'wouldn', 'couldn', 'shouldn',
    'mightn', 'mustn', 'won', 'shan', 'can', 'mayn', 'just', 'don', 'didn', 'doesn', 'aren', 'isn', 'wasn',
    'weren', 'hasn', 'haven', 'hadn', 'does', 'did', 'don', 'does', 'did', 'don', 'now', 'then', 'once',
    'after', 'before', 'since', 'during', 'while', 'until', 'ago', 'yet', 'still', 'even', 'ever', 'always',
    'never', 'sometimes', 'often', 'usually', 'again', 'too', 'also', 'only', 'really', 'very', 'much',
    'more', 'most', 'many', 'several', 'few', 'some', 'any', 'each', 'every', 'all', 'both', 'either',
    'neither', 'anyone', 'everyone', 'someone', 'nobody', 'noone', 'nothing', 'anything', 'something',
    'everything', 'another', 'such', 'one', 'two', 'three', 'first', 'second', 'third', 'next', 'last',
    'same', 'other', 'different', 'new', 'old', 'young', 'long', 'short', 'high', 'low', 'large', 'small'
}

# Combine all sets of stop words
stop_words = set(stopwords.words('english')) | general_stopwords

# Define the preprocessing function
def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    # Remove all non-word characters and lower the text
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Split text into tokens
    tokens = text.split()
    # Apply NER to remove usernames and named entities
    doc = nlp(' '.join(tokens))
    tokens = [token.text for token in doc if not token.ent_type_]
    # POS ÁöÑ‰ΩçÁΩÆÊèêÂâç‰∫ÜÔºåÊú¨Êù•Âú®ÊúÄÂêé
    # Retain only nouns as they are significant for topic identification
    nouns = [word for word, pos in pos_tag(tokens) if pos.startswith('NN')]
    # Á≠õÈÄâÊú¨Êù• lemmatize Âíå len(word) > 2 ÊòØÂú®Âêå‰∏ÄË°åÁöÑÔºåÊúâÂèØËÉΩ lemmatize ‰πãÂêé word length Â∞è‰∫é 2ÔºåÊâÄ‰ª•ËøôÈáåÊääÈïøÂ∫¶Á≠õÈÄâÊîæÂú®ÂêéÈù¢‰∫Ü
    # Lemmatize tokens and remove stop words and short words
    nouns = [lemmatizer.lemmatize(word) for word in nouns]
    nouns = [word for word in nouns if word not in stop_words and len(word) > 2]
    return nouns

# Function to load and preprocess data from CSV files
def load_data(file_path):
    df = pd.read_csv(file_path)
    df['preprocessed'] = df['Content'].apply(preprocess)
    return df['preprocessed'].tolist()

# Path to the folder containing the dataset
folder_path = '/content/drive/My Drive/Discord dataset/Cleaned data/'
file_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]

# Function to randomly select messages from each topic for manual review
def random_message_selection(docs, cluster_assignments, num_samples=5):
    topics = set(cluster_assignments)
    for topic_id in topics:
        topic_docs = [doc for i, doc in enumerate(docs) if cluster_assignments[i] == topic_id]
        sampled_messages = random.sample(topic_docs, min(len(topic_docs), num_samples))
        print(f"\nTopic {topic_id}: Sample Messages for Review")
        for message in sampled_messages:
            print(' '.join(message))

# Function to save plots to a PDF
from matplotlib.backends.backend_pdf import PdfPages

def save_plots_to_pdf(file_path, cluster_word_distribution, top_clusters):
    pdf_file_name = f'{os.path.basename(file_path)}_report.pdf'
    with PdfPages(pdf_file_name) as pdf:
        for topic_id in top_clusters:
            # Word cloud
            topic_words = cluster_word_distribution[topic_id]
            wordcloud = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(topic_words)
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.title(f"Topic {topic_id} Word Cloud for {os.path.basename(file_path)}")
            plt.axis("off")
            pdf.savefig()
            plt.close()

            # Bar chart
            sorted_topic_words = dict(sorted(topic_words.items(), key=lambda item: item[1], reverse=True)[:10])
            plt.figure(figsize=(10, 5))
            plt.bar(sorted_topic_words.keys(), sorted_topic_words.values())
            plt.title(f"Top Words in Topic {topic_id}")
            plt.xticks(rotation=45)
            pdf.savefig()
            plt.close()

            # Network graph
            words = list(sorted_topic_words.keys())
            word_pairs = list(combinations(words, 2))
            G = nx.Graph()
            G.add_edges_from(word_pairs)
            pos = nx.spring_layout(G)
            plt.figure(figsize=(10, 5))
            nx.draw(G, pos, with_labels=True, node_size=50, font_size=10, edge_color='grey')
            plt.title(f"Word Co-occurrence Network for Topic {topic_id}")
            pdf.savefig()
            plt.close()

    return pdf_file_name

# Process each file, perform topic modeling, and generate visualizations
for file_path in file_paths:
    print(f"Processing file: {file_path}")
    docs = load_data(file_path)
    # Remove any empty documents resulting from preprocessing
    docs = [doc for doc in docs if doc]

    if len(docs) > 0:
        # Create a dictionary of all words in documents
        dictionary = gensim.corpora.Dictionary(docs)
        # Remove extremes to focus on relevant words only
        dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=10000)
        vocab_length = len(dictionary)

        if vocab_length == 0:
            print("No vocabulary available after filtering extremes. Skipping.")
            continue

        # Initialize and fit GSDMM model
        mgp = MovieGroupProcess(K=20, alpha=0.1, beta=0.3, n_iters=30)
        y = mgp.fit(docs, vocab_length)

        # Get cluster assignments and cluster word distributions
        cluster_assignments = y
        cluster_word_distribution = mgp.cluster_word_distribution

        # Random message selection for validation
        random_message_selection(docs, cluster_assignments)

        # Generate and display word clouds for each of the top 5 topics
        sorted_clusters = sorted(range(len(mgp.cluster_doc_count)), key=lambda k: mgp.cluster_doc_count[k], reverse=True)
        top_clusters = sorted_clusters[:5]

        # Save plots to PDF
        pdf_file_name = save_plots_to_pdf(file_path, cluster_word_distribution, top_clusters)

        # Provide a download link for the PDF file
        from google.colab import files
        files.download(pdf_file_name)














Collecting git+https://github.com/rwalk/gsdmm.git
  Cloning https://github.com/rwalk/gsdmm.git to /tmp/pip-req-build-i0bymqci
  Running command git clone --filter=blob:none --quiet https://github.com/rwalk/gsdmm.git /tmp/pip-req-build-i0bymqci
  Resolved https://github.com/rwalk/gsdmm.git to commit 4ad1b6b6976743681ee4976b4573463d359214ee
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gsdmm
  Building wheel for gsdmm (setup.py) ... [?25l[?25hdone
  Created wheel for gsdmm: filename=gsdmm-0.1-py3-none-any.whl size=4586 sha256=86953b50deea9a78fe1cafba1e474461e188dfb5cb72007840b3729dae9b9271
  Stored in directory: /tmp/pip-ephem-wheel-cache-38vxg046/wheels/da/d3/6e/a612d7cff0fcfb6470b8c113fc04931ecffb466ac19b9c5f3c
Successfully built gsdmm
Installing collected packages: gsdmm
Successfully installed gsdmm-0.1
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading func

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Midjourney - Chat - discussion [938713143759216720] (2022-01-01 to 2022-07-01)_anonymized.csv
In stage 0: transferred 55813 clusters with 20 clusters populated
In stage 1: transferred 50727 clusters with 20 clusters populated
In stage 2: transferred 46003 clusters with 20 clusters populated
In stage 3: transferred 41499 clusters with 20 clusters populated
In stage 4: transferred 38730 clusters with 20 clusters populated
In stage 5: transferred 37215 clusters with 20 clusters populated
In stage 6: transferred 35922 clusters with 20 clusters populated
In stage 7: transferred 35131 clusters with 20 clusters populated
In stage 8: transferred 34125 clusters with 20 clusters populated
In stage 9: transferred 33852 clusters with 20 clusters populated
In stage 10: transferred 33637 clusters with 20 clusters populated
In stage 11: transferred 33466 clusters with 20 clusters populated
In stage 12: transferred 33182 clu

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_PromptHero - PromptHero - üó®‚îÇgeneral [1030178486398234684] (2022-10-13 to 2023-01-01)_anonymized.csv
In stage 0: transferred 735 clusters with 20 clusters populated
In stage 1: transferred 623 clusters with 20 clusters populated
In stage 2: transferred 611 clusters with 20 clusters populated
In stage 3: transferred 564 clusters with 20 clusters populated
In stage 4: transferred 552 clusters with 20 clusters populated
In stage 5: transferred 560 clusters with 20 clusters populated
In stage 6: transferred 553 clusters with 20 clusters populated
In stage 7: transferred 569 clusters with 20 clusters populated
In stage 8: transferred 559 clusters with 20 clusters populated
In stage 9: transferred 561 clusters with 20 clusters populated
In stage 10: transferred 545 clusters with 20 clusters populated
In stage 11: transferred 557 clusters with 20 clusters populated
In stage 12: transferred 554 clusters with 20 c

  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Maze Guru - AI Art, Anime & Social - Chat - üí¨ÔΩúenglish-discussion [1037291441434677268] (2024-01-01 to 2024-03-01)_anonymized.csv
In stage 0: transferred 315 clusters with 20 clusters populated
In stage 1: transferred 273 clusters with 20 clusters populated
In stage 2: transferred 259 clusters with 20 clusters populated
In stage 3: transferred 263 clusters with 20 clusters populated
In stage 4: transferred 253 clusters with 20 clusters populated
In stage 5: transferred 264 clusters with 20 clusters populated
In stage 6: transferred 256 clusters with 20 clusters populated
In stage 7: transferred 246 clusters with 20 clusters populated
In stage 8: transferred 261 clusters with 20 clusters populated
In stage 9: transferred 251 clusters with 20 clusters populated
In stage 10: transferred 258 clusters with 20 clusters populated
In stage 11: transferred 262 clusters with 20 clusters populated
In stage 12: trans

  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Maze Guru - AI Art, Anime & Social - Chat - üí¨ÔΩúenglish-discussion [1037291441434677268] (2022-11-01 to 2023-04-01)_anonymized.csv
In stage 0: transferred 1290 clusters with 20 clusters populated
In stage 1: transferred 1043 clusters with 20 clusters populated
In stage 2: transferred 1011 clusters with 20 clusters populated
In stage 3: transferred 964 clusters with 20 clusters populated
In stage 4: transferred 958 clusters with 20 clusters populated
In stage 5: transferred 944 clusters with 20 clusters populated
In stage 6: transferred 932 clusters with 20 clusters populated
In stage 7: transferred 942 clusters with 20 clusters populated
In stage 8: transferred 919 clusters with 20 clusters populated
In stage 9: transferred 941 clusters with 20 clusters populated
In stage 10: transferred 899 clusters with 20 clusters populated
In stage 11: transferred 904 clusters with 20 clusters populated
In stage 12: tr

  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_üëΩ WOMBOVERSE üëΩ - „Ää üëΩ„ÄãLOUNGE - üí¨‚îÇgeneral-chat [774124295524712480] (2024-01-01 to 2024-03-01)_anonymized.csv
In stage 0: transferred 6712 clusters with 20 clusters populated
In stage 1: transferred 5868 clusters with 20 clusters populated
In stage 2: transferred 5356 clusters with 20 clusters populated
In stage 3: transferred 5162 clusters with 20 clusters populated
In stage 4: transferred 5057 clusters with 20 clusters populated
In stage 5: transferred 4916 clusters with 20 clusters populated
In stage 6: transferred 4884 clusters with 20 clusters populated
In stage 7: transferred 4867 clusters with 20 clusters populated
In stage 8: transferred 4869 clusters with 20 clusters populated
In stage 9: transferred 4830 clusters with 20 clusters populated
In stage 10: transferred 4831 clusters with 20 clusters populated
In stage 11: transferred 4768 clusters with 20 clusters populated
In stage 12: t

  pdf.savefig()
  pdf.savefig()
  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_üëΩ WOMBOVERSE üëΩ - „Ää üëΩ„ÄãLOUNGE - üí¨‚îÇgeneral-chat [774124295524712480] (2020-11-01 to 2021-12-01)_anonymized.csv
In stage 0: transferred 10151 clusters with 20 clusters populated
In stage 1: transferred 8997 clusters with 20 clusters populated
In stage 2: transferred 8334 clusters with 20 clusters populated
In stage 3: transferred 7966 clusters with 20 clusters populated
In stage 4: transferred 7793 clusters with 20 clusters populated
In stage 5: transferred 7628 clusters with 20 clusters populated
In stage 6: transferred 7648 clusters with 20 clusters populated
In stage 7: transferred 7494 clusters with 20 clusters populated
In stage 8: transferred 7368 clusters with 20 clusters populated
In stage 9: transferred 7384 clusters with 20 clusters populated
In stage 10: transferred 7331 clusters with 20 clusters populated
In stage 11: transferred 7325 clusters with 20 clusters populated
In stage 12: 

  pdf.savefig()
  pdf.savefig()
  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Midjourney - Chat - discussion [938713143759216720] (2022-02-01 to 2022-04-01)_anonymized.csv
In stage 0: transferred 6983 clusters with 20 clusters populated
In stage 1: transferred 5952 clusters with 20 clusters populated
In stage 2: transferred 5411 clusters with 20 clusters populated
In stage 3: transferred 5047 clusters with 20 clusters populated
In stage 4: transferred 4843 clusters with 20 clusters populated
In stage 5: transferred 4731 clusters with 20 clusters populated
In stage 6: transferred 4637 clusters with 20 clusters populated
In stage 7: transferred 4613 clusters with 20 clusters populated
In stage 8: transferred 4531 clusters with 20 clusters populated
In stage 9: transferred 4500 clusters with 20 clusters populated
In stage 10: transferred 4494 clusters with 20 clusters populated
In stage 11: transferred 4536 clusters with 20 clusters populated
In stage 12: transferred 4508 clusters with 20

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_LimeWire - create AI Images, Audio & Video - Community - üè†Ô∏±general [1051607872695455884]_anonymized.csv
In stage 0: transferred 3662 clusters with 20 clusters populated
In stage 1: transferred 2828 clusters with 20 clusters populated
In stage 2: transferred 2445 clusters with 20 clusters populated
In stage 3: transferred 2289 clusters with 20 clusters populated
In stage 4: transferred 2221 clusters with 20 clusters populated
In stage 5: transferred 2168 clusters with 20 clusters populated
In stage 6: transferred 2133 clusters with 20 clusters populated
In stage 7: transferred 2143 clusters with 20 clusters populated
In stage 8: transferred 2159 clusters with 20 clusters populated
In stage 9: transferred 2117 clusters with 20 clusters populated
In stage 10: transferred 2125 clusters with 20 clusters populated
In stage 11: transferred 2113 clusters with 20 clusters populated
In stage 12: transferred 2128 c

  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_PromptHero - PromptHero - üó®‚îÇgeneral [1030178486398234684]_anonymized.csv
In stage 0: transferred 65 clusters with 20 clusters populated
In stage 1: transferred 56 clusters with 20 clusters populated
In stage 2: transferred 55 clusters with 20 clusters populated
In stage 3: transferred 53 clusters with 20 clusters populated
In stage 4: transferred 51 clusters with 20 clusters populated
In stage 5: transferred 46 clusters with 20 clusters populated
In stage 6: transferred 54 clusters with 20 clusters populated
In stage 7: transferred 58 clusters with 20 clusters populated
In stage 8: transferred 51 clusters with 20 clusters populated
In stage 9: transferred 49 clusters with 20 clusters populated
In stage 10: transferred 53 clusters with 20 clusters populated
In stage 11: transferred 49 clusters with 20 clusters populated
In stage 12: transferred 44 clusters with 20 clusters populated
In stage 13: transferr

  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_LimeWire - create AI Images, Audio & Video - Community - üè†Ô∏±general [1051607872695455884] (1)_anonymized.csv
In stage 0: transferred 3248 clusters with 20 clusters populated
In stage 1: transferred 2849 clusters with 20 clusters populated
In stage 2: transferred 2593 clusters with 20 clusters populated
In stage 3: transferred 2430 clusters with 20 clusters populated
In stage 4: transferred 2342 clusters with 20 clusters populated
In stage 5: transferred 2305 clusters with 20 clusters populated
In stage 6: transferred 2264 clusters with 20 clusters populated
In stage 7: transferred 2203 clusters with 20 clusters populated
In stage 8: transferred 2157 clusters with 20 clusters populated
In stage 9: transferred 2190 clusters with 20 clusters populated
In stage 10: transferred 2165 clusters with 20 clusters populated
In stage 11: transferred 2065 clusters with 20 clusters populated
In stage 12: transferred 20

  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_r_StableDiffusion - SD Main - general [1031106064776712226] (2022-12-01 to 2023-02-01)_anonymized.csv
In stage 0: transferred 4873 clusters with 20 clusters populated
In stage 1: transferred 4067 clusters with 20 clusters populated
In stage 2: transferred 3733 clusters with 20 clusters populated
In stage 3: transferred 3466 clusters with 20 clusters populated
In stage 4: transferred 3411 clusters with 20 clusters populated
In stage 5: transferred 3282 clusters with 20 clusters populated
In stage 6: transferred 3207 clusters with 20 clusters populated
In stage 7: transferred 3169 clusters with 20 clusters populated
In stage 8: transferred 3145 clusters with 20 clusters populated
In stage 9: transferred 3162 clusters with 20 clusters populated
In stage 10: transferred 3157 clusters with 20 clusters populated
In stage 11: transferred 3129 clusters with 20 clusters populated
In stage 12: transferred 3187 clusters

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_r_StableDiffusion - SD Main - general [1031106064776712226] (2024-01-01 to 2024-03-01)_anonymized.csv
In stage 0: transferred 3225 clusters with 20 clusters populated
In stage 1: transferred 2663 clusters with 20 clusters populated
In stage 2: transferred 2482 clusters with 20 clusters populated
In stage 3: transferred 2352 clusters with 20 clusters populated
In stage 4: transferred 2354 clusters with 20 clusters populated
In stage 5: transferred 2259 clusters with 20 clusters populated
In stage 6: transferred 2232 clusters with 20 clusters populated
In stage 7: transferred 2241 clusters with 20 clusters populated
In stage 8: transferred 2193 clusters with 20 clusters populated
In stage 9: transferred 2158 clusters with 20 clusters populated
In stage 10: transferred 2183 clusters with 20 clusters populated
In stage 11: transferred 2172 clusters with 20 clusters populated
In stage 12: transferred 2162 clusters

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>