In [1]:
!pip install git+https://github.com/rwalk/gsdmm.git
!pip install nltk spacy gensim pyLDAvis networkx
!pip install wordcloud

import os
import pandas as pd
import numpy as np
import re
import random
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
import nltk
import spacy
import gensim
from gsdmm import MovieGroupProcess
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models as gensimvis
import networkx as nx
from collections import Counter
from itertools import combinations

# Load Spacy's English model for Named Entity Recognition
nlp = spacy.load("en_core_web_sm")

# Mount Google Drive to access the dataset stored there
from google.colab import drive
drive.mount('/content/drive')

# Download necessary NLTK resources for tokenization and POS tagging
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Enhance the stop words list with terms commonly irrelevant in casual discussions
additional_stopwords = {'get', 'okay', 'oh', 'yeah', 'hey', 'hi', 'please', 'thank', 'welcome', 'like', 'just', 'know', 'really', 'thing', 'things', 'said', 'also', 'one', 'use', 'using', 'used'}

# Context-specific terms related to AI-generated content discussions
contextual_stopwords = {'help', 'think', 'probably', 'topic', 'question', 'ask', 'need'}

# AI-generated content specific terms
aigc_stopwords = {'ai', 'artificial', 'intelligence', 'generated', 'generate', 'generation', 'content', 'image', 'images', 'video', 'videos', 'picture', 'pictures', 'photo', 'photos', 'graphic', 'graphics', 'stable', 'diffusion', 'model', 'models'}

# Platform-specific terms related to Discord discussions
discord_stopwords = {'discord', 'server', 'midjourney', 'limewire', 'womboverse', 'maze', 'guru', 'prompthero', 'stablediffusion'}

# Combine all sets of stop words
stop_words = set(stopwords.words('english')) | additional_stopwords | aigc_stopwords | contextual_stopwords | discord_stopwords

# Define the preprocessing function
def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    # Remove all non-word characters and lower the text
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Split text into tokens
    tokens = text.split()
    # Apply NER to remove usernames and named entities
    doc = nlp(' '.join(tokens))
    tokens = [token.text for token in doc if not token.ent_type_]
    # POS ÁöÑ‰ΩçÁΩÆÊèêÂâç‰∫ÜÔºåÊú¨Êù•Âú®ÊúÄÂêé
    # Retain only nouns as they are significant for topic identification
    nouns = [word for word, pos in pos_tag(tokens) if pos.startswith('NN')]
    # Á≠õÈÄâÊú¨Êù• lemmatize Âíå len(word) > 2 ÊòØÂú®Âêå‰∏ÄË°åÁöÑÔºåÊúâÂèØËÉΩ lemmatize ‰πãÂêé word length Â∞è‰∫é 2ÔºåÊâÄ‰ª•ËøôÈáåÊääÈïøÂ∫¶Á≠õÈÄâÊîæÂú®ÂêéÈù¢‰∫Ü
    # Lemmatize tokens and remove stop words and short words
    nouns = [lemmatizer.lemmatize(word) for word in nouns]
    nouns = [word for word in nouns if word not in stop_words and len(word) > 2]
    return nouns

# Function to load and preprocess data from CSV files
def load_data(file_path):
    df = pd.read_csv(file_path)
    df['preprocessed'] = df['Content'].apply(preprocess)
    return df['preprocessed'].tolist()

# Path to the folder containing the dataset
folder_path = '/content/drive/My Drive/Discord dataset/Cleaned data/'
file_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]

# Function to randomly select messages from each topic for manual review
def random_message_selection(docs, cluster_assignments, num_samples=5):
    topics = set(cluster_assignments)
    for topic_id in topics:
        topic_docs = [doc for i, doc in enumerate(docs) if cluster_assignments[i] == topic_id]
        sampled_messages = random.sample(topic_docs, min(len(topic_docs), num_samples))
        print(f"\nTopic {topic_id}: Sample Messages for Review")
        for message in sampled_messages:
            print(' '.join(message))

# Function to save plots to a PDF
from matplotlib.backends.backend_pdf import PdfPages

def save_plots_to_pdf(file_path, cluster_word_distribution, top_clusters):
    pdf_file_name = f'{os.path.basename(file_path)}_report.pdf'
    with PdfPages(pdf_file_name) as pdf:
        for topic_id in top_clusters:
            # Word cloud
            topic_words = cluster_word_distribution[topic_id]
            wordcloud = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(topic_words)
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.title(f"Topic {topic_id} Word Cloud for {os.path.basename(file_path)}")
            plt.axis("off")
            pdf.savefig()
            plt.close()

            # Bar chart
            sorted_topic_words = dict(sorted(topic_words.items(), key=lambda item: item[1], reverse=True)[:10])
            plt.figure(figsize=(10, 5))
            plt.bar(sorted_topic_words.keys(), sorted_topic_words.values())
            plt.title(f"Top Words in Topic {topic_id}")
            plt.xticks(rotation=45)
            pdf.savefig()
            plt.close()

            # Network graph
            words = list(sorted_topic_words.keys())
            word_pairs = list(combinations(words, 2))
            G = nx.Graph()
            G.add_edges_from(word_pairs)
            pos = nx.spring_layout(G)
            plt.figure(figsize=(10, 5))
            nx.draw(G, pos, with_labels=True, node_size=50, font_size=10, edge_color='grey')
            plt.title(f"Word Co-occurrence Network for Topic {topic_id}")
            pdf.savefig()
            plt.close()

    return pdf_file_name

# Process each file, perform topic modeling, and generate visualizations
for file_path in file_paths:
    print(f"Processing file: {file_path}")
    docs = load_data(file_path)
    # Remove any empty documents resulting from preprocessing
    docs = [doc for doc in docs if doc]

    if len(docs) > 0:
        # Create a dictionary of all words in documents
        dictionary = gensim.corpora.Dictionary(docs)
        # Remove extremes to focus on relevant words only
        dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=10000)
        vocab_length = len(dictionary)

        if vocab_length == 0:
            print("No vocabulary available after filtering extremes. Skipping.")
            continue

        # Initialize and fit GSDMM model
        mgp = MovieGroupProcess(K=20, alpha=0.1, beta=0.3, n_iters=30)
        y = mgp.fit(docs, vocab_length)

        # Get cluster assignments and cluster word distributions
        cluster_assignments = y
        cluster_word_distribution = mgp.cluster_word_distribution

        # Random message selection for validation
        random_message_selection(docs, cluster_assignments)

        # Generate and display word clouds for each of the top 5 topics
        sorted_clusters = sorted(range(len(mgp.cluster_doc_count)), key=lambda k: mgp.cluster_doc_count[k], reverse=True)
        top_clusters = sorted_clusters[:5]

        # Save plots to PDF
        pdf_file_name = save_plots_to_pdf(file_path, cluster_word_distribution, top_clusters)

        # Provide a download link for the PDF file
        from google.colab import files
        files.download(pdf_file_name)














Collecting git+https://github.com/rwalk/gsdmm.git
  Cloning https://github.com/rwalk/gsdmm.git to /tmp/pip-req-build-pmpokfv2
  Running command git clone --filter=blob:none --quiet https://github.com/rwalk/gsdmm.git /tmp/pip-req-build-pmpokfv2
  Resolved https://github.com/rwalk/gsdmm.git to commit 4ad1b6b6976743681ee4976b4573463d359214ee
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gsdmm
  Building wheel for gsdmm (setup.py) ... [?25l[?25hdone
  Created wheel for gsdmm: filename=gsdmm-0.1-py3-none-any.whl size=4586 sha256=2bd06af7801a736e0439ec0e02c6b44bb9aac57bcfb1a6c2a24e914a35db4f80
  Stored in directory: /tmp/pip-ephem-wheel-cache-5obojgl7/wheels/da/d3/6e/a612d7cff0fcfb6470b8c113fc04931ecffb466ac19b9c5f3c
Successfully built gsdmm
Installing collected packages: gsdmm
Successfully installed gsdmm-0.1
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading func

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Midjourney - Chat - discussion [938713143759216720] (2022-01-01 to 2022-07-01)_anonymized.csv
In stage 0: transferred 54536 clusters with 20 clusters populated
In stage 1: transferred 49632 clusters with 20 clusters populated
In stage 2: transferred 45627 clusters with 20 clusters populated
In stage 3: transferred 41825 clusters with 20 clusters populated
In stage 4: transferred 38985 clusters with 20 clusters populated
In stage 5: transferred 37380 clusters with 20 clusters populated
In stage 6: transferred 36393 clusters with 20 clusters populated
In stage 7: transferred 35666 clusters with 20 clusters populated
In stage 8: transferred 35224 clusters with 20 clusters populated
In stage 9: transferred 34941 clusters with 20 clusters populated
In stage 10: transferred 34519 clusters with 20 clusters populated
In stage 11: transferred 34464 clusters with 20 clusters populated
In stage 12: transferred 34179 clu

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_PromptHero - PromptHero - üó®‚îÇgeneral [1030178486398234684] (2022-10-13 to 2023-01-01)_anonymized.csv
In stage 0: transferred 710 clusters with 20 clusters populated
In stage 1: transferred 623 clusters with 20 clusters populated
In stage 2: transferred 569 clusters with 20 clusters populated
In stage 3: transferred 563 clusters with 20 clusters populated
In stage 4: transferred 557 clusters with 20 clusters populated
In stage 5: transferred 539 clusters with 20 clusters populated
In stage 6: transferred 544 clusters with 20 clusters populated
In stage 7: transferred 565 clusters with 20 clusters populated
In stage 8: transferred 556 clusters with 20 clusters populated
In stage 9: transferred 559 clusters with 20 clusters populated
In stage 10: transferred 563 clusters with 20 clusters populated
In stage 11: transferred 572 clusters with 20 clusters populated
In stage 12: transferred 565 clusters with 20 c

  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Maze Guru - AI Art, Anime & Social - Chat - üí¨ÔΩúenglish-discussion [1037291441434677268] (2024-01-01 to 2024-03-01)_anonymized.csv
In stage 0: transferred 314 clusters with 20 clusters populated
In stage 1: transferred 265 clusters with 20 clusters populated
In stage 2: transferred 275 clusters with 20 clusters populated
In stage 3: transferred 272 clusters with 20 clusters populated
In stage 4: transferred 248 clusters with 20 clusters populated
In stage 5: transferred 259 clusters with 20 clusters populated
In stage 6: transferred 237 clusters with 20 clusters populated
In stage 7: transferred 256 clusters with 20 clusters populated
In stage 8: transferred 252 clusters with 20 clusters populated
In stage 9: transferred 247 clusters with 20 clusters populated
In stage 10: transferred 259 clusters with 20 clusters populated
In stage 11: transferred 240 clusters with 20 clusters populated
In stage 12: trans

  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Maze Guru - AI Art, Anime & Social - Chat - üí¨ÔΩúenglish-discussion [1037291441434677268] (2022-11-01 to 2023-04-01)_anonymized.csv
In stage 0: transferred 1257 clusters with 20 clusters populated
In stage 1: transferred 1086 clusters with 20 clusters populated
In stage 2: transferred 1001 clusters with 20 clusters populated
In stage 3: transferred 957 clusters with 20 clusters populated
In stage 4: transferred 945 clusters with 20 clusters populated
In stage 5: transferred 944 clusters with 20 clusters populated
In stage 6: transferred 973 clusters with 20 clusters populated
In stage 7: transferred 926 clusters with 20 clusters populated
In stage 8: transferred 921 clusters with 20 clusters populated
In stage 9: transferred 917 clusters with 20 clusters populated
In stage 10: transferred 941 clusters with 20 clusters populated
In stage 11: transferred 948 clusters with 20 clusters populated
In stage 12: tr

  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_üëΩ WOMBOVERSE üëΩ - „Ää üëΩ„ÄãLOUNGE - üí¨‚îÇgeneral-chat [774124295524712480] (2024-01-01 to 2024-03-01)_anonymized.csv
In stage 0: transferred 6555 clusters with 20 clusters populated
In stage 1: transferred 5772 clusters with 20 clusters populated
In stage 2: transferred 5442 clusters with 20 clusters populated
In stage 3: transferred 5206 clusters with 20 clusters populated
In stage 4: transferred 5074 clusters with 20 clusters populated
In stage 5: transferred 4987 clusters with 20 clusters populated
In stage 6: transferred 4857 clusters with 20 clusters populated
In stage 7: transferred 4909 clusters with 20 clusters populated
In stage 8: transferred 4835 clusters with 20 clusters populated
In stage 9: transferred 4818 clusters with 20 clusters populated
In stage 10: transferred 4755 clusters with 20 clusters populated
In stage 11: transferred 4785 clusters with 20 clusters populated
In stage 12: t

  pdf.savefig()
  pdf.savefig()
  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_üëΩ WOMBOVERSE üëΩ - „Ää üëΩ„ÄãLOUNGE - üí¨‚îÇgeneral-chat [774124295524712480] (2020-11-01 to 2021-12-01)_anonymized.csv
In stage 0: transferred 9851 clusters with 20 clusters populated
In stage 1: transferred 8714 clusters with 20 clusters populated
In stage 2: transferred 8180 clusters with 20 clusters populated
In stage 3: transferred 7827 clusters with 20 clusters populated
In stage 4: transferred 7693 clusters with 20 clusters populated
In stage 5: transferred 7503 clusters with 20 clusters populated
In stage 6: transferred 7449 clusters with 20 clusters populated
In stage 7: transferred 7320 clusters with 20 clusters populated
In stage 8: transferred 7235 clusters with 20 clusters populated
In stage 9: transferred 7163 clusters with 20 clusters populated
In stage 10: transferred 7115 clusters with 20 clusters populated
In stage 11: transferred 7060 clusters with 20 clusters populated
In stage 12: t

  pdf.savefig()
  pdf.savefig()
  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Midjourney - Chat - discussion [938713143759216720] (2022-02-01 to 2022-04-01)_anonymized.csv
In stage 0: transferred 6864 clusters with 20 clusters populated
In stage 1: transferred 5823 clusters with 20 clusters populated
In stage 2: transferred 5276 clusters with 20 clusters populated
In stage 3: transferred 5017 clusters with 20 clusters populated
In stage 4: transferred 4778 clusters with 20 clusters populated
In stage 5: transferred 4779 clusters with 20 clusters populated
In stage 6: transferred 4657 clusters with 20 clusters populated
In stage 7: transferred 4676 clusters with 20 clusters populated
In stage 8: transferred 4626 clusters with 20 clusters populated
In stage 9: transferred 4526 clusters with 20 clusters populated
In stage 10: transferred 4507 clusters with 20 clusters populated
In stage 11: transferred 4543 clusters with 20 clusters populated
In stage 12: transferred 4481 clusters with 20

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_LimeWire - create AI Images, Audio & Video - Community - üè†Ô∏±general [1051607872695455884]_anonymized.csv
In stage 0: transferred 3490 clusters with 20 clusters populated
In stage 1: transferred 2665 clusters with 20 clusters populated
In stage 2: transferred 2395 clusters with 20 clusters populated
In stage 3: transferred 2239 clusters with 20 clusters populated
In stage 4: transferred 2179 clusters with 20 clusters populated
In stage 5: transferred 2142 clusters with 20 clusters populated
In stage 6: transferred 2123 clusters with 20 clusters populated
In stage 7: transferred 2050 clusters with 20 clusters populated
In stage 8: transferred 2088 clusters with 20 clusters populated
In stage 9: transferred 2029 clusters with 20 clusters populated
In stage 10: transferred 2082 clusters with 20 clusters populated
In stage 11: transferred 2033 clusters with 20 clusters populated
In stage 12: transferred 1992 c

  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_PromptHero - PromptHero - üó®‚îÇgeneral [1030178486398234684]_anonymized.csv
In stage 0: transferred 62 clusters with 20 clusters populated
In stage 1: transferred 48 clusters with 20 clusters populated
In stage 2: transferred 47 clusters with 20 clusters populated
In stage 3: transferred 44 clusters with 20 clusters populated
In stage 4: transferred 54 clusters with 20 clusters populated
In stage 5: transferred 59 clusters with 20 clusters populated
In stage 6: transferred 52 clusters with 20 clusters populated
In stage 7: transferred 50 clusters with 20 clusters populated
In stage 8: transferred 49 clusters with 20 clusters populated
In stage 9: transferred 49 clusters with 20 clusters populated
In stage 10: transferred 49 clusters with 20 clusters populated
In stage 11: transferred 49 clusters with 20 clusters populated
In stage 12: transferred 51 clusters with 20 clusters populated
In stage 13: transferr

  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_LimeWire - create AI Images, Audio & Video - Community - üè†Ô∏±general [1051607872695455884] (1)_anonymized.csv
In stage 0: transferred 3053 clusters with 20 clusters populated
In stage 1: transferred 2679 clusters with 20 clusters populated
In stage 2: transferred 2473 clusters with 20 clusters populated
In stage 3: transferred 2349 clusters with 20 clusters populated
In stage 4: transferred 2310 clusters with 20 clusters populated
In stage 5: transferred 2228 clusters with 20 clusters populated
In stage 6: transferred 2143 clusters with 20 clusters populated
In stage 7: transferred 2141 clusters with 20 clusters populated
In stage 8: transferred 2119 clusters with 20 clusters populated
In stage 9: transferred 2135 clusters with 20 clusters populated
In stage 10: transferred 2188 clusters with 20 clusters populated
In stage 11: transferred 2130 clusters with 20 clusters populated
In stage 12: transferred 20

  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_r_StableDiffusion - SD Main - general [1031106064776712226] (2022-12-01 to 2023-02-01)_anonymized.csv
In stage 0: transferred 4714 clusters with 20 clusters populated
In stage 1: transferred 4011 clusters with 20 clusters populated
In stage 2: transferred 3646 clusters with 20 clusters populated
In stage 3: transferred 3482 clusters with 20 clusters populated
In stage 4: transferred 3361 clusters with 20 clusters populated
In stage 5: transferred 3340 clusters with 20 clusters populated
In stage 6: transferred 3296 clusters with 20 clusters populated
In stage 7: transferred 3277 clusters with 20 clusters populated
In stage 8: transferred 3128 clusters with 20 clusters populated
In stage 9: transferred 3166 clusters with 20 clusters populated
In stage 10: transferred 3148 clusters with 20 clusters populated
In stage 11: transferred 3128 clusters with 20 clusters populated
In stage 12: transferred 3135 clusters

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_r_StableDiffusion - SD Main - general [1031106064776712226] (2024-01-01 to 2024-03-01)_anonymized.csv
In stage 0: transferred 3151 clusters with 20 clusters populated
In stage 1: transferred 2613 clusters with 20 clusters populated
In stage 2: transferred 2431 clusters with 20 clusters populated
In stage 3: transferred 2365 clusters with 20 clusters populated
In stage 4: transferred 2328 clusters with 20 clusters populated
In stage 5: transferred 2307 clusters with 20 clusters populated
In stage 6: transferred 2278 clusters with 20 clusters populated
In stage 7: transferred 2216 clusters with 20 clusters populated
In stage 8: transferred 2182 clusters with 20 clusters populated
In stage 9: transferred 2193 clusters with 20 clusters populated
In stage 10: transferred 2142 clusters with 20 clusters populated
In stage 11: transferred 2191 clusters with 20 clusters populated
In stage 12: transferred 2213 clusters

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>