In [1]:
!pip install git+https://github.com/rwalk/gsdmm.git
!pip install nltk spacy gensim pyLDAvis networkx wordcloud

import os
import pandas as pd
import numpy as np
import re
import random
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
import nltk
import spacy
import gensim
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import networkx as nx
from collections import Counter
from itertools import combinations
from matplotlib.backends.backend_pdf import PdfPages

# Load Spacy's English model for Named Entity Recognition
nlp = spacy.load("en_core_web_sm")

# Mount Google Drive to access the dataset stored there
from google.colab import drive
drive.mount('/content/drive')

# Download necessary NLTK resources for tokenization and POS tagging
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Enhance the stop words list with terms commonly irrelevant in casual discussions
additional_stopwords = {'get', 'okay', 'oh', 'yeah', 'hey', 'hi', 'please', 'thank', 'welcome', 'like', 'just', 'know', 'really', 'thing', 'things', 'said', 'also', 'one', 'use', 'using', 'used'}

# Context-specific terms related to AI-generated content discussions
contextual_stopwords = {'help', 'think', 'probably', 'topic', 'question', 'ask', 'need'}

# AI-generated content specific terms
aigc_stopwords = {'ai', 'artificial', 'intelligence', 'generated', 'generate', 'generation', 'content', 'image', 'images', 'video', 'videos', 'picture', 'pictures', 'photo', 'photos', 'graphic', 'graphics', 'stable', 'diffusion', 'model', 'models'}

# Platform-specific terms related to Discord discussions
discord_stopwords = {'discord', 'server', 'midjourney', 'limewire', 'womboverse', 'maze', 'guru', 'prompthero', 'stablediffusion'}

# Combine all sets of stop words
stop_words = set(stopwords.words('english')) | additional_stopwords | aigc_stopwords | contextual_stopwords | discord_stopwords

# Define the preprocessing function
def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    # Remove all non-word characters and lower the text
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Split text into tokens
    tokens = text.split()
    # Apply NER to remove usernames and named entities
    doc = nlp(' '.join(tokens))
    tokens = [token.text for token in doc if not token.ent_type_]
    # POS ÁöÑ‰ΩçÁΩÆÊèêÂâç‰∫ÜÔºåÊú¨Êù•Âú®ÊúÄÂêé
    # Retain only nouns as they are significant for topic identification
    nouns = [word for word, pos in pos_tag(tokens) if pos.startswith('NN')]
    # Á≠õÈÄâÊú¨Êù• lemmatize Âíå len(word) > 2 ÊòØÂú®Âêå‰∏ÄË°åÁöÑÔºåÊúâÂèØËÉΩ lemmatize ‰πãÂêé word length Â∞è‰∫é 2ÔºåÊâÄ‰ª•ËøôÈáåÊääÈïøÂ∫¶Á≠õÈÄâÊîæÂú®ÂêéÈù¢‰∫Ü
    # Lemmatize tokens and remove stop words and short words
    nouns = [lemmatizer.lemmatize(word) for word in nouns]
    nouns = [word for word in nouns if word not in stop_words and len(word) > 2]
    return nouns

# Function to load and preprocess data from CSV files
def load_data(file_path):
    df = pd.read_csv(file_path)
    df['preprocessed'] = df['Content'].apply(preprocess)
    return df['preprocessed'].tolist()

# Path to the folder containing the dataset
folder_path = '/content/drive/My Drive/Discord dataset/Cleaned data/'
file_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]

# Function to evaluate coherence
def evaluate_coherence(dictionary, docs, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=[dictionary.doc2bow(doc) for doc in docs],
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = gensim.models.CoherenceModel(model=model, texts=docs, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

# Function to randomly select messages from each topic for manual review
def random_message_selection(docs, best_model, dictionary, num_samples=5):
    topics = best_model.show_topics(num_words=10, formatted=False)
    for topic_id, _ in topics[:5]:  # Limit to top 5 topics
        topic_docs = [doc for doc in docs if any(topic_id == topic[0] for topic in best_model.get_document_topics(dictionary.doc2bow(doc), minimum_probability=0.1))]
        sampled_messages = random.sample(topic_docs, min(len(topic_docs), num_samples))
        print(f"\nTopic {topic_id}: Sample Messages for Review")
        for message in sampled_messages:
            print(' '.join(message))

# Function to save plots to a PDF
def save_plots_to_pdf(file_path, best_model, dictionary, top_clusters):
    pdf_file_name = f'{os.path.basename(file_path)}_report.pdf'
    with PdfPages(pdf_file_name) as pdf:
        for topic_id in top_clusters:
            # Word cloud
            topic_words = dict(best_model.show_topic(topic_id, topn=50))
            wordcloud = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(topic_words)
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.title(f"Topic {topic_id} Word Cloud for {os.path.basename(file_path)}")
            plt.axis("off")
            pdf.savefig()
            plt.close()

            # Bar chart
            sorted_topic_words = dict(sorted(topic_words.items(), key=lambda item: item[1], reverse=True)[:10])
            plt.figure(figsize=(10, 5))
            plt.bar(sorted_topic_words.keys(), sorted_topic_words.values())
            plt.title(f"Top Words in Topic {topic_id}")
            plt.xticks(rotation=45)
            pdf.savefig()
            plt.close()

            # Network graph
            words = list(sorted_topic_words.keys())
            word_pairs = list(combinations(words, 2))
            G = nx.Graph()
            G.add_edges_from(word_pairs)
            pos = nx.spring_layout(G)
            plt.figure(figsize=(10, 5))
            nx.draw(G, pos, with_labels=True, node_size=50, font_size=10, edge_color='grey')
            plt.title(f"Word Co-occurrence Network for Topic {topic_id}")
            pdf.savefig()
            plt.close()

    return pdf_file_name

# Process each file, perform topic modeling, and generate visualizations
for file_path in file_paths:
    print(f"Processing file: {file_path}")
    docs = load_data(file_path)
    # Remove any empty documents resulting from preprocessing
    docs = [doc for doc in docs if doc]

    if len(docs) > 0:
        # Create a dictionary of all words in documents
        dictionary = gensim.corpora.Dictionary(docs)
        # Remove extremes to focus on relevant words only
        dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=10000)
        vocab_length = len(dictionary)

        if vocab_length == 0:
            print("No vocabulary available after filtering extremes. Skipping.")
            continue

        # Evaluate coherence for different number of topics
        model_list, coherence_values = evaluate_coherence(dictionary, docs, limit=20)  # Increase to explore more topics

        # Select the model with the highest coherence
        best_model_index = coherence_values.index(max(coherence_values))
        best_model = model_list[best_model_index]

        # Random message selection for validation
        random_message_selection(docs, best_model, dictionary)

        # Visualize the topics using pyLDAvis
        try:
            lda_display = gensimvis.prepare(best_model, [dictionary.doc2bow(doc) for doc in docs], dictionary)
            pyLDAvis.display(lda_display)
        except Exception as e:
            print(f"Error in pyLDAvis visualization: {e}")

        # Generate and display word clouds for each of the top 5 topics
        sorted_clusters = sorted(range(best_model.num_topics), key=lambda k: sum([tup[1] for tup in best_model.get_topic_terms(k)]), reverse=True)
        top_clusters = sorted_clusters[:5]

        # Save plots to PDF
        pdf_file_name = save_plots_to_pdf(file_path, best_model, dictionary, top_clusters)

        # Provide a link to download the PDF file
        from google.colab import files
        files.download(pdf_file_name)
    else:
        print("Not enough data to build the model or all documents are empty.")
















Collecting git+https://github.com/rwalk/gsdmm.git
  Cloning https://github.com/rwalk/gsdmm.git to /tmp/pip-req-build-xo_1382o
  Running command git clone --filter=blob:none --quiet https://github.com/rwalk/gsdmm.git /tmp/pip-req-build-xo_1382o
  Resolved https://github.com/rwalk/gsdmm.git to commit 4ad1b6b6976743681ee4976b4573463d359214ee
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gsdmm
  Building wheel for gsdmm (setup.py) ... [?25l[?25hdone
  Created wheel for gsdmm: filename=gsdmm-0.1-py3-none-any.whl size=4586 sha256=f6cb912b0d38a4beab9effbd370080a9449da1a7d646f6672be2e5059096138c
  Stored in directory: /tmp/pip-ephem-wheel-cache-j9ktm7fo/wheels/da/d3/6e/a612d7cff0fcfb6470b8c113fc04931ecffb466ac19b9c5f3c
Successfully built gsdmm
Installing collected packages: gsdmm
Successfully installed gsdmm-0.1
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading func

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Midjourney - Chat - discussion [938713143759216720] (2022-01-01 to 2022-07-01)_anonymized.csv

Topic 8: Sample Messages for Review
yes head input place kinda process work
stuff author name
yes
yes
screenshot error

Topic 2: Sample Messages for Review
issue list status
someone edge case
upscaler update quality output part non version overpaint
term nfts case team
lot case stuff bit moss left patch moss everything moss

Topic 5: Sample Messages for Review
area tech place anyone trauma form therapy professional
license part
moderation policy stuff
init accuracy
dataset time part

Topic 9: Sample Messages for Review
limit maybe auto
stuff status channel deletedchannel thread
dalle network
channel thread website
test

Topic 11: Sample Messages for Review
coshmeewii button
way variation control
attribute seed matter
track seed seed element gpu variance nothwithstanding
ceo rainbow mode room button commads


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_PromptHero - PromptHero - üó®‚îÇgeneral [1030178486398234684] (2022-10-13 to 2023-01-01)_anonymized.csv

Topic 12: Sample Messages for Review

Topic 3: Sample Messages for Review

Topic 5: Sample Messages for Review
github page screencast
page pic
community page thanks love community
page
front page

Topic 10: Sample Messages for Review
platform site version history community discussion section example prompt setting
community openjourney
follower kind deal exposure everyone link community
everyone community everyone glad news something
community tab bias httpspromptherocomcommunity

Topic 8: Sample Messages for Review
sampler effect resolution effect time user
sampler heun slowness lot
yes sampler
extension version v103 support sampler httpsgithubcomhe1coautofillpromptherouploads
news version beta access parameter


  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Maze Guru - AI Art, Anime & Social - Chat - üí¨ÔΩúenglish-discussion [1037291441434677268] (2024-01-01 to 2024-03-01)_anonymized.csv

Topic 7: Sample Messages for Review

Topic 10: Sample Messages for Review
app app store
app web subgenres
point app website
access repair app
point web app point contest

Topic 9: Sample Messages for Review
people
people mod timezones someone mod kind
group people world way someone
people prompt link share
lot people sexy woman meat

Topic 5: Sample Messages for Review
example end degree head team line porn seed worm someone word pornography porn default mode
people mod timezones someone mod kind
group people world way someone
time someone aware
power ranger someone

Topic 0: Sample Messages for Review
prompt
top prompt
pant amount prompt
prompt
point lot premium prompt month


  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Maze Guru - AI Art, Anime & Social - Chat - üí¨ÔΩúenglish-discussion [1037291441434677268] (2022-11-01 to 2023-04-01)_anonymized.csv

Topic 4: Sample Messages for Review
creation game
team promising game tester nft project megood luck
fairy size
prompthttpsmediadiscordappnetattachments10168050080979395561081675367078690866znjpgwidth240height426 detail
something prompt pornography reason pornography body portrait woman body extreme detail face1 intricate detail lip face artstation focus illustration artgerm size body term pronography face problem software something clothes

Topic 16: Sample Messages for Review
btw priority theme entry user people variation scroll art variation thread people entry entry person balance
web app user
yes weightthe concern word time way word number girl
blocking issue system user problem website
user

Topic 6: Sample Messages for Review
guy download anyone solution save quality
so

  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_üëΩ WOMBOVERSE üëΩ - „Ää üëΩ„ÄãLOUNGE - üí¨‚îÇgeneral-chat [774124295524712480] (2024-01-01 to 2024-03-01)_anonymized.csv

Topic 0: Sample Messages for Review
creation
creation nfts credit dreamwombot origin art
hello member
mistake mod everyone
gif taxi driver joker date story breadthink

Topic 1: Sample Messages for Review
coolguy
app output bot prompting bot access style bot
look move
suggestion
guy bot

Topic 2: Sample Messages for Review
understand anything sorry sleep someone
dirt
duck
hang bot space
something face lol

Topic 3: Sample Messages for Review
hmmm somehow channel mistaken humidity something brain
year
cat house
garou job cap eye
blood test appointment doc call queue moment yes choice cholesterol chip salt everything crisp sodium

Topic 4: Sample Messages for Review
bean sausage
aliencool
lol pleasure
user
input history app cache android setting


  pdf.savefig()
  pdf.savefig()
  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_üëΩ WOMBOVERSE üëΩ - „Ää üëΩ„ÄãLOUNGE - üí¨‚îÇgeneral-chat [774124295524712480] (2020-11-01 to 2021-12-01)_anonymized.csv

Topic 0: Sample Messages for Review
moderation bot
armit
womboart bot
bro stop rule anything send nude example rule
lol template man

Topic 2: Sample Messages for Review
lot friend
colour amrit role
app friend
someone pls comfort
morning

Topic 3: Sample Messages for Review
cool
channel
processor lot work phone phone
link work
work

Topic 4: Sample Messages for Review
song couple premium
nickname
pfp blackface girl
money
source

Topic 5: Sample Messages for Review
passport type pic expression movement
amazon account country
app future
emulator bluestacks pic face
toe pic


  pdf.savefig()
  pdf.savefig()
  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Midjourney - Chat - discussion [938713143759216720] (2022-02-01 to 2022-04-01)_anonymized.csv

Topic 6: Sample Messages for Review
count nothing channel
anyone see
money tho fyi
cool tweet
thinking channel join anyone thought

Topic 2: Sample Messages for Review
feature request init clip stylization
style aaron griffin artist
case habit vaporwave temple cyberpunk temple god god salvador dali clip prompt style composition style compromise sentence example
feature website
yep backlog

Topic 4: Sample Messages for Review
user everyone rank helena ronjafman lucius chaos cody boy danielrussruss lurkwot crypto_iq300 eikyuu rainisto isaacq pancakes007 moultano heckpluto crung prankflamangos katrinatk graycrawford maxotronic somnai bentleyartdesign heavenslastangel gandamu dream ddickinson poler10 nicetry coltond cccpu
interface
try error log
everyones beta wave user wave people tool research group twitter page form 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_LimeWire - create AI Images, Audio & Video - Community - üè†Ô∏±general [1051607872695455884]_anonymized.csv

Topic 2: Sample Messages for Review
fee cost amount cover
problem website
nft price nowadays utility
item sale
fee

Topic 8: Sample Messages for Review
option object place
lol
pog name lol
nobody village rate
game

Topic 5: Sample Messages for Review
music
probabilityi guess
bluewillow read side creator music studio
music mnovies
face

Topic 4: Sample Messages for Review
premise bait product people business user subscriber platform bluewillow nothing
dope direction platform
support matter platform
admin lmwr distribution revenue
account subscription

Topic 3: Sample Messages for Review
subscribe link
water
right name logo info
front roadbaby girl hand
let someone chop rice grain


  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_PromptHero - PromptHero - üó®‚îÇgeneral [1030178486398234684]_anonymized.csv

Topic 5: Sample Messages for Review

Topic 2: Sample Messages for Review

Topic 3: Sample Messages for Review

Topic 6: Sample Messages for Review

Topic 4: Sample Messages for Review
Error in pyLDAvis visualization: Object of type complex is not JSON serializable


  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_LimeWire - create AI Images, Audio & Video - Community - üè†Ô∏±general [1051607872695455884] (1)_anonymized.csv

Topic 11: Sample Messages for Review
nothing mate
thx
something grain sand argument philosophy fun
fun
number nothing

Topic 0: Sample Messages for Review
slash command moment
cat haha
description
midjourn difference
charge future cost everything community

Topic 2: Sample Messages for Review
channel deletedchannel
strength weakness aim result deletedchannel
support
deletedchannel rookie channel command imagine
support acount

Topic 12: Sample Messages for Review
command word request word list fate ban deal situation
nsfw cat dragon
promptdiscussion channel
option
idkthis bot bot look

Topic 6: Sample Messages for Review
stylez_morales journey hope
art
community artist art
inspiration artist
way congestion issue


  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_r_StableDiffusion - SD Main - general [1031106064776712226] (2022-12-01 to 2023-02-01)_anonymized.csv

Topic 9: Sample Messages for Review

Topic 10: Sample Messages for Review
parameter seed time result something client
fix
process time developer engineer something min work day
matter fact sound something deeper others issue
anyone automatic1111 api

Topic 8: Sample Messages for Review
information tip user share vae file guy launcher
description page redshift render journey journey dreambooth
inpaint resolution option
storage dataset
luck lol idea someone

Topic 13: Sample Messages for Review
idk
repository info site youtube channel bit ton
channel guide example stuff older everything change minute httpswwwyoutubecomaitrepreneur
attention something anyone monopolization look monopolization card monopoly doe cuda something monopoly corporation utility service health authority example isp mobile carrier rogers

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_r_StableDiffusion - SD Main - general [1031106064776712226] (2024-01-01 to 2024-03-01)_anonymized.csv

Topic 8: Sample Messages for Review
point dlss3 frame gen benefit people world framerates
script size
difference betwen thibaud t2iadapter_xl_openpose
thereabouts default result size
latent size

Topic 0: Sample Messages for Review
version inpainting
version
change version point rocm start
market publicity publicity matter everything controversy people buying mtg
code something

Topic 5: Sample Messages for Review
directml library amd gpus equivalent
game lot scene sketch tech
load random step condition ksampler condition girl
specific example lighting time subject dark
year tour linux process detail side time

Topic 6: Sample Messages for Review
worry development branch discussion thread merge
alright love speedup cudnn library cudnn892 dir torch directory env change performance guidance
extension character

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>