In [4]:
!pip install git+https://github.com/rwalk/gsdmm.git
!pip install nltk spacy gensim pyLDAvis networkx wordcloud

import os
import pandas as pd
import numpy as np
import re
import random
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
import nltk
import spacy
import gensim
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import networkx as nx
from collections import Counter
from itertools import combinations
from matplotlib.backends.backend_pdf import PdfPages

# Load Spacy's English model for Named Entity Recognition
nlp = spacy.load("en_core_web_sm")

# Mount Google Drive to access the dataset stored there
from google.colab import drive
drive.mount('/content/drive')

# Download necessary NLTK resources for tokenization and POS tagging
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

general_stopwords = {
    'would', 'could', 'should', 'might', 'must', 'will', 'shall', 'can', 'may', 'wouldn', 'couldn', 'shouldn',
    'mightn', 'mustn', 'won', 'shan', 'can', 'mayn', 'just', 'don', 'didn', 'doesn', 'aren', 'isn', 'wasn',
    'weren', 'hasn', 'haven', 'hadn', 'does', 'did', 'don', 'does', 'did', 'don', 'now', 'then', 'once',
    'after', 'before', 'since', 'during', 'while', 'until', 'ago', 'yet', 'still', 'even', 'ever', 'always',
    'never', 'sometimes', 'often', 'usually', 'again', 'too', 'also', 'only', 'really', 'very', 'much',
    'more', 'most', 'many', 'several', 'few', 'some', 'any', 'each', 'every', 'all', 'both', 'either',
    'neither', 'anyone', 'everyone', 'someone', 'nobody', 'noone', 'nothing', 'anything', 'something',
    'everything', 'another', 'such', 'one', 'two', 'three', 'first', 'second', 'third', 'next', 'last',
    'same', 'other', 'different', 'new', 'old', 'young', 'long', 'short', 'high', 'low', 'large', 'small'
}

# Combine all sets of stop words
stop_words = set(stopwords.words('english')) | general_stopwords

# Define the preprocessing function
def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    # Remove all non-word characters and lower the text
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Split text into tokens
    tokens = text.split()
    # Apply NER to remove usernames and named entities
    doc = nlp(' '.join(tokens))
    tokens = [token.text for token in doc if not token.ent_type_]
    # POS ÁöÑ‰ΩçÁΩÆÊèêÂâç‰∫ÜÔºåÊú¨Êù•Âú®ÊúÄÂêé
    # Retain only nouns as they are significant for topic identification
    nouns = [word for word, pos in pos_tag(tokens) if pos.startswith('NN')]
    # Á≠õÈÄâÊú¨Êù• lemmatize Âíå len(word) > 2 ÊòØÂú®Âêå‰∏ÄË°åÁöÑÔºåÊúâÂèØËÉΩ lemmatize ‰πãÂêé word length Â∞è‰∫é 2ÔºåÊâÄ‰ª•ËøôÈáåÊääÈïøÂ∫¶Á≠õÈÄâÊîæÂú®ÂêéÈù¢‰∫Ü
    # Lemmatize tokens and remove stop words and short words
    nouns = [lemmatizer.lemmatize(word) for word in nouns]
    nouns = [word for word in nouns if word not in stop_words and len(word) > 2]
    return nouns

# Function to load and preprocess data from CSV files
def load_data(file_path):
    df = pd.read_csv(file_path)
    df['preprocessed'] = df['Content'].apply(preprocess)
    return df['preprocessed'].tolist()

# Path to the folder containing the dataset
folder_path = '/content/drive/My Drive/Discord dataset/Cleaned data/'
file_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]

# Function to evaluate coherence
def evaluate_coherence(dictionary, docs, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=[dictionary.doc2bow(doc) for doc in docs],
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = gensim.models.CoherenceModel(model=model, texts=docs, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

# Function to randomly select messages from each topic for manual review
def random_message_selection(docs, best_model, dictionary, num_samples=5):
    topics = best_model.show_topics(num_words=10, formatted=False)
    for topic_id, _ in topics[:5]:  # Limit to top 5 topics
        topic_docs = [doc for doc in docs if any(topic_id == topic[0] for topic in best_model.get_document_topics(dictionary.doc2bow(doc), minimum_probability=0.1))]
        sampled_messages = random.sample(topic_docs, min(len(topic_docs), num_samples))
        print(f"\nTopic {topic_id}: Sample Messages for Review")
        for message in sampled_messages:
            print(' '.join(message))

# Function to save plots to a PDF
def save_plots_to_pdf(file_path, best_model, dictionary, top_clusters):
    pdf_file_name = f'{os.path.basename(file_path)}_report.pdf'
    with PdfPages(pdf_file_name) as pdf:
        for topic_id in top_clusters:
            # Word cloud
            topic_words = dict(best_model.show_topic(topic_id, topn=50))
            wordcloud = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(topic_words)
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.title(f"Topic {topic_id} Word Cloud for {os.path.basename(file_path)}")
            plt.axis("off")
            pdf.savefig()
            plt.close()

            # Bar chart
            sorted_topic_words = dict(sorted(topic_words.items(), key=lambda item: item[1], reverse=True)[:10])
            plt.figure(figsize=(10, 5))
            plt.bar(sorted_topic_words.keys(), sorted_topic_words.values())
            plt.title(f"Top Words in Topic {topic_id}")
            plt.xticks(rotation=45)
            pdf.savefig()
            plt.close()

            # Network graph
            words = list(sorted_topic_words.keys())
            word_pairs = list(combinations(words, 2))
            G = nx.Graph()
            G.add_edges_from(word_pairs)
            pos = nx.spring_layout(G)
            plt.figure(figsize=(10, 5))
            nx.draw(G, pos, with_labels=True, node_size=50, font_size=10, edge_color='grey')
            plt.title(f"Word Co-occurrence Network for Topic {topic_id}")
            pdf.savefig()
            plt.close()

    return pdf_file_name

# Process each file, perform topic modeling, and generate visualizations
for file_path in file_paths:
    print(f"Processing file: {file_path}")
    docs = load_data(file_path)
    # Remove any empty documents resulting from preprocessing
    docs = [doc for doc in docs if doc]

    if len(docs) > 0:
        # Create a dictionary of all words in documents
        dictionary = gensim.corpora.Dictionary(docs)
        # Remove extremes to focus on relevant words only
        dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=10000)
        vocab_length = len(dictionary)

        if vocab_length == 0:
            print("No vocabulary available after filtering extremes. Skipping.")
            continue

        # Evaluate coherence for different number of topics
        model_list, coherence_values = evaluate_coherence(dictionary, docs, limit=20)  # Increase to explore more topics

        # Select the model with the highest coherence
        best_model_index = coherence_values.index(max(coherence_values))
        best_model = model_list[best_model_index]

        # Random message selection for validation
        random_message_selection(docs, best_model, dictionary)

        # Visualize the topics using pyLDAvis
        try:
            lda_display = gensimvis.prepare(best_model, [dictionary.doc2bow(doc) for doc in docs], dictionary)
            pyLDAvis.display(lda_display)
        except Exception as e:
            print(f"Error in pyLDAvis visualization: {e}")

        # Generate and display word clouds for each of the top 5 topics
        sorted_clusters = sorted(range(best_model.num_topics), key=lambda k: sum([tup[1] for tup in best_model.get_topic_terms(k)]), reverse=True)
        top_clusters = sorted_clusters[:5]

        # Save plots to PDF
        pdf_file_name = save_plots_to_pdf(file_path, best_model, dictionary, top_clusters)

        # Provide a link to download the PDF file
        from google.colab import files
        files.download(pdf_file_name)
    else:
        print("Not enough data to build the model or all documents are empty.")
















Collecting git+https://github.com/rwalk/gsdmm.git
  Cloning https://github.com/rwalk/gsdmm.git to /tmp/pip-req-build-6f79dikx
  Running command git clone --filter=blob:none --quiet https://github.com/rwalk/gsdmm.git /tmp/pip-req-build-6f79dikx
  Resolved https://github.com/rwalk/gsdmm.git to commit 4ad1b6b6976743681ee4976b4573463d359214ee
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gsdmm
  Building wheel for gsdmm (setup.py) ... [?25l[?25hdone
  Created wheel for gsdmm: filename=gsdmm-0.1-py3-none-any.whl size=4586 sha256=7f8c6a64ad1cde75e8e814cfcd4761c05ce31957b82789459c20ff99f0e3f4fe
  Stored in directory: /tmp/pip-ephem-wheel-cache-fhjf7v5w/wheels/da/d3/6e/a612d7cff0fcfb6470b8c113fc04931ecffb466ac19b9c5f3c
Successfully built gsdmm
Installing collected packages: gsdmm
Successfully installed gsdmm-0.1
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading func

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Midjourney - Chat - discussion [938713143759216720] (2022-01-01 to 2022-07-01)_anonymized.csv

Topic 16: Sample Messages for Review
people mind midjourney hahah sorry party
place picture frame haha
bug yeah
heck yeah
yeah bit face restoration photoshopping

Topic 13: Sample Messages for Review
aspect ratio
try size
image text
theory support amount text time response
size credit plan

Topic 10: Sample Messages for Review
thanks
thanks shot
yeah thanks website
sense thanks ancient
channel request kind goof

Topic 0: Sample Messages for Review
bet gallery discord option
word yes
people issue month option
word
list word

Topic 15: Sample Messages for Review
plan credit thinkin
job error memory image
credit trial upscale variation command credit credit
thing error problem render time relax mode thing people result
job menu work


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_PromptHero - PromptHero - üó®‚îÇgeneral [1030178486398234684] (2022-10-13 to 2023-01-01)_anonymized.csv

Topic 4: Sample Messages for Review
hey thought model base openjourney finetunes base complaint lot problem people
hey journey diffusion
hey course
hey model vaes gradient time button list custom model search tool download download hussle
hey franck idea dreambooth fine model image modelckpt image subject resource lot people service gpu process repo httpsgithubcomjoepennadreamboothstablediffusion repo guide training subreddit ton knowledge google colabs httpswwwredditcomrdreamboothcommentsxnycm0dreambooth_implementation_with_stable_diffusion

Topic 14: Sample Messages for Review
issue room guide link
yeah model card link colab httpscolabresearchgooglecomdrive1vkuxkkesynyi2olzm8mrwqcokqtsurmuspsharing
couple link colab hugging face
ran news stability model link transcript emad discord httpswwwredditcomrsta

  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Maze Guru - AI Art, Anime & Social - Chat - üí¨ÔΩúenglish-discussion [1037291441434677268] (2024-01-01 to 2024-03-01)_anonymized.csv

Topic 0: Sample Messages for Review
interface return
type result
detail
picture
time gore fluffy pony

Topic 1: Sample Messages for Review
dalee style credit
sorry link tolerance pedos screenshots
provide
let plane warhead
nsfw server post content thank


  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Maze Guru - AI Art, Anime & Social - Chat - üí¨ÔΩúenglish-discussion [1037291441434677268] (2022-11-01 to 2023-04-01)_anonymized.csv

Topic 10: Sample Messages for Review

Topic 3: Sample Messages for Review

Topic 8: Sample Messages for Review
thanks
thanks concern
thanks lotwe feedback team problem
thanks share
thanks bot

Topic 11: Sample Messages for Review
image consent artist
user choice command worry stay info discord
cause user link look creation user
art service artist artist life
user post number evaluation area worry

Topic 15: Sample Messages for Review
bot right server problem display manner
question mtgs norn propts diva featureless face headcrest result negative featureless face work cuz diva
result
usage right huggingface
upscaler result guy look


  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_üëΩ WOMBOVERSE üëΩ - „Ää üëΩ„ÄãLOUNGE - üí¨‚îÇgeneral-chat [774124295524712480] (2024-01-01 to 2024-03-01)_anonymized.csv

Topic 6: Sample Messages for Review
coffee aka robusta
course fault
thanks
thanks semester test scription
thank

Topic 1: Sample Messages for Review
point
game
grotesque cost character ill life
people marmite others
word post

Topic 2: Sample Messages for Review
mine designer lexica
app iphone problem superslow rendering lot error
mine
back
server room server bot server use room image

Topic 7: Sample Messages for Review
hey lou employee dream
hey command wombot2 channel
work skit
android app memory cache device device app login info
hello skit video command plan moment

Topic 9: Sample Messages for Review
bun facebook please
tommy
yeah bun chaos lord doubt desk
fun
suggestion team people look


  pdf.savefig()
  pdf.savefig()
  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_üëΩ WOMBOVERSE üëΩ - „Ää üëΩ„ÄãLOUNGE - üí¨‚îÇgeneral-chat [774124295524712480] (2020-11-01 to 2021-12-01)_anonymized.csv

Topic 4: Sample Messages for Review
portal escape message server rickroll people
role traffic team ping
thank
issue message user
optifine boosting mod lmao

Topic 3: Sample Messages for Review
search wombo
name
dam amrit name
chromebook
bit bruh

Topic 0: Sample Messages for Review
feedback
guy lobby self
hmmmmmm ill ask
register implementation template motor
thanks saturday

Topic 6: Sample Messages for Review
question wombo song war song
app emulator ldplayer bluestacks version app result httpsyoutubevgbycvwdina
app emulator ldplayer bluestacks version app result httpsyoutubevgbycvwdina
watermark
app emulator ldplayer bluestacks version app result httpsyoutubevgbycvwdina

Topic 10: Sample Messages for Review
cool
bit work lol
ill limit work lol
chug jug leviathan
chug jug leviathan


  pdf.savefig()
  pdf.savefig()
  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_Midjourney - Chat - discussion [938713143759216720] (2022-02-01 to 2022-04-01)_anonymized.csv

Topic 5: Sample Messages for Review
result work
issue gpus database bottleneck
benefit result shitpost channel
word wound word wound model
model resolution future

Topic 8: Sample Messages for Review
quirk image bot server image code method inbox pngs title
resolution type thing
yep stuff end bug
reference dream comment bot room haha
type video people quality

Topic 6: Sample Messages for Review
wait people image command
prompt guide message deletedchannel page
hackin uppercase result berk word token capital letter
word style others
yeah love contact bunch people social share credit community link right

Topic 7: Sample Messages for Review
artist
designer simple trick
people prompt generation
future idea game time variation target generation portrait character fun picking variation target image
beginning channel cou

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_LimeWire - create AI Images, Audio & Video - Community - üè†Ô∏±general [1051607872695455884]_anonymized.csv

Topic 0: Sample Messages for Review
auto mod discord limewire message
point developer api image creation day
sir
team type information date date delay explanation
wanna speraetion topic future system creator team

Topic 1: Sample Messages for Review
limers
image please answer thanks advance
limewire holder nft platform get pro status
atif
web3

Topic 2: Sample Messages for Review
guy image show server
lwolimewireoriginals server supply rarity
people server
zendesk support issue ticket
place weather degree

Topic 3: Sample Messages for Review
hey question ama
yeahh
help let limewire
studio image httpslimewirecomstudio
link image image img2img feature

Topic 4: Sample Messages for Review
increase ticket change
market
lot video icon change miss
reallyy
moon


  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_PromptHero - PromptHero - üó®‚îÇgeneral [1030178486398234684]_anonymized.csv

Topic 0: Sample Messages for Review
gimme midjourney please
prompt
hey access course mod power access access channel course
upload picture cracked marble
macaroni

Topic 1: Sample Messages for Review
artist story step dream goal concept artist game company gamma minus test lead artist concept artist crew game title bit book foundation stuff life life time program midjourney others head love program dream concept artist industry project game film medium imagination kid video game film feeling art generative program creativity company project instagram twitter adventure professional work studio
afternoon image interior warehouse bean
area website post nsfw content
copilot
hey project llm rag text2image prompt user prompt vectordb coder guy anybody amount prompt image vectordb touch number number prompt prompt gold


  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_LimeWire - create AI Images, Audio & Video - Community - üè†Ô∏±general [1051607872695455884] (1)_anonymized.csv

Topic 0: Sample Messages for Review
antobloom work
difference people rookie move branch channel
feature
man lol
hello guide deletedchannel thank

Topic 1: Sample Messages for Review
ngl lol
avatar image
point fee cost pocket
look lead
yes

Topic 2: Sample Messages for Review
perspective style model prompt prompt style
server message invite link
time error
imagine prompt water world

Topic 3: Sample Messages for Review
man
doofy server
mention function issue server
image image link prompt
use image way upload image discord imgur service

Topic 4: Sample Messages for Review
descriptives link
sorry mod info
organization
click image press browser link
command photo


  pdf.savefig()
  pdf.savefig()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_r_StableDiffusion - SD Main - general [1031106064776712226] (2022-12-01 to 2023-02-01)_anonymized.csv

Topic 9: Sample Messages for Review
dunno mind school quality duke voice
arch linux cuda xformers fine
grid plot script grid
art videogame character lol overwatch stuff existence skin trait character bit
art server rule

Topic 3: Sample Messages for Review
takedown regardless reason action
nope guess pic
message version torch version choice torchvision version system venv scratch extension dependency extension comment dependency developer knowledge python care issue
part embeds mix time
painting style picture color way haha

Topic 7: Sample Messages for Review
firespam sub moment mod post flooding
mcmonkey spaceship model subreddit httpshuggingfacecocorruptlakescifidiffusion
got github tos
embeds checkpoint
technology cause wave noooo

Topic 1: Sample Messages for Review
map time cursor algo get x10 cheeers 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing file: /content/drive/My Drive/Discord dataset/Cleaned data/cleaned_r_StableDiffusion - SD Main - general [1031106064776712226] (2024-01-01 to 2024-03-01)_anonymized.csv

Topic 13: Sample Messages for Review
shit
animation environment physic
default message tcmalloc startup
setup lcm otherwise default setup
animatediffsvd animation trade accuracy quality detailers allows animation line problem

Topic 10: Sample Messages for Review
tbh motion director net course
software engineering mode type software design mode user default setting template change theme selector mode hand user experience product testing software design concept pattern skill productivity career prospect principle design pattern design microservices software type bug reporting acceptance regression sanity permission rest api compatibility load testing security testing3 citation httpsstudycomacademylessondesignconceptsinsoftwareengineeringtypesexampleshtml
image web attention mask nose photoshop process denoise

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os
import pandas as pd
import numpy as np
import re
import random
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
import nltk
import spacy
import gensim
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import networkx as nx
from collections import Counter
from itertools import combinations
from matplotlib.backends.backend_pdf import PdfPages

# Load Spacy's English model for Named Entity Recognition
nlp = spacy.load("en_core_web_sm")

# Mount Google Drive to access the dataset stored there
from google.colab import drive
# drive.mount('/content/drive')

# Download necessary NLTK resources for tokenization and POS tagging
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

general_stopwords = {
    'would', 'could', 'should', 'might', 'must', 'will', 'shall', 'can', 'may', 'wouldn', 'couldn', 'shouldn',
    'mightn', 'mustn', 'won', 'shan', 'can', 'mayn', 'just', 'don', 'didn', 'doesn', 'aren', 'isn', 'wasn',
    'weren', 'hasn', 'haven', 'hadn', 'does', 'did', 'don', 'does', 'did', 'don', 'now', 'then', 'once',
    'after', 'before', 'since', 'during', 'while', 'until', 'ago', 'yet', 'still', 'even', 'ever', 'always',
    'never', 'sometimes', 'often', 'usually', 'again', 'too', 'also', 'only', 'really', 'very', 'much',
    'more', 'most', 'many', 'several', 'few', 'some', 'any', 'each', 'every', 'all', 'both', 'either',
    'neither', 'anyone', 'everyone', 'someone', 'nobody', 'noone', 'nothing', 'anything', 'something',
    'everything', 'another', 'such', 'one', 'two', 'three', 'first', 'second', 'third', 'next', 'last',
    'same', 'other', 'different', 'new', 'old', 'young', 'long', 'short', 'high', 'low', 'large', 'small'
}

# Combine all sets of stop words
stop_words = set(stopwords.words('english')) | general_stopwords

# Define the preprocessing function
def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    # Remove all non-word characters and lower the text
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Split text into tokens
    tokens = text.split()
    # Apply NER to remove usernames and named entities
    doc = nlp(' '.join(tokens))
    tokens = [token.text for token in doc if not token.ent_type_]
    # Retain only nouns as they are significant for topic identification
    nouns = [word for word, pos in pos_tag(tokens) if pos.startswith('NN')]
    if 'look' in tokens:
      print(text)
      print(list(pos_tag(tokens)))
    # Lemmatize tokens and remove stop words and short words
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 2]
    return nouns

# Function to load and preprocess data from CSV files
def load_data(file_path):
    df = pd.read_csv(file_path)
    df['preprocessed'] = df['Content'].apply(preprocess)
    return df['preprocessed'].tolist()

load_data('data.csv')

  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


the capitalism art earlier was pretty cool  interesting to see what ai thinks capitalism looks like
[('capitalism', 'NN'), ('art', 'NN'), ('earlier', 'RBR'), ('pretty', 'RB'), ('cool', 'JJ'), ('interesting', 'VBG'), ('see', 'NN'), ('think', 'VBP'), ('capitalism', 'NN'), ('look', 'VBP'), ('like', 'IN')]
if you have problem running command just look at deletedchannel
[('problem', 'NN'), ('running', 'VBG'), ('command', 'NN'), ('look', 'NN'), ('deletedchannel', 'NN')]
so can you please tell me how i can get the thing i want to actually look like what i want
[('please', 'VB'), ('tell', 'VB'), ('get', 'VB'), ('thing', 'NN'), ('want', 'VBP'), ('actually', 'RB'), ('look', 'VBP'), ('like', 'IN'), ('want', 'NN')]
i mean i just joined a few minutes ago but it looks to be free
[('mean', 'NN'), ('joined', 'VBD'), ('minute', 'JJ'), ('look', 'NN'), ('free', 'JJ')]
idkthis bot and midjourney bot look exactly the same
[('idkthis', 'NN'), ('bot', 'NN'), ('midjourney', 'NN'), ('bot', 'NN'), ('look', 'NN'

[[],
 [],
 ['capitalism', 'art', 'see', 'capitalism'],
 ['yeah', 'think'],
 ['art', 'competition', 'vote', 'piece', 'winner', 'card', 'thought'],
 ['heck', 'yeah'],
 ['member', 'discord'],
 ['sound'],
 ['fun'],
 [],
 [],
 ['server'],
 [],
 [],
 ['fun', 'friend'],
 ['awesome'],
 [],
 ['brain'],
 ['niche', 'focus'],
 ['focus', 'panda'],
 ['cat'],
 ['mine', 'thing'],
 ['enjoy', 'pixar', 'spin', 'thinfs'],
 ['niche', 'imagination', 'nt'],
 ['boundary'],
 ['word'],
 ['list', 'word', 'space', 'creation', 'ideation', 'balance'],
 ['way', 'room'],
 ['hey'],
 ['imagine'],
 ['channel'],
 ['thanks'],
 ['see', 'people', 'image', 'command'],
 ['find', 'people', 'service', 'ban'],
 ['make', 'sentence'],
 ['race', 'car', 'track', 'sir'],
 ['speech', 'business', 'service'],
 ['nt', 'term', 'service'],
 ['diaper', 'tos'],
 ['friend'],
 ['image', 'service'],
 ['please', 'refrain', 'nudity'],
 ['try', 'use', 'sense', 'prompt'],
 ['see', 'nudity', 'place', 'bro'],
 ['create', 'nudity', 'please', 'result']