In [2]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import LdaModel

In [3]:
# Load the data from CSV
data = pd.read_csv('twitter_training.csv')

In [4]:
# Preprocess the text data
def preprocess(text):
    return simple_preprocess(text, deacc=True)

In [5]:
# Tokenize the text
tokens = [preprocess(text) for text in data]

In [6]:
# Create a dictionary and a corpus
dictionary = Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

In [7]:
# Apply LDA model
num_topics = 3  # You can adjust the number of topics
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

In [8]:
# Interpret the results
for idx, topic in lda_model.print_topics():
    print("Topic: {} \nWords: {}".format(idx, topic))

Topic: 0 
Words: 0.108*"borderlands" + 0.108*"im" + 0.108*"murder" + 0.108*"will" + 0.108*"and" + 0.108*"getting" + 0.108*"you" + 0.108*"on" + 0.108*"all" + 0.027*"positive"
Topic: 1 
Words: 0.306*"positive" + 0.078*"borderlands" + 0.077*"all" + 0.077*"on" + 0.077*"you" + 0.077*"getting" + 0.077*"and" + 0.077*"im" + 0.077*"murder" + 0.077*"will"
Topic: 2 
Words: 0.306*"borderlands" + 0.077*"positive" + 0.077*"all" + 0.077*"on" + 0.077*"you" + 0.077*"getting" + 0.077*"and" + 0.077*"will" + 0.077*"murder" + 0.077*"im"


In [9]:
# Interpret the results and analyze language for each topic
for idx, topic in lda_model.print_topics():
    print("Topic {}: {}".format(idx, topic))
    print("Top words for this topic:")
    top_words = [word for word, _ in lda_model.show_topic(idx, topn=10)]  # Adjust topn as needed
    print(top_words)
    print("\n")

Topic 0: 0.108*"borderlands" + 0.108*"im" + 0.108*"murder" + 0.108*"will" + 0.108*"and" + 0.108*"getting" + 0.108*"you" + 0.108*"on" + 0.108*"all" + 0.027*"positive"
Top words for this topic:
['borderlands', 'im', 'murder', 'will', 'and', 'getting', 'you', 'on', 'all', 'positive']


Topic 1: 0.306*"positive" + 0.078*"borderlands" + 0.077*"all" + 0.077*"on" + 0.077*"you" + 0.077*"getting" + 0.077*"and" + 0.077*"im" + 0.077*"murder" + 0.077*"will"
Top words for this topic:
['positive', 'borderlands', 'all', 'on', 'you', 'getting', 'and', 'im', 'murder', 'will']


Topic 2: 0.306*"borderlands" + 0.077*"positive" + 0.077*"all" + 0.077*"on" + 0.077*"you" + 0.077*"getting" + 0.077*"and" + 0.077*"will" + 0.077*"murder" + 0.077*"im"
Top words for this topic:
['borderlands', 'positive', 'all', 'on', 'you', 'getting', 'and', 'will', 'murder', 'im']


