In [3]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import LdaModel

In [4]:
# Load the data from CSV
data = pd.read_csv('twitter_training.csv')

In [5]:
# Preprocess the text data
def preprocess(text):
    return simple_preprocess(text, deacc=True)

In [11]:
# Tokenize the text
tokens = [preprocess(text) for text in data]

In [12]:
# Create a dictionary and a corpus
dictionary = Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

In [13]:
# Apply LDA model
num_topics = 3  # You can adjust the number of topics
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

In [14]:
# Interpret the results
for idx, topic in lda_model.print_topics():
    print("Topic: {} \nWords: {}".format(idx, topic))

Topic: 0 
Words: 0.104*"positive" + 0.101*"borderlands" + 0.099*"and" + 0.099*"you" + 0.099*"getting" + 0.099*"will" + 0.099*"im" + 0.099*"all" + 0.099*"on" + 0.099*"murder"
Topic: 1 
Words: 0.104*"positive" + 0.101*"borderlands" + 0.099*"on" + 0.099*"and" + 0.099*"im" + 0.099*"murder" + 0.099*"will" + 0.099*"all" + 0.099*"getting" + 0.099*"you"
Topic: 2 
Words: 0.163*"borderlands" + 0.093*"you" + 0.093*"getting" + 0.093*"all" + 0.093*"will" + 0.093*"murder" + 0.093*"im" + 0.093*"and" + 0.093*"on" + 0.091*"positive"


In [15]:
# Interpret the results and analyze language for each topic
for idx, topic in lda_model.print_topics():
    print("Topic {}: {}".format(idx, topic))
    print("Top words for this topic:")
    top_words = [word for word, _ in lda_model.show_topic(idx, topn=10)]  # Adjust topn as needed
    print(top_words)
    print("\n")

Topic 0: 0.104*"positive" + 0.101*"borderlands" + 0.099*"and" + 0.099*"you" + 0.099*"getting" + 0.099*"will" + 0.099*"im" + 0.099*"all" + 0.099*"on" + 0.099*"murder"
Top words for this topic:
['positive', 'borderlands', 'and', 'you', 'getting', 'will', 'im', 'all', 'on', 'murder']


Topic 1: 0.104*"positive" + 0.101*"borderlands" + 0.099*"on" + 0.099*"and" + 0.099*"im" + 0.099*"murder" + 0.099*"will" + 0.099*"all" + 0.099*"getting" + 0.099*"you"
Top words for this topic:
['positive', 'borderlands', 'on', 'and', 'im', 'murder', 'will', 'all', 'getting', 'you']


Topic 2: 0.163*"borderlands" + 0.093*"you" + 0.093*"getting" + 0.093*"all" + 0.093*"will" + 0.093*"murder" + 0.093*"im" + 0.093*"and" + 0.093*"on" + 0.091*"positive"
Top words for this topic:
['borderlands', 'you', 'getting', 'all', 'will', 'murder', 'im', 'and', 'on', 'positive']


