In [None]:
#standard library
import re

#third-party
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from google.colab import drive
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from wordcloud import WordCloud

#mounts Google Drive so the files under /content/drive can be read
drive.mount('/content/drive')

#NLTK resources used by word_tokenize and the stopword list
nltk.download('punkt')
nltk.download('stopwords')

#merged tweet table; the cells below read its 'text' and 'party' columns
final_merged = pd.read_csv('/content/drive/MyDrive/Applied Deep Learning Poli Sci/Foley and Dorner Repo/Data/final_merged.csv')

#pre-trained word2vec model loaded from Drive
model = Word2Vec.load('/content/drive/MyDrive/Applied Deep Learning Poli Sci/Foley and Dorner Repo/Models/w2v100_tweets_model.model')

In [None]:
#stopwords I added after first iteration
#entries are lowercase because the tokenizers compare word.lower() against this set;
#the uppercase 'QT' / 'RT' spellings could never match, so retweet markers were kept
additional_stopwords = {'amp', 'qt', 'rt'}

#hashtags are stripped here so cleaned tokens and hashtags stay separate (tokenize_hashtags collects the hashtags)
def clean_and_tokenize(tweet):
    """Strip URLs, media filenames, hashtags and boilerplate words, then tokenize.

    Args:
        tweet: Raw tweet text. Any non-string value (for example NaN from an
            empty cell) produces an empty list instead of raising.

    Returns:
        list[str]: Alphanumeric tokens in their original case, with English
        stopwords and ``additional_stopwords`` removed case-insensitively.
    """
    if not isinstance(tweet, str):
        return []
    #removes urls
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    #removes hashtags; without this the tag text survives tokenization as an ordinary word
    tweet = re.sub(r'#\w+', '', tweet)
    #removes file extensions
    tweet = re.sub(r'\S+\.(jpg|jpeg|png|gif|mp4|vid|video|html|htm)', '', tweet)
    #removes some strings
    tweet = re.sub(r'\b(twitter|twimg|media|480x270)\b', '', tweet, flags=re.IGNORECASE)
    #tokenizes
    tokens = word_tokenize(tweet)
    #builds the stopword lookup once per tweet instead of once per token;
    #extra stopwords are lowercased so entries like 'RT' match the lowercased token
    blocked = set(stopwords.words('english'))
    blocked.update(entry.lower() for entry in additional_stopwords)
    #keeps alphanumeric tokens that are not stopwords
    return [word for word in tokens if word.isalnum() and word.lower() not in blocked]

def tokenize_hashtags(tweet):
    """Return the hashtags found in a tweet, without the leading '#'.

    Args:
        tweet: Raw tweet text. Any non-string value (for example NaN from an
            empty cell) produces an empty list instead of raising.

    Returns:
        list[str]: Hashtag texts in order of appearance and original case,
        with English stopwords and ``additional_stopwords`` removed
        case-insensitively.
    """
    if not isinstance(tweet, str):
        return []
    #builds the stopword lookup once per tweet instead of once per hashtag;
    #extra stopwords are lowercased so entries like 'RT' match the lowercased tag
    blocked = set(stopwords.words('english'))
    blocked.update(entry.lower() for entry in additional_stopwords)
    #extracts hashtags and drops the '#' symbol
    tags = (match[1:] for match in re.findall(r'#\w+', tweet))
    #removes stopwords
    return [tag for tag in tags if tag.lower() not in blocked]

#runs both tokenizers over the tweet text, storing one new column per tokenizer
for column_name, tokenizer in (('cleaned_tokens', clean_and_tokenize), ('hashtags', tokenize_hashtags)):
    final_merged[column_name] = final_merged['text'].apply(tokenizer)

#shows the raw text next to the two derived columns
print(final_merged[['text', 'cleaned_tokens', 'hashtags']])

In [None]:
#generates and shows a single word cloud figure
def plot_word_cloud(text, title, max_words=50):
    """Draw ``text`` as a word cloud and show it with matplotlib.

    Args:
        text: String of words to draw.
        title: Title placed above the figure.
        max_words: Upper limit on the number of words drawn (default 50).
    """
    #max_words has to be forwarded explicitly; otherwise WordCloud applies its own default limit
    wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=max_words).generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    #a word cloud has no meaningful axes
    plt.axis('off')
    plt.title(title)
    plt.show()

#token lists per party, selected once and reused below
rep_token_lists = final_merged.loc[final_merged['party'] == 'R', 'cleaned_tokens']
dem_token_lists = final_merged.loc[final_merged['party'] == 'D', 'cleaned_tokens']

#vocabulary seen in each party's tweets
republican_words = {word for tokens in rep_token_lists for word in tokens}
democrat_words = {word for tokens in dem_token_lists for word in tokens}

#words that appear for one party only
unique_republican_words = republican_words.difference(democrat_words)
unique_democrat_words = democrat_words.difference(republican_words)

#every token of each party joined into one string (repeats kept)
republican_text = ' '.join(' '.join(tokens) for tokens in rep_token_lists)
democrat_text = ' '.join(' '.join(tokens) for tokens in dem_token_lists)

#NOTE(review): each party-exclusive word appears exactly once in these strings, so the
#cloud presumably cannot size words by how often they were used - confirm this is intended
unique_republican_text = ' '.join(unique_republican_words)
unique_democrat_text = ' '.join(unique_democrat_words)

for cloud_text, cloud_title in (
    (unique_republican_text, 'Unique Words in Republican Tweets'),
    (unique_democrat_text, 'Unique Words in Democrat Tweets'),
):
    plot_word_cloud(cloud_text, cloud_title)

#hashtag lists per party, selected once and reused below
rep_hashtag_lists = final_merged.loc[final_merged['party'] == 'R', 'hashtags']
dem_hashtag_lists = final_merged.loc[final_merged['party'] == 'D', 'hashtags']

#distinct hashtags seen in each party's tweets
republican_hashtags = {tag for tags in rep_hashtag_lists for tag in tags}
democrat_hashtags = {tag for tags in dem_hashtag_lists for tag in tags}

#hashtags that appear for one party only
unique_republican_hashtags = republican_hashtags.difference(democrat_hashtags)
unique_democrat_hashtags = democrat_hashtags.difference(republican_hashtags)

#every hashtag of each party joined into one string (repeats kept)
republican_hashtag_text = ' '.join(' '.join(tags) for tags in rep_hashtag_lists)
democrat_hashtag_text = ' '.join(' '.join(tags) for tags in dem_hashtag_lists)

#party-exclusive hashtags, each listed once
unique_republican_hashtag_text = ' '.join(unique_republican_hashtags)
unique_democrat_hashtag_text = ' '.join(unique_democrat_hashtags)

for cloud_text, cloud_title in (
    (unique_republican_hashtag_text, 'Unique Hashtags in Republican Tweets'),
    (unique_democrat_hashtag_text, 'Unique Hashtags in Democrat Tweets'),
):
    plot_word_cloud(cloud_text, cloud_title)

In [None]:
from gensim import corpora
from gensim.models import LdaModel

#splits the tokenized tweets by party label (one token list per tweet)
party_labels = final_merged['party']
republican_tweets = final_merged.loc[party_labels == 'R', 'cleaned_tokens']
democrat_tweets = final_merged.loc[party_labels == 'D', 'cleaned_tokens']

#builds the vocabulary and the bag-of-words corpus for one group of tweets
def create_corpus_and_dictionary(tokenized_texts):
    """Build a gensim dictionary and the matching bag-of-words corpus.

    Args:
        tokenized_texts: Collection of token lists, one per document. It is
            iterated twice: once to build the dictionary, once to convert
            every document.

    Returns:
        tuple: ``(corpus, dictionary)`` where ``corpus`` is a list holding the
        ``doc2bow`` result of each document and ``dictionary`` is the
        ``corpora.Dictionary`` built from all documents.
    """
    vocabulary = corpora.Dictionary(tokenized_texts)
    bags_of_words = list(map(vocabulary.doc2bow, tokenized_texts))
    return bags_of_words, vocabulary

#bag-of-words corpus and vocabulary for each party
republican_corpus, republican_dictionary = create_corpus_and_dictionary(republican_tweets)
democrat_corpus, democrat_dictionary = create_corpus_and_dictionary(democrat_tweets)

#trains the LDA models
num_topics = 10
#settings shared by both models: same topic count and the same fixed random_state
lda_settings = {'num_topics': num_topics, 'random_state': 42}
republican_lda_model = LdaModel(corpus=republican_corpus, id2word=republican_dictionary, **lda_settings)
democrat_lda_model = LdaModel(corpus=democrat_corpus, id2word=democrat_dictionary, **lda_settings)

In [None]:
#function that displays the topics
def display_topics(model, num_words):
    """Print one line per topic listing that topic's top words.

    Args:
        model: Topic model exposing a ``num_topics`` attribute and a
            ``print_topics(num_topics=..., num_words=...)`` method that returns
            ``(topic_id, 'weight*"word" + weight*"word" ...')`` pairs.
        num_words: Number of top words to list for each topic.
    """
    #asks the model for its own topic count instead of reading a notebook-level variable,
    #so the function still works when that variable is missing or holds another value
    topics = model.print_topics(num_topics=model.num_topics, num_words=num_words)
    for topic_num, topic in topics:
        #each term looks like 0.012*"word"; keep only the text after the first '*'
        topic_words = [term.split('*', 1)[1].strip().strip('"') for term in topic.split(' + ')]
        print(f'Topic {topic_num}: {" ".join(topic_words)}')

#prints the top 15 words of every topic, first for the Republican model, then the Democrat one
for heading, lda_model in (
    ("Republican Topics:", republican_lda_model),
    ("\nDemocrat Topics:", democrat_lda_model),
):
    print(heading)
    display_topics(lda_model, 15)