In [10]:
from collections import Counter
import pandas as pd
import re

# Read the cleaned tweets dataset into a DataFrame
tweets_df = pd.read_csv('tweets_cleaned.csv')

# Build the custom stopword set from the provided file: one entry per line,
# surrounding whitespace trimmed, blank lines dropped
with open('stopwords.txt') as file:
    custom_stopwords = set(filter(None, map(str.strip, file)))

# Function to remove stopwords from the cleaned text
def remove_stopwords(text, stopwords=None):
    """Return *text* with every stopword token removed.

    The text is split on whitespace, tokens found in the stopword
    collection are dropped, and the remaining tokens are re-joined with
    single spaces. Matching is exact (case-sensitive).

    Args:
        text: The text to filter. Non-string values (for example the NaN
            that pandas uses for an empty CSV field, or None) are treated
            as empty text.
        stopwords: Optional collection of tokens to drop. When omitted,
            the module-level ``custom_stopwords`` set is used.

    Returns:
        The filtered text as a single space-separated string; ``''`` when
        nothing remains or when *text* is not a string.
    """
    # Empty CSV fields are read back by pandas as float NaN, which has no
    # .split(); treat any non-string value as an empty tweet instead of
    # raising AttributeError in the middle of a DataFrame.apply().
    if not isinstance(text, str):
        return ''
    if stopwords is None:
        stopwords = custom_stopwords
    return ' '.join(word for word in text.split() if word not in stopwords)

# Strip stopwords from every row of cleaned_text and keep the result in a new column
cleaned_column = tweets_df['cleaned_text']
tweets_df['cleaned_text_no_stopwords'] = cleaned_column.apply(remove_stopwords)

# Collect every remaining token, row by row, into one flat list
all_words = []
for text in tweets_df['cleaned_text_no_stopwords']:
    all_words.extend(text.split())

# Tally how often each token occurs
word_counts = Counter(all_words)

# Turn the tally into a two-column table, most frequent words first
word_counts_df = pd.DataFrame(word_counts.items(), columns=['word', 'count'])
word_counts_df = word_counts_df.sort_values(by='count', ascending=False)

# Show the ten most frequent words as a quick sanity check
print("Top 10 most frequent words:", word_counts_df.head(10), sep='\n')

# Write the frequency table to CSV (without the index) for Tableau visualization
word_counts_df.to_csv('word_frequencies_no_stopwords.csv', index=False)
print("Word frequencies saved to 'word_frequencies_no_stopwords.csv'.")

# Write the tweets, including the stopword-free column, for further analysis in Tableau
tweets_df.to_csv('tweets_no_stopwords.csv', index=False)
print("Tokenized tweets without stopwords saved to 'tweets_no_stopwords.csv'.")


Top 10 most frequent words:
         word  count
612     women    148
92   families    147
425      jobs    112
187       job    112
402       tax     96
417   economy     84
560     fight     74
959      wall     71
364    united     67
264     woman     66
Word frequencies saved to 'word_frequencies_no_stopwords.csv'.
Tokenized tweets without stopwords saved to 'tweets_no_stopwords.csv'.
