In [1]:
import pandas as pd
import re

# Read the tweets CSV into a DataFrame
tweets_df = pd.read_csv('../datasets/tweets_no_stopwords.csv')

# Normalise the text column in one pass: missing values become empty
# strings, then every entry is cast to str so later string operations
# never see NaN or non-string values.
tweets_df['cleaned_text_no_stopwords'] = (
    tweets_df['cleaned_text_no_stopwords']
    .fillna('')
    .astype(str)
)

# Define themes and their associated keywords
theme_keywords = {
    "Economy": ["tax", "money", "economy", "jobs", "wage"],
    "Families": ["families", "kids", "children", "parents"],
    "Foreign Policy": ["immigration", "foreign policy", "border"],
    "Women": ["women", "rights", "gender"]
}


def assign_themes(text):
    """Return every theme whose keywords occur in ``text``.

    Matching is a case-insensitive *substring* test, so a keyword also
    matches inside longer words (e.g. "tax" matches both "taxes" and
    "syntax").

    Args:
        text: Tweet text. Missing values (``None`` or a float NaN, which
            pandas uses for empty cells) are treated as empty text.

    Returns:
        A list of theme names in the order they appear in
        ``theme_keywords``, or ``["Other"]`` when no keyword matches.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ["Other"]

    # Lowercase once per call instead of once per keyword comparison.
    lowered = text.lower()
    assigned_themes = [
        theme
        for theme, keywords in theme_keywords.items()
        if any(keyword in lowered for keyword in keywords)
    ]
    # If no themes were matched, assign 'Other'
    return assigned_themes or ["Other"]

# Tag each tweet with the list of themes returned by assign_themes
# (a tweet can carry several themes at once)
tweet_text = tweets_df['cleaned_text_no_stopwords']
tweets_df['themes'] = tweet_text.apply(assign_themes)

# 'is_repeated' is 'Yes' when a tweet carries more than one theme, else 'No'
tweets_df['is_repeated'] = tweets_df['themes'].apply(
    lambda matched: 'No' if len(matched) <= 1 else 'Yes'
)

# Produce one row per tweet-theme pair by unpacking the theme lists
expanded_df = tweets_df.explode('themes')

# Write the expanded table (without the index column) for further
# analysis or use in Tableau
expanded_csv_path = '../datasets/tweets_with_themes_expanded.csv'
expanded_df.to_csv(expanded_csv_path, index=False)


Updated file saved at: tweets_with_themes_expanded.csv
