In [None]:
import pandas as pd
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

# Load the tweets whose text has already had stop words removed.
tweets_df = pd.read_csv('../datasets/tweets_no_stopwords.csv')

# Replace missing text with '' and coerce every value to str so that
# string operations on this column never see NaN or a non-string value.
text_column = 'cleaned_text_no_stopwords'
tweets_df[text_column] = tweets_df[text_column].fillna('').astype(str)

# Theme name -> lowercase keywords whose presence in a tweet assigns that theme.
theme_keywords = {
    "Economy": [
        "tax",
        "money",
        "economy",
        "jobs",
        "wage",
    ],
    "Families": [
        "families",
        "kids",
        "children",
        "parents",
    ],
    "Foreign Policy": [
        "immigration",
        "foreign policy",
        "border",
    ],
    "Women": [
        "women",
        "rights",
        "gender",
    ],
}

# Function to assign multiple themes based on the presence of keywords
def assign_themes(text, keywords_by_theme=None):
    """Return every theme whose keywords occur in ``text``.

    The text is lowercased before comparison; keywords are compared as
    given. Matching is by substring, so a keyword also matches inside a
    longer word (for example "tax" matches "taxes").

    Args:
        text: The string to classify.
        keywords_by_theme: Optional mapping of theme name to an iterable of
            keywords. When omitted, the module-level ``theme_keywords``
            mapping is used.

    Returns:
        A list of matching theme names, in the mapping's iteration order,
        or ``["Other"]`` when no keyword is found.
    """
    if keywords_by_theme is None:
        keywords_by_theme = theme_keywords

    # Lowercase once instead of once per keyword comparison.
    lowered = text.lower()
    matched = [
        theme
        for theme, keywords in keywords_by_theme.items()
        if any(keyword in lowered for keyword in keywords)
    ]
    # If no themes were matched, assign 'Other'
    return matched or ["Other"]

# Tag each tweet with the list of themes returned by assign_themes.
tweets_df['themes'] = tweets_df['cleaned_text_no_stopwords'].apply(assign_themes)

# One row per (tweet, theme) pair, so a tweet carrying several themes
# contributes to the average of each of them.
expanded_df = tweets_df.explode('themes')

# Mean retweets and likes per theme, with presentation-friendly column names.
engagement_summary = (
    expanded_df
    .groupby('themes')[['retweet_count', 'favorite_count']]
    .mean()
    .reset_index()
    .rename(columns={
        'themes': 'Theme',
        'retweet_count': 'Average Retweets',
        'favorite_count': 'Average Likes',
    })
)

print(engagement_summary)

# Grouped bar chart: one pair of bars (retweets, likes) per theme.
fig = px.bar(
    engagement_summary,
    x='Theme',
    y=['Average Retweets', 'Average Likes'],
    barmode='group',
    title='Average Engagement by Theme',
    labels={'value': 'Average Count', 'variable': 'Engagement Type'},
    width=800,
    height=600,
)
fig.show()

# Save the un-exploded frame: one row per tweet, with 'themes' holding
# that tweet's list of themes.
tweets_df.to_csv('../datasets/tweets_with_themes.csv', index=False)

            Theme  Average Retweets  Average Likes
0         Economy       3979.029491   10052.654155
1        Families       2850.070866    7540.877953
2  Foreign Policy       5207.383562   13788.780822
3           Other       4472.876237   11915.255801
4           Women       4324.062500   10349.011364


Multiple themes assigned and saved to 'tweets_with_themes.csv'.
