In [4]:
import pandas as pd
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Load the cleaned tweets dataset
tweets_df = pd.read_csv('tweets_no_stopwords.csv')

# Normalise the text column in one pass: missing tweets become empty strings
# and every remaining value is coerced to str, so the vectorizer downstream
# only ever receives text.
tweets_df['cleaned_text_no_stopwords'] = (
    tweets_df['cleaned_text_no_stopwords'].fillna('').astype(str)
)

# Label each tweet by author. A handle containing "trump" (case-insensitive)
# is tagged 'Trump'; everything else -- including missing or non-string
# handles -- falls through to 'Hillary', mirroring the original else-branch.
# The vectorised .str pipeline is used instead of a per-row lambda because
# calling .lower() on a NaN handle (pandas reads empty CSV cells as float NaN)
# raises AttributeError.
is_trump_handle = (
    tweets_df['handle']
    .astype(str)
    .str.lower()
    .str.contains('trump', regex=False, na=False)
)
tweets_df['candidate'] = is_trump_handle.map({True: 'Trump', False: 'Hillary'})

# Step 1: Turn every cleaned tweet into a TF-IDF vector.
# The vocabulary is capped at 500 terms; max_features=6000 is the alternative
# setting that was left commented out in this cell.
documents = tweets_df['cleaned_text_no_stopwords']
vectorizer = TfidfVectorizer(max_features=500)
tfidf_matrix = vectorizer.fit_transform(documents)

# Step 2: Project the TF-IDF vectors onto two principal components so the
# tweets can be drawn on a 2-D plane. The matrix is converted to a dense
# array before being handed to PCA; random_state is pinned for repeatability.
dense_tfidf = tfidf_matrix.toarray()
pca = PCA(n_components=2, random_state=42)
reduced_data = pca.fit_transform(dense_tfidf)

# Attach the two component scores to the dataframe: after transposing, the
# first row holds component 1 and the second row holds component 2.
tweets_df['pca_x'], tweets_df['pca_y'] = reduced_data.T

# Step 3: Build an interactive Plotly scatter of the 2-D projection.
# The label and colour mappings are named up front so the figure call and the
# layout tweaks below share a single definition.
axis_labels = {'pca_x': 'Principal Component 1', 'pca_y': 'Principal Component 2'}
candidate_colors = {'Trump': 'red', 'Hillary': 'blue'}

fig = px.scatter(
    tweets_df,
    x='pca_x',
    y='pca_y',
    color='candidate',  # one colour per candidate label
    color_discrete_map=candidate_colors,
    hover_data=['cleaned_text'],  # presumably the tweet text; shown on hover
    title='Interactive Scatter Plot of Tweets',
    labels=axis_labels,
    width=1920,
    height=1080,
)

# Readability tweaks: small semi-transparent markers, a centred title,
# explicit axis titles and the light 'plotly_white' template.
marker_style = dict(size=6, opacity=0.6)
fig.update_traces(marker=marker_style)
fig.update_layout(
    hovermode='closest',
    title_x=0.5,
    xaxis_title=axis_labels['pca_x'],
    yaxis_title=axis_labels['pca_y'],
    template='plotly_white',
)

# Save the figure as an HTML file rather than displaying it inline
# (fig.show() would render it in the notebook instead).
fig.write_html('scatter_plot_pca.html')