In [None]:
import pandas as pd
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

# Load the tweet dataset from CSV.
tweets_df = pd.read_csv('../datasets/tweets_no_stopwords.csv')

# Normalise the text column in one pass: missing values become empty strings,
# then every entry is cast to str so the vectorizer only ever sees text.
tweets_df['cleaned_text_no_stopwords'] = (
    tweets_df['cleaned_text_no_stopwords'].fillna('').astype(str)
)

# Label each tweet by author: any handle containing "trump" (case-insensitive)
# is tagged 'Trump', every other handle is tagged 'Hillary'.
# NOTE(review): assumes every 'handle' value is a string; a missing handle
# would fail on .lower() - confirm against the dataset.
tweets_df['candidate'] = tweets_df['handle'].map(
    lambda handle: 'Hillary' if 'trump' not in handle.lower() else 'Trump'
)

# Convert the 'cleaned_text_no_stopwords' column into TF-IDF vectors.
# max_features=50 caps the vocabulary at 50 terms, so each tweet is
# represented by a vector of at most 50 TF-IDF weights.
vectorizer = TfidfVectorizer(max_features=50)
tfidf_matrix = vectorizer.fit_transform(tweets_df['cleaned_text_no_stopwords'])

# Use t-SNE to reduce the TF-IDF vectors to 2 dimensions for visualization.
# 'max_iter' is used instead of 'n_iter': scikit-learn 1.5 renamed the
# parameter and 1.7 removes the old name, so 'n_iter' would stop working
# on upgrade. random_state is pinned to make the run repeatable.
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=3000)

# fit_transform returns a sparse matrix above; densify it before handing it
# to t-SNE.
reduced_data = tsne.fit_transform(tfidf_matrix.toarray())

# Add the reduced data back to the dataframe for visualization:
# column 0 is the x coordinate, column 1 is the y coordinate.
tweets_df['tsne_x'] = reduced_data[:, 0]
tweets_df['tsne_y'] = reduced_data[:, 1]

# Build the interactive Plotly scatter: one point per tweet, positioned by its
# t-SNE coordinates and coloured by candidate.
# NOTE(review): hover_data expects a 'cleaned_text' column to be present in
# the loaded CSV - confirm against the dataset.
fig = px.scatter(
    tweets_df,
    x='tsne_x',
    y='tsne_y',
    color='candidate',
    color_discrete_map={'Trump': 'red', 'Hillary': 'blue'},
    hover_data=['cleaned_text', 'candidate'],
    labels={'tsne_x': 't-SNE Component 1', 'tsne_y': 't-SNE Component 2'},
    title='Scatter Plot of Tweets Using t-SNE',
    width=1920, height=1080,
)

# Small, semi-transparent markers keep dense regions readable.
fig.update_traces(
    selector={'mode': 'markers'},
    marker={'size': 6, 'opacity': 0.6},
)

# Layout tweaks applied in a single call: centred title, explicit axis titles,
# white template, hover on the closest point, and lasso as the drag tool.
fig.update_layout(
    template='plotly_white',
    title_x=0.5,
    xaxis_title='t-SNE Component 1',
    yaxis_title='t-SNE Component 2',
    hovermode='closest',
    dragmode='lasso',
)

# Export the figure as a standalone HTML page.
fig.write_html('../theme-selection-scatter-plot/scatter_plot_tnse.html')



'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.

