# In [6]:
# Import necessary libraries
import pandas as pd
from sklearn.cluster import KMeans
from transformers import pipeline, AutoTokenizer, TFAutoModelForSequenceClassification
import matplotlib.pyplot as plt
import numpy as np

# Read each film's reviews from its own CSV file
barbie_reviews = pd.read_csv('barbie_reviews.csv')
oppenheimer_reviews = pd.read_csv('oppenheimer_reviews.csv')

# Stack both review sets into one frame; ignore_index=True renumbers the rows
# so the combined frame does not carry duplicate index labels.
review_frames = [barbie_reviews, oppenheimer_reviews]
reviews_df = pd.concat(review_frames, ignore_index=True)

# Initialize the tokenizer and model from Transformers.
# Both are loaded from the SAME checkpoint so the tokenizer's vocabulary and
# preprocessing are guaranteed to match what the fine-tuned classifier expects.
SENTIMENT_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_CHECKPOINT)
model = TFAutoModelForSequenceClassification.from_pretrained(SENTIMENT_CHECKPOINT)

# Sentiment analysis pipeline built from the matching model/tokenizer pair
sentiment_pipeline = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

# Apply sentiment analysis.
# The whole 'text' column is passed to the pipeline in one call, which returns
# one prediction dict per input in the same order. truncation=True is forwarded
# to the tokenizer so that reviews longer than the model's maximum input length
# are cut to fit instead of raising an error in the middle of the run.
review_texts = reviews_df['text'].tolist()
predictions = sentiment_pipeline(review_texts, truncation=True)
reviews_df['sentiment'] = [prediction['label'] for prediction in predictions]

# Convert sentiment to numerical for clustering: POSITIVE -> 1, anything else -> 0
reviews_df['sentiment_score'] = (reviews_df['sentiment'] == 'POSITIVE').astype(int)

# K-Means clustering based on sentiment scores.
# The only feature is the 0/1 sentiment_score column (kept as a one-column
# DataFrame); random_state is fixed so repeated runs use the same seed.
num_clusters = 2
score_features = reviews_df[['sentiment_score']]
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(score_features)
reviews_df['cluster'] = kmeans.labels_

# Visualization
# Bar chart of how many reviews received each sentiment label
plt.figure(figsize=(8, 6))
sentiment_counts = reviews_df['sentiment'].value_counts()
sentiment_counts.plot(kind='bar')
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

# Plot clustering results.
# x is the review's row index; y is uniform random noise that only spreads the
# points vertically so they do not overlap. Colour encodes the cluster label.
plt.figure(figsize=(8, 6))
vertical_jitter = np.random.rand(len(reviews_df))
plt.scatter(reviews_df.index, vertical_jitter, c=reviews_df['cluster'])
plt.title('Clustering of Sentiments')
plt.xlabel('Review Index')
plt.ylabel('Random Noise')
plt.show()


# Cell output: KeyboardInterrupt