SENTIMENT DISTRIBUTION OF IMDB REVIEWS

In [None]:
!pip install nltk vaderSentiment

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk

# Fetch the 'vader_lexicon' resource through NLTK's downloader.
# NOTE(review): the analyzer used in this notebook is imported from the
# vaderSentiment package, not from nltk — confirm this download is still needed.
nltk.download('vader_lexicon')


In [None]:
# Prompt for the CSV uploads only when running on Google Colab. On any other
# kernel the google.colab module cannot be imported, so the upload step is
# skipped and the later read_csv calls read the files from the working directory.
try:
    from google.colab import files
except ImportError:
    uploaded = {}  # nothing was uploaded outside Colab
else:
    uploaded = files.upload()


In [None]:
# Read both CSV inputs from the working directory into DataFrames.
MOVIES_CSV = "tmdb_5000_movies.csv"
REVIEWS_CSV = "IMDB Dataset.csv"

movies = pd.read_csv(MOVIES_CSV)
reviews = pd.read_csv(REVIEWS_CSV)


In [None]:
def label_compound(score, threshold=0.05):
    """Map a VADER compound score to a sentiment label.

    Args:
        score: Compound polarity value taken from ``polarity_scores(...)``.
        threshold: Scores at or above ``threshold`` are labelled 'positive';
            scores at or below ``-threshold`` are labelled 'negative'.

    Returns:
        'positive', 'negative', or 'neutral' (everything in between).
    """
    if score >= threshold:
        return 'positive'
    if score <= -threshold:
        return 'negative'
    return 'neutral'


# Rename the first two columns by position instead of overwriting the whole
# column index. Assigning a two-name list raises a length-mismatch error as
# soon as the frame carries extra columns (which this cell itself adds), so the
# original form could not be re-run. Rebinding `reviews` also avoids in-place
# mutation of the loaded frame.
reviews = reviews.rename(
    columns={reviews.columns[0]: 'review', reviews.columns[1]: 'sentiment'}
).dropna()

analyzer = SentimentIntensityAnalyzer()
compound_scores = reviews['review'].apply(
    lambda text: analyzer.polarity_scores(text)['compound']
)

# assign() overwrites the two derived columns on a re-run instead of failing.
reviews = reviews.assign(
    compound=compound_scores,
    vader_sentiment=compound_scores.apply(label_compound),
)


In [None]:
# Bar chart of how many reviews fall under each VADER sentiment label.
fig, ax = plt.subplots()
sns.countplot(data=reviews, x='vader_sentiment', palette='Set2', ax=ax)
ax.set_title('Sentiment Distribution of IMDB Reviews')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

Genre-wise Sentiment Analysis

In [None]:
# Print the column labels of both frames (movies first, then reviews).
for frame in (movies, reviews):
    print(frame.columns)


In [None]:
import json
import random


def genre_names(raw_genres):
    """Extract genre names from one cell of the movies 'genres' column.

    The cell is parsed as JSON and the 'name' field of every object in the
    resulting list is collected. The previous strip/replace/split approach
    cut such cells at every comma and produced fragments like '{id: 28' and
    'name: Action}' instead of genre names.
    NOTE(review): assumes each cell is a JSON array of objects carrying a
    'name' key — confirm against your copy of the CSV.

    Args:
        raw_genres: The raw cell value; non-string or blank values (for
            example NaN) yield an empty list.

    Returns:
        A list of genre name strings with surrounding whitespace removed.

    Raises:
        json.JSONDecodeError: If a non-blank string is not valid JSON.
    """
    if not isinstance(raw_genres, str) or not raw_genres.strip():
        return []
    parsed = json.loads(raw_genres)
    if not isinstance(parsed, list):
        return []
    return [
        entry['name'].strip()
        for entry in parsed
        if isinstance(entry, dict) and isinstance(entry.get('name'), str)
    ]


# Collect every distinct genre name that appears in the movies dataset.
all_genres = set()
for raw_genres in movies['genres']:
    all_genres.update(genre_names(raw_genres))

# Sorted so the list order is stable across runs; set iteration order is not,
# and the seeded draw below depends on it.
genre_list = sorted(filter(None, all_genres))

# Assign random genres to reviews just for genre-wise sentiment analysis.
# The two datasets share no key, so the genre is synthetic; a dedicated seeded
# generator makes the assignment reproducible without touching the global
# `random` state.
genre_rng = random.Random(42)
reviews['genre'] = [genre_rng.choice(genre_list) for _ in range(len(reviews))]


In [None]:
# Number of reviews for every (genre, VADER label) pair, pivoted so each
# sentiment label becomes a column; pairs that never occur become 0.
pair_counts = reviews.groupby(['genre', 'vader_sentiment']).size()
genre_sentiment = pair_counts.unstack().fillna(0)

# Stacked bars: one bar per genre, segmented by sentiment label.
ax = genre_sentiment.plot(kind='bar', stacked=True, figsize=(14, 6), colormap='Set3')
ax.set_title('Sentiment Distribution by Genre')
ax.set_xlabel('Genre')
ax.set_ylabel('Number of Reviews')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()


Predict Movie Success (Regression)

In [None]:
# Select the predictor columns together with the regression target (revenue).
feature_columns = ['budget', 'popularity', 'vote_average', 'vote_count', 'runtime', 'revenue']

# Drop rows with missing values first: NaN compares as "not equal to 0", so the
# zero filter below on its own lets NaN rows through to model fitting.
features = movies[feature_columns].dropna()

# Drop rows where any selected column is exactly zero.
features = features[(features != 0).all(axis=1)]

# Check data
features.head()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Revenue is the target; every other column of `features` is a predictor.
y = features['revenue']
X = features.drop(columns='revenue')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Random forest with 100 trees, seeded so repeated runs build the same model.
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)
print("R² Score:", r2_score(y_test, y_pred))


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# NOTE(review): this cell repeats the preceding training cell step for step
# (same frame, same split seed, same model seed); one of the two cells can
# likely be removed.

# Predictors are all columns except revenue; revenue is the target.
X, y = features.drop(columns=['revenue']), features['revenue']

# 80/20 train/test split with a fixed seed.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# fit() returns the estimator itself, so construction and fitting are chained.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Compute RMSE and R² Score
metrics = (
    ("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred))),
    ("R² Score:", r2_score(y_test, y_pred)),
)
for metric_label, metric_value in metrics:
    print(metric_label, metric_value)


In [None]:
# Horizontal bar chart of the fitted model's importance value for each predictor column.
fig, ax = plt.subplots()
importances = model.feature_importances_
feat_names = X.columns

sns.barplot(x=importances, y=feat_names, palette='coolwarm', ax=ax)
ax.set_title('Feature Importance for Revenue Prediction')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()


Word Clouds & Text Insights

In [None]:
!pip install wordcloud



In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Private copy of the library's stop-word set.
stopwords = set(STOPWORDS)

# Review texts split by the VADER label assigned earlier.
is_positive = reviews['vader_sentiment'].eq('positive')
is_negative = reviews['vader_sentiment'].eq('negative')
positive_reviews = reviews.loc[is_positive, 'review']
negative_reviews = reviews.loc[is_negative, 'review']

# One space-separated string per class, as input for the word-cloud generator.
pos_text = " ".join(positive_reviews)
neg_text = " ".join(negative_reviews)

# Both clouds share size and stop words; only the colouring differs.
cloud_options = dict(width=800, height=400, stopwords=stopwords)
wordcloud_pos = WordCloud(background_color='white', **cloud_options).generate(pos_text)
wordcloud_neg = WordCloud(background_color='black', colormap='Reds', **cloud_options).generate(neg_text)

# Side-by-side panels: positive on the left, negative on the right.
fig, axes = plt.subplots(1, 2, figsize=(16, 7))
panels = (
    (wordcloud_pos, "Positive Reviews Word Cloud"),
    (wordcloud_neg, "Negative Reviews Word Cloud"),
)
for ax, (cloud, heading) in zip(axes, panels):
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(heading, fontsize=16)

plt.tight_layout()
plt.show()
