In [None]:
# Movie Review Sentiment Analyzer

# This notebook demonstrates a sentiment analysis pipeline on a dataset of movie reviews.
# We apply natural language processing (NLP) techniques to classify reviews as positive or negative.

# **Key steps:**
# - Load and explore the dataset
# - Text preprocessing and feature extraction
# - Sentiment scoring with VADER
# - Model building and evaluation

# Libraries used include: `pandas`, `scikit-learn`, `NLTK`, `matplotlib`, `statsmodels`, and more.

# > Note: This notebook was originally created as part of my undergraduate coursework. The code and results are from an academic project and demonstrate foundational sentiment analysis techniques using classical tools.

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import matplotlib.ticker as ticker

# Statistical analysis
import statsmodels.api as sm
from scipy.stats import skew

# NLP
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# ML preprocessing
from sklearn.preprocessing import StandardScaler

# Load data
# NOTE(review): 'merged_file.csv' is the file written at the end of Part 1 further down in this
# notebook, so on a fresh kernel this line reads whatever copy an earlier run left on disk (or
# fails if there is none). Part 1 writes the file with index=False while this read uses
# index_col=0, which turns the first data column into the index -- TODO confirm that is intended.
df = pd.read_csv('merged_file.csv', index_col=0)

In [None]:
# READ IN CSV FILES AND MERGE
# Load the movie metadata and the critic reviews.
movie_info = pd.read_csv('movie_info.csv')
critic_reviews = pd.read_csv('CriticReviews_2018-2020.csv')
# Outer-join on the shared title column so rows that appear in only one file are kept;
# any other column name present in both files receives a '_movie' / '_review' suffix.
merged_df = movie_info.merge(
    critic_reviews,
    how='outer',
    on='review_object_title',
    suffixes=('_movie', '_review'),
)

merged_df.head()

In [None]:
# RENAME MovieID and MovieTitle
merged_df = merged_df.rename(columns={'Unnamed: 0': 'Movie_ID', 'review_object_title': 'Movie_Title'})

# MOVIE ID
# Overwrite Movie_ID with one number per distinct Movie_Title
# (ngroup() numbers the title groups from 0, hence the + 1)
merged_df['Movie_ID'] = merged_df.groupby('Movie_Title').ngroup() + 1
#merged_df.head()

In [None]:
# Measure how incomplete each row is.
# Boolean mask: True where a cell is NaN or holds the placeholder string 'Not available'
na_mask = merged_df.isna() | merged_df.isin(['Not available'])

# Number of missing cells in each row
na_counts = na_mask.sum(axis=1)

# Missing cells in each row as a percentage of the number of columns
na_percent = na_counts / len(merged_df.columns) * 100

# Store the percentage as a new column
merged_df['NA_Percent'] = na_percent
#merged_df

In [None]:
# Show the rows sorted by NA_Percent, most incomplete first.
# The sorted frame is only displayed; merged_df itself is not reordered.
merged_df.sort_values('NA_Percent', ascending=False)

In [None]:
# Keep only the rows in which at most 50% of the cells are missing; emptier rows do not carry
# enough information to be useful in a sentiment analysis.
# NOTE(review): the original remark here read "Not available of these columns contain the important
# review or reviewer rating" -- presumably "None of these rows contain ..."; confirm.
merged_df = merged_df[merged_df['NA_Percent'] <= 50]
merged_df
#319 rows have been removed.

In [None]:
# drop the NA_Percent helper column now that the sparse rows have been removed
merged_df = merged_df.drop('NA_Percent', axis=1)
#merged_df

# CHECK FOR NAs in columns
# Calculate the percentage of NaN values for each column
col_na_percentage = merged_df.isna().sum() / merged_df.shape[0] * 100
# Store the percentages as an extra row labelled -1, shift every index label up by one (the
# summary row becomes label 0) and sort by index so that it sits at the top of the DataFrame.
# NOTE(review): this summary row stays inside the data until it is dropped at the end of Part 1,
# so the skew / median calculations in the following cells include it -- confirm that is acceptable.
merged_df.loc[-1] = col_na_percentage.round(2)
merged_df.index = merged_df.index + 1
merged_df = merged_df.sort_index()
merged_df.head(2)

In [None]:
# NOTE: this cell used to be an exact copy of the previous one. Running it failed with a KeyError
# (the 'NA_Percent' column has already been dropped), and had it run it would have inserted a
# second NaN-percentage row that the clean-up at the end of Part 1 -- which removes only the
# first row -- would have left in the data. The per-column NaN percentages already sit in the
# first row of merged_df, so simply show them again.
merged_df.head(2)

In [None]:
# BUDGET
# 8.12% NA
# 'tmdb_budget' is dropped as a duplicate column and 'budget' gets its capitalised name
merged_df = merged_df.drop(columns='tmdb_budget').rename(columns={'budget': 'Budget'})
# skewness of the non-missing budgets
Var_skew = skew(merged_df['Budget'].dropna())
Var_skew
# skew is 2.72 so use median to impute NA's or 0's:
# zeros are first turned into NaN, then every NaN receives the median of the remaining values
budget_without_zeros = merged_df['Budget'].replace(0, np.nan)
median_budget = budget_without_zeros.median(skipna=True)
merged_df['Budget'] = budget_without_zeros.fillna(median_budget)
merged_df.head(100)

In [None]:
# RUNTIME
# 8.24% NA
# give the column its capitalised name
merged_df = merged_df.rename(columns={'runtime': 'Runtime'})
# skewness of the non-missing runtimes
Var_skew = skew(merged_df['Runtime'].dropna())
Var_skew
# skew is 3.46 so use median to impute NA's or 0's:
# zeros are first turned into NaN, then every NaN receives the median of the remaining values
runtime_without_zeros = merged_df['Runtime'].replace(0, np.nan)
median_runtime = runtime_without_zeros.median(skipna=True)
merged_df['Runtime'] = runtime_without_zeros.fillna(median_runtime)
merged_df.head(100)

In [None]:
# ORIGINAL LANGUAGE
# 8.12% NA
# rename column
merged_df = merged_df.rename(columns={'original_language': 'Original_Language'})
# impute NAs with Not available. The result is assigned back to the column: calling
# fillna(inplace=True) on a selected column acts on a temporary object, which is deprecated in
# pandas 2.x and leaves merged_df unchanged once Copy-on-Write is enabled.
merged_df['Original_Language'] = merged_df['Original_Language'].fillna('Not available')
# rename abbreviated values with full language name
# ('xx' and 'cn' are deliberately mapped to 'Not available')
language_map = {
    'Not available': 'Not available',
    'en': 'English',
    'zh': 'Chinese',
    'sv': 'Swedish',
    'es': 'Spanish',
    'de': 'German',
    'fr': 'French',
    'xx': 'Not available',
    'it': 'Italian',
    'ka': 'Georgian',
    'cs': 'Czech',
    'fa': 'Persian',
    'ro': 'Romanian',
    'ja': 'Japanese',
    'ar': 'Arabic',
    'id': 'Indonesian',
    'hu': 'Hungarian',
    'tl': 'Tagalog',
    'pl': 'Polish',
    'sw': 'Swahili',
    'no': 'Norwegian',
    'pt': 'Portuguese',
    'he': 'Hebrew',
    'vi': 'Vietnamese',
    'hi': 'Hindi',
    'ru': 'Russian',
    'af': 'Afrikaans',
    'cn': 'Not available',
    'ko': 'Korean',
    'tr': 'Turkish',
    'az': 'Azerbaijani',
    'uk': 'Ukrainian',
    'ga': 'Irish',
    'as': 'Assamese',
    'lv': 'Latvian',
    'th': 'Thai',
    'el': 'Greek',
    'da': 'Danish',
    'nl': 'Dutch',
    'st': 'Southern Sotho',
    'ky': 'Kyrgyz',
    'fi': 'Finnish',
    'is': 'Icelandic',
    'ak': 'Akan',
    'bn': 'Bengali',
    'ml': 'Malayalam',
    'hy': 'Armenian',
    'am': 'Amharic',
    'dz': 'Dzongkha',
    'si': 'Sinhala',
    'ln': 'Lingala',
    'ur': 'Urdu',
    'mn': 'Mongolian',
    'la': 'Latin',
    'te': 'Telugu',
    'bs': 'Bosnian',
    'bg': 'Bulgarian',
    'ca': 'Catalan',
    'kk': 'Kazakh',
    'ne': 'Nepali',
    'lt': 'Lithuanian',
    'ta': 'Tamil',
    'ms': 'Malay',
    'wo': 'Wolof',
    'eu': 'Basque',
    'pa': 'Punjabi',
    'mr': 'Marathi',
    'hr': 'Croatian',
    'mk': 'Macedonian',
    'sq': 'Albanian',
    'sr': 'Serbian'
}

# NOTE: Series.map turns every value that is not a key of language_map into NaN
merged_df['Original_Language'] = merged_df['Original_Language'].map(language_map)

In [None]:
# REVENUE
# 8.12% NA
# give the column its capitalised name
merged_df = merged_df.rename(columns={'tmdb_revenue': 'Revenue'})
# skewness of the non-missing revenues
Var_skew = skew(merged_df['Revenue'].dropna())
Var_skew
# skew is 4.83 so use median to impute NA's or 0's:
# zeros are first turned into NaN, then every NaN receives the median of the remaining values
revenue_without_zeros = merged_df['Revenue'].replace(0, np.nan)
median_revenue = revenue_without_zeros.median(skipna=True)
merged_df['Revenue'] = revenue_without_zeros.fillna(median_revenue)
merged_df.head(100)

In [None]:
# RELEASE DATE
# 8.36% NA
# rename column
merged_df = merged_df.rename(columns={'release_date': 'Release_Date'})
# impute NAs with Not available. The result is assigned back to the column: calling
# fillna(inplace=True) on a selected column acts on a temporary object, which is deprecated in
# pandas 2.x and leaves merged_df unchanged once Copy-on-Write is enabled.
merged_df['Release_Date'] = merged_df['Release_Date'].fillna('Not available')
merged_df['Release_Date'].head(100)

In [None]:
# POPULARITY
# 8.12% NA
# give the column its capitalised name
merged_df = merged_df.rename(columns={'tmdb_popularity': 'Popularity'})
# skewness of the non-missing popularity values
Var_skew = skew(merged_df['Popularity'].dropna())
print(Var_skew)
# skew is 17 so use median to impute NA's or 0's:
# zeros are first turned into NaN, then every NaN receives the median of the remaining values
popularity_without_zeros = merged_df['Popularity'].replace(0, np.nan)
median_popularity = popularity_without_zeros.median(skipna=True)
merged_df['Popularity'] = popularity_without_zeros.fillna(median_popularity)
# report the range after imputation
popularity = merged_df['Popularity']
print(f"Minimum value: {popularity.min()}")
print(f"Maximum value: {popularity.max()}")

In [None]:
# VOTE AVERAGE
# 8.12% NA
# give the column its capitalised name
merged_df = merged_df.rename(columns={'tmdb_vote_avg': 'Vote_Average'})
# skewness of the non-missing vote averages
Var_skew = skew(merged_df['Vote_Average'].dropna())
print(Var_skew)
# skew is -2.79 so use median to impute NA's or 0's:
# zeros are first turned into NaN, then every NaN receives the median of the remaining values
vote_average_without_zeros = merged_df['Vote_Average'].replace(0, np.nan)
median_vote_average = vote_average_without_zeros.median(skipna=True)
merged_df['Vote_Average'] = vote_average_without_zeros.fillna(median_vote_average)
# report the range after imputation
vote_average = merged_df['Vote_Average']
print(f"Minimum value: {vote_average.min()}")
print(f"Maximum value: {vote_average.max()}")

In [None]:
# VOTE COUNT
# 8.12% NA
# give the column its capitalised name
merged_df = merged_df.rename(columns={'tmdb_vote_count': 'Vote_Count'})
# skewness of the non-missing vote counts
Var_skew = skew(merged_df['Vote_Count'].dropna())
print(Var_skew)
# skew is 3.08 so use median to impute NA's or 0's:
# zeros are first turned into NaN, then every NaN receives the median of the remaining values
vote_count_without_zeros = merged_df['Vote_Count'].replace(0, np.nan)
median_vote_count = vote_count_without_zeros.median(skipna=True)
merged_df['Vote_Count'] = vote_count_without_zeros.fillna(median_vote_count)
# report the range after imputation
vote_count = merged_df['Vote_Count']
print(f"Minimum value: {vote_count.min()}")
print(f"Maximum value: {vote_count.max()}")

In [None]:
# TMDBID and INDEX
# drop both columns; the Movie_ID column created earlier in Part 1 serves as the movie identifier
merged_df.drop(['tmdbid', 'index'], axis=1, inplace=True)

In [None]:
# REVIEW ID
# 0 NAs
# rename column
merged_df = merged_df.rename(columns={'review_id': 'Review_ID'})
# store the ids as strings so that they are treated as labels rather than numbers
merged_df['Review_ID'] = merged_df['Review_ID'].astype(str)
# preview of the converted column (this expression is the cell's output)
merged_df['Review_ID'].head(100)

In [None]:
# CRITIC ID
# 0 NAs
# rename column
merged_df = merged_df.rename(columns={'critic_id': 'Critic_ID'})
# store the ids as strings so that they are treated as labels rather than numbers
merged_df['Critic_ID'] = merged_df['Critic_ID'].astype(str)
# preview of the converted column (this expression is the cell's output)
merged_df['Critic_ID'].head(100)

In [None]:
# CREATED DATE
# 0 NAs
# rename column to the capitalised naming used for the cleaned columns
merged_df = merged_df.rename(columns={'created_date': 'Created_Date'})
# convert from numeric to date time format
def convert_to_date(date_str):
    """Turn a YYYYMMDD value into an 'MM/DD/YYYY' string.

    Parameters
    ----------
    date_str : str, int or float
        The date written as YYYYMMDD, e.g. '20180315' or 20180315. Whole-number floats
        such as 20180315.0 (what an integer column becomes once it contains NaN) are
        accepted as well.

    Returns
    -------
    str
        The date formatted as 'MM/DD/YYYY', or 'Not available' when the value is missing
        (None / NaN / NaT), empty, or cannot be parsed as YYYYMMDD.
    """
    # Missing values are handled explicitly rather than relying on an exception further down.
    if date_str is None or pd.isna(date_str) or date_str == '':
        return 'Not available'
    # 20180315.0 would otherwise be read as the text '20180315.0', which does not match the format.
    if isinstance(date_str, float) and date_str.is_integer():
        date_str = str(int(date_str))
    try:
        return pd.to_datetime(date_str, format='%Y%m%d').date().strftime('%m/%d/%Y')
    except ValueError:
        return 'Not available'
# apply convert_to_date to every value; the column then holds 'MM/DD/YYYY' strings or 'Not available'
merged_df['Created_Date'] = merged_df['Created_Date'].apply(convert_to_date)
merged_df['Created_Date'].head(100)

In [None]:
# PUBLISHED DATE
# 0 NAs
# rename column
merged_df = merged_df.rename(columns={'pub_date': 'Published_Date'})
# reuse convert_to_date (defined for Created_Date) to turn the YYYYMMDD values into
# 'MM/DD/YYYY' strings, with 'Not available' for anything that cannot be parsed
merged_df['Published_Date'] = merged_df['Published_Date'].apply(convert_to_date)
merged_df['Published_Date'].head(100)

In [None]:
# CONTENT
# 0.01% NA
# rename column
merged_df = merged_df.rename(columns={'content': 'Content'})
# impute NAs with Not available. The result is assigned back to the column: calling
# fillna(inplace=True) on a selected column acts on a temporary object, which is deprecated in
# pandas 2.x and leaves merged_df unchanged once Copy-on-Write is enabled.
merged_df['Content'] = merged_df['Content'].fillna('Not available')
# alternative that was considered: drop the rows without a review text instead
# merged_df = merged_df.dropna(subset=['Content'])
merged_df['Content'].head(100)

In [None]:
# PUBLISHER
# 0 NAs
# rename column; no other cleaning is applied to it here
merged_df = merged_df.rename(columns={'publisher': 'Publisher'})
merged_df['Publisher'].head(100)

In [None]:
# REVIEWER RATING ROTTEN
# 0 NAs
# drop the column: according to the original note every value is TRUE, so it cannot
# distinguish between reviews
merged_df.drop('reviewer_rating_rotten', axis=1, inplace=True)

In [None]:
# rename column; the cells below convert its mixed rating formats to a common scale
merged_df = merged_df.rename(columns={'reviewer_rating_actual': 'Reviewer_Rating_Actual'})

In [None]:
# convert letter grades to standardized score
def convert_grade(grade):
    """Convert a letter grade into a standardized score between -1 and 1.

    The thirteen grades 'F', 'D-', 'D', ..., 'A', 'A+' are spaced evenly over the range:
    'F' -> -1.0, 'C+' (the middle grade) -> 0.0, 'A+' -> 1.0.

    Parameters
    ----------
    grade : any
        A rating value. Only strings that exactly match one of the letter grades are converted.

    Returns
    -------
    float or the input
        The standardized score for a letter grade; every other value is returned unchanged.
    """
    letter_grades = ['F', 'D-', 'D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+']
    if isinstance(grade, str) and grade in letter_grades:
        # Half the width of the scale (6 steps). Dividing by it keeps the result inside [-1, 1];
        # the previous divisor of 5 produced -1.2 for 'F' and 1.2 for 'A+', values that the later
        # range filter on the rating column discards.
        half_span = (len(letter_grades) - 1) / 2
        return (letter_grades.index(grade) - half_span) / half_span
    return grade
# apply convert_grade to every rating; values that are not exact letter grades pass through unchanged
merged_df['Reviewer_Rating_Actual'] = merged_df['Reviewer_Rating_Actual'].apply(convert_grade)

In [None]:
# Count the non-digit fragments (e.g. "A minus") found in the ratings that are still text, to see
# which spellings are common enough to be mapped to a proper letter grade.
# The pattern is a raw string so that \D reaches the regex engine as written; in a normal string
# literal '\D' is an invalid escape sequence, which newer Python versions warn about.
non_numeric_counts = merged_df['Reviewer_Rating_Actual'].str.extractall(r'(\D+)')[0].value_counts()
print(non_numeric_counts)

In [None]:
#these are the scores that have 5 or more cases
#going to impute them to letter score
    #B-plus          51
    #B-minus         41
    #C-plus          32
    #A-minus         15
    #B               12 - spacing issue i believe
    #A minus         10
    #C-minus          8
    #B plus           6
    #B +              5
    #C                5 

# spelled-out or mis-spaced grade -> proper letter grade
grade_spellings = {
    'B-plus': 'B+', 'B plus': 'B+', 'B +': 'B+',
    'B-minus': 'B-',
    'C-plus': 'C+',
    'A-minus': 'A-', 'A minus': 'A-',
    'B ': 'B',
    'C-minus': 'C-',
    'C ': 'C',
}
merged_df['Reviewer_Rating_Actual'] = merged_df['Reviewer_Rating_Actual'].replace(grade_spellings)
# convert_grade was applied to the column before these spellings were normalised, so the letter
# grades produced just above are still text. Convert them now; otherwise they would be coerced to
# NaN when the column is made numeric. Values that are already numbers pass through unchanged.
merged_df['Reviewer_Rating_Actual'] = merged_df['Reviewer_Rating_Actual'].apply(convert_grade)

In [None]:
# convert scales to standardized score
def convert_scale(score):
    """Convert a ratio rating such as '4/5' or '8/10' into a standardized score.

    The ratio numerator / denominator is rescaled as ratio * 2 - 1, so 0/N maps to -1 and
    N/N maps to 1.

    Parameters
    ----------
    score : any
        A rating value. Only strings of the form '<number>/<number>' are converted.

    Returns
    -------
    float or the input
        The standardized score, or the input unchanged when it is not a string, does not have
        exactly one '/', has a non-numeric part, or has a zero denominator.
    """
    if isinstance(score, str) and '/' in score:
        parts = score.split('/')
        if len(parts) == 2:
            try:
                num = float(parts[0])
                denom = float(parts[1])
            except ValueError:
                # e.g. 'N/A': not a numeric ratio. Return it untouched instead of aborting the
                # whole column conversion.
                return score
            if denom != 0:
                return (num / denom) * 2 - 1
    return score
# apply convert_scale to every rating; values it does not convert pass through unchanged
merged_df['Reviewer_Rating_Actual'] = merged_df['Reviewer_Rating_Actual'].apply(convert_scale)
merged_df.head()

In [None]:
# convert date-like ratings to standardized score
def convert_date_score(score):
    """Convert a rating that looks like a date (e.g. '3/5/2018') into a standardized score.

    Strings containing more than one '/' are treated as 'numerator/denominator/<rest>': the
    first two parts form the ratio, which is rescaled as ratio * 2 - 1.

    Parameters
    ----------
    score : any
        A rating value. Only strings with at least two '/' characters are converted.

    Returns
    -------
    float, str or the input
        The standardized score; 'Not available' when the denominator is 0 or either of the first
        two parts is not a number; the input unchanged for everything else.
    """
    if isinstance(score, str) and score.count('/') > 1:
        first_two = score.split('/', maxsplit=2)[:2]  # keep only first two elements
        try:
            numerator, denominator = map(float, first_two)
        except ValueError:
            # e.g. a URL such as 'http://...': has two slashes but no numeric ratio
            return 'Not available'
        if denominator != 0:
            score = (numerator / denominator) * 2 - 1
        else:
            score = 'Not available'
    return score
# apply convert_date_score to every rating; only strings with more than one '/' are affected
merged_df['Reviewer_Rating_Actual'] = merged_df['Reviewer_Rating_Actual'].apply(convert_date_score)
# show the frame as the cell's output
merged_df

In [None]:
# Final name for the standardized rating column
merged_df = merged_df.rename(columns={'Reviewer_Rating_Actual': 'Reviewer_Rating'})
# 'reviewer rating' is dropped because it is empty
merged_df = merged_df.drop(columns='reviewer rating')
# Anything that is still not a number (unconverted text) becomes NaN
numeric_rating = pd.to_numeric(merged_df['Reviewer_Rating'], errors='coerce').astype(float)
# Keep only values inside [-1, 1]; everything else (including NaN) ends up as NaN
rating_in_range = numeric_rating.where(numeric_rating.between(-1, 1))
merged_df['Reviewer_Rating'] = rating_in_range.round(3)
# Number of ratings that are missing after the conversion
merged_df['Reviewer_Rating'].isna().sum()
#left with about 36,000 NAs

In [None]:
# The code below imputes NAs with the median. It is held off until phase 2, because missing reviewer ratings will be replaced with sentiment scores instead.
# calculate the skew
#reviewer_rating_skew = skew(merged_df['Reviewer_Rating'].dropna())
#print(reviewer_rating_skew)
# skew is -0.54 so use median score to impute NAs
#median_reviewer_rating = merged_df['Reviewer_Rating'].median(skipna=True)
#merged_df['Reviewer_Rating'] = merged_df['Reviewer_Rating'].fillna(median_reviewer_rating)
# round column to 2 decimals
#merged_df['Reviewer_Rating'] = merged_df['Reviewer_Rating'].round(2)
#merged_df.head(100)

In [None]:
# TMETER
# 0 NAs
# rename column
merged_df = merged_df.rename(columns={'tmeter': 'Tomato_Meter'})
# calculate the skew of the 'Tomato_Meter' column
tmeter_skew = skew(merged_df['Tomato_Meter'].dropna())
print(tmeter_skew)
# skew is -2.7 so use median to impute NA's
# NOTE(review): the replace below treats a Tomato_Meter of 0 as missing and overwrites it with the
# median. If 0 can be a genuine score, those ratings are lost -- confirm that 0 means "no data".
merged_df['Tomato_Meter'] = merged_df['Tomato_Meter'].replace(0, np.nan)
median_tomato_meter = merged_df['Tomato_Meter'].median(skipna=True)
merged_df['Tomato_Meter'] = merged_df['Tomato_Meter'].fillna(median_tomato_meter)
merged_df['Tomato_Meter'].head(100)

In [None]:
# REVIEW SRC URL and REVIEW OBJECT TYPE and REVIEW OBJECT HREF
# 0 NA for all three
# drop these columns because they do not provide value for the analysis
merged_df = merged_df.drop(['review_src_url', 'review_object_type', 'review_object_href'], axis=1)

In [None]:
# PUBLICATION
# 0 NA
# rename column
merged_df = merged_df.rename(columns={'publication': 'Publication'})
# store the values as strings
merged_df['Publication'] = merged_df['Publication'].astype(str)
merged_df['Publication'].head(100)

In [None]:
# REVIEW OBJECT YEAR
# 0 NAs
# rename column
merged_df = merged_df.rename(columns={'review_object_year': 'Review_Object_Year'})
# store the year as a string so that it is treated as a label rather than a number
merged_df['Review_Object_Year'] = merged_df['Review_Object_Year'].astype(str)
merged_df['Review_Object_Year'].head(100)

In [None]:
# CRITIC NAME
# 0 NAs
# rename column; no other cleaning is applied to it
merged_df = merged_df.rename(columns={'critic_name': 'Critic_Name'})

In [None]:
# ROI
# create column to describe net profit as percentage
# ROI = ((Revenue - Budget) / Budget) * 100
# NOTE: Revenue and Budget had their zeros and NaNs replaced by the column median earlier in
# Part 1, so for those rows the ROI is computed from imputed values.

merged_df['ROI'] = ((merged_df['Revenue'] - merged_df['Budget']) / merged_df['Budget']) * 100

In [None]:
# MONTH

# parse the release date; values that cannot be parsed (such as the 'Not available'
# placeholder filled in earlier) become NaT because of errors='coerce'
merged_df['Release_Date'] = pd.to_datetime(merged_df['Release_Date'], errors='coerce')
# create a new column for the month name
merged_df['Month'] = merged_df['Release_Date'].dt.month_name()

In [None]:
# remove first row showing percentage NAs (the summary row inserted earlier in Part 1)
merged_df = merged_df.drop(merged_df.index[0])

# WRITING THE DATAFRAME TO A NEW CSV FILE
# NOTE(review): the file is written without the index, while the notebook's first code cell reads
# it back with index_col=0, i.e. uses the first data column as the index -- confirm that is intended.
merged_df.to_csv('merged_file.csv', index=False)

In [None]:
# show the cleaned frame as the cell's output
merged_df

In [None]:
# END OF PART 1
# START OF PART 2
# Reload the data that Part 1 has just written. `df` is first loaded in the notebook's opening
# code cell, i.e. before Part 1 regenerates 'merged_file.csv', so without this reload Part 2 would
# analyse whatever copy of the file a previous run left on disk. Same call as that first load.
df = pd.read_csv('merged_file.csv', index_col=0)

In [None]:
# list the columns available for the exploratory analysis
df.columns

In [None]:
# Histogram of 'Revenue', with the x axis displayed in billions
def _as_billions(x, pos):
    """Tick formatter: show a raw revenue value as a number of billions."""
    return f'{x/1e9:g}'

fig, ax = plt.subplots(figsize=(8,6))
ax.hist(df['Revenue'], bins=20, edgecolor='black')
ax.xaxis.set_major_formatter(FuncFormatter(_as_billions))
ax.set_xlim(0, 3e9)
ax.set_xlabel('Revenue (in billions)')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Revenue')
plt.show()

In [None]:
# Boxplot of revenue. The values are divided by 1e6 so that the axis really is in millions,
# as its label says (previously the raw values were plotted under that label).
(df['Revenue'] / 1e6).plot(kind='box')
plt.ylabel('Revenue (in millions)')
plt.title('Boxplot of Revenue')
plt.show()

In [None]:
# Scatter plot of revenue against budget
x = df['Budget']
y = df['Revenue']

# Both axes are labelled "in billions", so plot the values divided by 1e9
# (previously the raw values were plotted under those labels).
plt.scatter(x / 1e9, y / 1e9, alpha=0.5)
plt.title('Revenue vs. Budget')
plt.xlabel('Budget (in billions)')
plt.ylabel('Revenue (in billions)')
plt.show()

In [None]:
# Replace implausible runtimes (exactly 0, or longer than 200) with the median runtime.
# The median is taken once, before any value is replaced.
median_runtime = df['Runtime'].median()
implausible_runtime = (df['Runtime'] == 0) | (df['Runtime'] > 200)
df.loc[implausible_runtime, 'Runtime'] = median_runtime

In [None]:
# Scatter plot of revenue against runtime
x = df['Runtime']
y = df['Revenue']

# The y axis is labelled in billions, so plot revenue divided by 1e9
# (previously the raw values were plotted under that label).
plt.scatter(x, y / 1e9, alpha=0.5)
plt.title('Revenue vs. Runtime')
plt.xlabel('Runtime')
plt.ylabel('Revenue in Billions')
plt.show()

In [None]:
# Scatter plot of revenue against vote average
x = df['Vote_Average']
y = df['Revenue']

# The y axis is labelled in billions, so plot revenue divided by 1e9
# (previously the raw values were plotted under that label).
plt.scatter(x, y / 1e9, alpha=0.5)
plt.title('Revenue vs. Vote_Average')
plt.xlabel('Vote_Average')
plt.ylabel('Revenue in Billions')
plt.show()

In [None]:
# Scatter plot of revenue against vote count
x = df['Vote_Count']
y = df['Revenue']

# The y axis is labelled in billions, so plot revenue divided by 1e9
# (previously the raw values were plotted under that label).
plt.scatter(x, y / 1e9, alpha=0.5)
plt.title('Revenue vs. Vote_Count')
plt.xlabel('Vote_Count')
plt.ylabel('Revenue in Billions')
plt.show()

In [None]:
# Mean revenue of the rows in each release month (a Series indexed by month)
monthly_revenue = df.groupby('Month')['Revenue'].mean()

# The same numbers as a two-column table: 'Month' and 'Average Revenue'
monthlyrevenue = (
    monthly_revenue
    .rename('Average Revenue')
    .rename_axis('Month')
    .reset_index()
)

In [None]:
# Bar chart of the average revenue per release month, in calendar order.
# (pandas, numpy and matplotlib are already imported in the notebook's import cell, so the
# imports that used to be repeated here have been removed.)

# One colour for all bars: 'C0' is the default blue in Matplotlib
colors = 'C0'

# Make 'Month' an ordered categorical so that sorting follows the calendar, not the alphabet
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December']
cat_dtype = pd.api.types.CategoricalDtype(categories=month_order, ordered=True)
monthlyrevenue['Month'] = monthlyrevenue['Month'].astype(cat_dtype)

# Sort the dataframe by the 'Month' column
monthlyrevenue = monthlyrevenue.sort_values('Month')

# Draw the bars
fig, ax = plt.subplots()
ax.bar(monthlyrevenue['Month'], monthlyrevenue['Average Revenue'], color=colors)

# Customize the plot
ax.set_xlabel('Month')
ax.set_ylabel('Average Revenue ($)')
ax.set_title('Average Revenue by Month')
plt.xticks(rotation=45)

# Show the y ticks as dollar amounts. A formatter keeps the labels correct if the axis limits
# change, unlike tick labels that are computed once and then fixed.
ax.yaxis.set_major_formatter(FuncFormatter(lambda value, pos: '${:,.0f}'.format(value)))

# Display the plot
plt.show()


In [None]:
# Mean revenue per primary genre
genre_revenue = df.groupby('Genre_1')['Revenue'].mean().reset_index()

# Bar chart of genre against average revenue. The axis is labelled in hundred millions, so the
# values are divided by 1e8 (previously the raw values were plotted under that label).
plt.bar(genre_revenue['Genre_1'], genre_revenue['Revenue'] / 1e8)
# Customize the plot
plt.xlabel('Genre')
plt.ylabel('Average Revenue (in Hundred Millions)')
plt.title('Average Revenue by Genre')
plt.xticks(rotation=45, ha='right')

# Display the plot
plt.show()

In [None]:
# Histogram of the vote averages (20 bins)
fig, ax = plt.subplots()
ax.hist(df['Vote_Average'], bins=20)
ax.set_title('Distribution of Vote Average')
ax.set_xlabel('Vote Average')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Density estimate of the vote averages
density_ax = df['Vote_Average'].plot(kind='density')
density_ax.set_title('Distribution of Vote Average')
density_ax.set_xlabel('Vote Average')
plt.show()

In [None]:
# Scatter plot: runtime on x, vote average on y
x = df['Runtime']
y = df['Vote_Average']

fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.5)
ax.set_title('Vote Average vs. Runtime')
ax.set_xlabel('Runtime')
ax.set_ylabel('Vote Average')
plt.show()

In [None]:
# Scatter plot: vote average on x, revenue on y
x = df['Vote_Average']
y = df['Revenue']

fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.5)
ax.set_title('Revenue vs. Vote Average')
ax.set_xlabel('Vote_Average')
ax.set_ylabel('Revenue')
plt.show()

In [None]:
# Scatter plot: vote average on x, vote count on y
x = df['Vote_Average']
y = df['Vote_Count']

fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.5)
ax.set_title('Vote Count vs. Vote Average')
ax.set_xlabel('Vote Average')
ax.set_ylabel('Vote Count')
plt.show()

In [None]:
# Scatter plot: vote average on x, reviewer rating on y
x = df['Vote_Average']
y = df['Reviewer_Rating']

fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.5)
ax.set_title('Reviewer Rating vs. Vote Average')
ax.set_xlabel('Vote Average')
ax.set_ylabel('Reviewer Rating')
plt.show()


In [None]:
# Keep only rows with a non-negative Tomato_Meter
df_filtered = df[df['Tomato_Meter'] >= 0]
# create the scatter plot using the filtered dataframe
x = df_filtered['Vote_Average']
y = df_filtered['Tomato_Meter']

plt.scatter(x, y, alpha=0.5)
plt.title('Tomato Meter vs. Vote Average')
# x holds Vote_Average and y holds Tomato_Meter (the two labels used to be swapped)
plt.xlabel('Vote Average')
plt.ylabel('Tomato_Meter')
plt.show()

In [None]:
# Mean 'Vote_Average' per release month, shown in calendar order.
# Grouping by the month name alone would order the bars alphabetically (April, August, ...),
# so the result is reindexed with the calendar order; months without any rows are left out.
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December']
month_vote_avg = (
    df.groupby('Month')['Vote_Average'].mean()
    .reindex(month_order)
    .dropna()
    .rename_axis('Month')
    .reset_index()
)
plt.figure(figsize=(10, 5))
# Create a bar chart of vote average by month
plt.bar(month_vote_avg['Month'].astype(str), month_vote_avg['Vote_Average'])
# Add data labels to the bars (bar i is drawn at x position i)
for i, v in enumerate(month_vote_avg['Vote_Average']):
    plt.text(i, v, '{:.2f}'.format(v), ha='center', va='bottom')
# Customize the plot
plt.xlabel('Month')
plt.ylabel('Vote Average')
plt.title('Vote Average by Month')
plt.xticks(rotation=45)

# Display the plot
plt.show()

In [None]:
# Mean 'Vote_Average' for every primary genre
genre_vote_avg = df.groupby('Genre_1')['Vote_Average'].mean().reset_index()

# One bar per genre
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(genre_vote_avg['Genre_1'], genre_vote_avg['Vote_Average'])

# Write each mean above its bar (bar number `position` sits at x == position)
for position, mean_vote in enumerate(genre_vote_avg['Vote_Average']):
    ax.text(position, mean_vote, '{:.2f}'.format(mean_vote), ha='center', va='bottom')

# Titles and axis labels
ax.set_xlabel('Genre')
ax.set_ylabel('Vote Average')
ax.set_title('Vote Average by Genre')
plt.xticks(rotation=60)

plt.show()

In [None]:
# Histogram of the vote counts (50 bins)
fig, ax = plt.subplots()
ax.hist(df['Vote_Count'], bins=50)
ax.set_title('Distribution of Vote Count')
ax.set_xlabel('Vote Count')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Scatter plot of vote count against budget
x = df['Budget']
y = df['Vote_Count']

# The x axis is labelled in billions, so plot the budget divided by 1e9
# (previously the raw values were plotted under that label).
plt.scatter(x / 1e9, y, alpha=0.5)
plt.title('Vote Count vs. Budget')
plt.xlabel('Budget (in billions)')
plt.ylabel('Vote Count')
plt.show()

In [None]:
# Scatter plot: runtime on x, vote count on y
x = df['Runtime']
y = df['Vote_Count']

fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.5)
ax.set_title('Vote Count vs. Runtime')
ax.set_xlabel('Runtime')
ax.set_ylabel('Vote Count')
plt.show()

In [None]:
# Scatter plot: revenue on x, vote count on y
x = df['Revenue']
y = df['Vote_Count']

fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.5)
ax.set_title('Vote Count vs. Revenue')
ax.set_xlabel('Revenue')
ax.set_ylabel('Vote Count')
plt.show()

In [None]:
# Scatter plot: vote average on x, vote count on y
x = df['Vote_Average']
y = df['Vote_Count']

fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.5)
ax.set_title('Vote Count vs. Vote Average')
ax.set_xlabel('Vote Average')
ax.set_ylabel('Vote Count')
plt.show()

In [None]:
# Scatter plot: reviewer rating on x, vote count on y
x = df['Reviewer_Rating']
y = df['Vote_Count']

fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.5)
ax.set_title('Vote Count vs. Reviewer Rating')
ax.set_xlabel('Reviewer Rating')
ax.set_ylabel('Vote Count')
plt.show()

In [None]:
# Convert the 'Month' column to an ordered categorical so that grouping follows the calendar.
# NOTE: this changes df['Month'] itself, so every later cell that uses df['Month'] sees the
# categorical column rather than plain strings.
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December']
cat_dtype = pd.api.types.CategoricalDtype(categories=month_order, ordered=True)
df['Month'] = df['Month'].astype(cat_dtype)

# Group the data by month and calculate the sum of the 'Vote Count' column
monthly_vote_count = df.groupby('Month')['Vote_Count'].sum().reset_index()

# Create a bar chart of vote count by month
plt.bar(monthly_vote_count['Month'], monthly_vote_count['Vote_Count'])

# Customize the plot
plt.xlabel('Month')
plt.ylabel('Vote Count')
plt.title('Vote Count by Month')
plt.xticks(rotation=45)

# Display the plot
plt.show()


In [None]:
# Total 'Vote_Count' for every primary genre
genre_vote_count = df.groupby('Genre_1')['Vote_Count'].sum().reset_index()

# One bar per genre
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(genre_vote_count['Genre_1'], genre_vote_count['Vote_Count'])

# Titles and axis labels
ax.set_xlabel('Genre')
ax.set_ylabel('Vote Count')
ax.set_title('Vote Count by Genre')
plt.xticks(rotation=90)

plt.show()


In [None]:
import calendar

# Rows with a usable release date. An explicit copy is taken so that adding the
# 'Release_Month' column below does not write into a slice of df (SettingWithCopyWarning).
df_cleaned = df[df['Release_Date'] != 'Not available'].copy()
print(df_cleaned['Release_Date'].dtype)
df_cleaned['Release_Month'] = pd.to_datetime(df_cleaned['Release_Date']).dt.month

df_cleaned_rating = df_cleaned.dropna(subset=['Reviewer_Rating'])
df_cleaned_month_rating = df_cleaned_rating.dropna(subset=['Release_Month'])

# Group df_cleaned by month and get mean of Reviewer_Rating 
month_rating = df_cleaned_month_rating.groupby(['Release_Month'])['Reviewer_Rating'].mean()
month_abbr = list(calendar.month_abbr)[1:] # Define a list of month abbreviations


# Rows whose reviewer rating is above 0.6, counted per release month.
# NOTE(review): each row of df appears to be one review, so this counts reviews rather than
# distinct movies, although the plot title says "Movies" -- confirm which is meant.
df_filtered = df_cleaned[df_cleaned['Reviewer_Rating'] > 0.6]
count_by_month = df_filtered['Release_Month'].value_counts()
# value_counts() skips NaN and omits months without any qualifying row. Re-index on 1..12 so
# that there is exactly one count per month label; otherwise plt.bar fails with a length
# mismatch whenever a month is missing.
count_by_month.index = count_by_month.index.astype(int)
count_by_month = count_by_month.reindex(range(1, 13), fill_value=0)

# Define month labels
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

plt.bar(month_labels, count_by_month)
plt.title('Count of Movies with Reviewer_Rating > 0.6 by Release Month')
plt.xlabel('Month')
plt.ylabel('Count')
plt.show()

# September is traditionally known as the start of the "fall movie season," where studios release 
# prestige films in the lead up to award season. This is because many of the high-profile film festivals 
# (Venice Film Festival and the Toronto International Film Festival) take place in September. As a result, 
# studios may choose to release their films in September in order to coincide with these festivals and 
# generate buzz and positive reviews for their films. Additionally, the period from September to December 
# leads up to the award season, during which studios release their most highly regarded and critically 
# acclaimed films in the hopes of receiving nominations and awards. This background information could 
# explain the higher count of reviewer_ratings above .6 from September to December. 

In [None]:
# select only the numeric columns (exactly the int64 and float64 dtypes; other numeric
# dtypes are not included by this filter)
numeric_cols = df.select_dtypes(include=['int64', 'float64'])

# compute the pairwise correlation matrix of those columns
corr_matrix = numeric_cols.corr()

print(corr_matrix)

In [None]:
# Bar chart of the total revenue per release month
revenue_per_month = df.groupby('Month')['Revenue'].sum()
month_ax = revenue_per_month.plot(kind='bar', title='Revenue by Month', figsize=(8, 6))
month_ax.set_xlabel('Month')
month_ax.set_ylabel('Revenue')
plt.show()

In [None]:
# Create a histogram of the 'Budget' column
plt.figure(figsize=(8,6))
plt.title('Histogram of Budget')
plt.xlabel('Budget (in billions)')
plt.ylabel('Frequency')
# The axis is labelled in billions, so the values are divided by 1e9
# (previously the raw values were plotted under that label).
plt.hist(df['Budget'] / 1e9, bins=20, edgecolor='black')

plt.show()

In [None]:
# Histogram of 'Runtime' (20 bins)
fig, ax = plt.subplots(figsize=(8,6))
ax.set_title('Histogram of Runtime')
ax.set_xlabel('Runtime (in minutes)')
ax.set_ylabel('Frequency')
ax.hist(df['Runtime'], bins=20, edgecolor='black')

# Rotated x tick labels, written as plain numbers (no scientific notation)
plt.xticks(rotation=45, ha='right')
ax.ticklabel_format(style='plain', axis='x')

plt.show()

In [None]:
# END OF PART 2
# START OF PART 3

In [None]:
#Assignment requirements:
#1. Use either sentiment analyzer introduced in class to generate sentiment polarity scores for the content column in the dataframe.
#2. Impute NAs in the reviewer rating columns with the sentiment polarity scores 

In [None]:
#VADER ANALYSIS
# SentimentIntensityAnalyzer needs NLTK's 'vader_lexicon' resource and raises a LookupError when
# it is not installed. Fetch it first; the call does nothing if the lexicon is already present.
nltk.download('vader_lexicon', quiet=True)
analyzer = SentimentIntensityAnalyzer()
df.head() # The Content column contains the reviews that need to be analyzed.
#Content is what we will complete a sentiment analysis on. 

In [None]:
# Take the text of the first review (row 0) as a sample to try the analyzer on
text1 = df.iloc[0].Content
text1

In [None]:
# run the polarity scores on the 1st review to make sure the analyzer runs properly.
# The result is a dict with the keys 'neg', 'neu', 'pos' and 'compound' (example from a past run below).
analyzer.polarity_scores(text1) 
#{'neg': 0.0, 'neu': 0.873, 'pos': 0.127, 'compound': 0.4939}

In [None]:
# Score every review with VADER.
# Polarity_Score receives the compound score rounded to 3 decimals; Sentiment receives the label
# 'POSITIVE' when the compound score is above 0 and 'NEGATIVE' otherwise (a score of exactly 0
# is therefore labelled 'NEGATIVE').
compounds = [analyzer.polarity_scores(review_text)['compound'] for review_text in df['Content']]
values = ['POSITIVE' if compound > 0 else 'NEGATIVE' for compound in compounds]

df['Polarity_Score'] = compounds
df['Polarity_Score'] = df['Polarity_Score'].round(3)
df['Sentiment'] = values

In [None]:
# inspect the first 100 rows together with their new Polarity_Score and Sentiment columns
df.head(100)

In [None]:
#FLAIR
# Second sentiment model, tried on a small sample for comparison with VADER.
import flair
from flair.models import TextClassifier
from flair.data import Sentence
import pandas as pd
# Sample for the comparison: the whole file is read and then cut down to its first 100 rows
df1= pd.read_csv('merged_file.csv').head(100)

# Load the 'en-sentiment' classifier and try it on one sentence
classifier = TextClassifier.load('en-sentiment')
sentence = Sentence('The food was mid.')
classifier.predict(sentence)
print(sentence)

In [None]:
#flair values
# Classify each of the 100 sample reviews, collecting the predicted label and its confidence
values2 = []
scores=[]
for index, row in df1.iterrows():
    text = Sentence(row.Content)
    classifier.predict(text)
    # label value of the first predicted label
    values2.append(text.labels[0].to_dict()['value'])
    # confidence reported for that first label
    scores.append(text.to_dict()['all labels'][0]['confidence'])

df1['Sentiment'] = values2
df1['Polarity'] = scores #Scores are absolute instead of [-1,1] from Vader
df1

In [None]:
#The Vader values are a little bit more accurate for the Content column versus the Flair style. 
#Example: 
#Row 4: The Platform is about as subtle as a punch in the face but that's by design. It's social commentary via blunt instrument using genre trappings and pitch-black satire as a club bashing at its targets with barely restrained glee.
#Row 98: Crow and his two fine Welsh-burred leads commit fully to the anguished nerve-fraying cause but their efforts can't conceal a certain thinness to the dramatic material...
#Row 100: Many audiences are going to be utterly shocked at the lengths the two leading actors Pattinson Dafoe are pushed. They have a loathsome dynamic. However those who can get on 
#  Eggers' level will be hypnotized by this eccentric experiment.

#The Flair analysis labels these reviews as negative, but after reading the reviews themselves, I would consider them more positive than negative. 
#Thus, we will use VADER as the sentiment analyzer for the rest of the analysis. 

In [None]:
# Impute the NAs of Reviewer_Rating with the polarity score.
# df.pop() removes Polarity_Score from df and returns it, so the column is
# deleted once it has been used as the fill source. The Sentiment column is kept.
# The membership check makes the cell safe to re-run: without it a second run
# raises KeyError because Polarity_Score has already been popped.
# NOTE(review): the polarity score and Reviewer_Rating may be on different
# scales; confirm that mixing them in one column is intended.
if 'Polarity_Score' in df.columns:
    df['Reviewer_Rating'] = df['Reviewer_Rating'].fillna(df.pop('Polarity_Score'))

In [None]:
# Write the imputed frame back to CSV.
# The index is written as the first column because this notebook loads the
# file with pd.read_csv('merged_file.csv', index_col=0). Saving with
# index=False would make the next load use the first data column as the index.
# NOTE(review): this overwrites the file the notebook started from; consider
# writing to a new file name so the raw input stays reproducible.
df.to_csv('merged_file.csv')

In [None]:
# END OF PART 3
# START OF PART 4

In [None]:
# Work on a copy so the original DataFrame is left untouched.
df_copy = df.copy()

# One-hot encode Month and the primary genre.
# dtype=int is required: recent pandas versions return boolean dummy columns,
# which are not matched by select_dtypes(include='number') and would then be
# missing when the regressions select them by name.
dummy_month = pd.get_dummies(df_copy['Month'], dtype=int)
dummy_genre = pd.get_dummies(df_copy['Genre_1'], dtype=int)

# Append the dummy columns to the copy.
df_copy = pd.concat([df_copy, dummy_month, dummy_genre], axis=1)

# Drop the source columns that the dummies replace.
# NOTE(review): the original list named 'Genre_1' twice; if a further genre
# column (e.g. Genre_3) was meant, add it here. TODO confirm.
df_copy = df_copy.drop(['Month', 'Genre_1', 'Genre_2'], axis=1)

In [None]:
# Keep only numeric columns, turn +/-inf into NaN, then drop every row that
# still contains a missing value.
df_copy = (
    df_copy
    .select_dtypes(include='number')
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

In [None]:
# Revenue model: predictors grouped by kind; the column order is unchanged.
numeric_predictors = ['Runtime', 'Popularity', 'Vote_Count',
                      'Tomato_Meter', 'Reviewer_Rating']
month_dummies = ['April', 'August', 'December', 'February', 'January', 'June',
                 'March', 'May', 'November', 'October', 'September']
genre_dummies = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
                 'Documentary', 'Family', 'Fantasy', 'History', 'Horror',
                 'Music', 'Mystery', 'Romance', 'Science Fiction', 'Thriller',
                 'War', 'Western']
# Columns left out on purpose:
# - TV Movie: not statistically significant.
# - Drama: correlation with Revenue is only -.15 and its VIF of 25.27 is problematic.
# - July: correlation is only .03 and it is not statistically significant.
# - Budget: correlation of .79, but it is collinear with Vote_Count, which has
#   the higher correlation (.82), so Budget is the one removed.
# - Vote_Average: multicollinearity (VIF 33.3).

# Design matrix with an intercept column, and the response.
X = sm.add_constant(df_copy[numeric_predictors + month_dummies + genre_dummies])
y = df_copy['Revenue']

# Fit ordinary least squares and show the regression table.
model1 = sm.OLS(y, X).fit()
print(model1.summary())

In [None]:
# Variance inflation factor for every column of the current design matrix X.
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame({
    "VIF Factor": [variance_inflation_factor(X.values, col_pos)
                   for col_pos in range(X.shape[1])],
    "features": X.columns,
})
print(vif)

In [None]:
# Define predictor and dependent variables
X = df_copy[['Popularity', 'Vote_Count',
        'Tomato_Meter', 'Reviewer_Rating', 
        'April', 'August', 'December', 'February', 'January', 'June',
        'March', 'May', 'November', 'October', 'September', 'Action',
        'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
        'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
        'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']] 

# Take the response from df_copy, the same frame X comes from. df still holds
# the rows that dropna() removed from df_copy, so df['Revenue'] can have a
# different length than X and may contain missing values.
y = df_copy['Revenue']

# Remove TV movie because it's not statistically significant
# Remove Drama because its correlation with Revenue is only -.15 AND it has a problematic VIF of 25.27
# Remove July because its correlation is only .03 and is not statistically significant
# Remove Budget because it has multicollinearity with Vote_Count. Vote_Count has a higher correlation at .82, so remove Budget 
# Remove Runtime and Vote_Average due to multicollinearity 

# Standardize predictor variables. The scaled array is wrapped in a DataFrame
# so the summary shows column names instead of x1..xN, and it reuses y's index
# so statsmodels sees matching endog/exog indices.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=y.index)

# Add constant term to X matrix
X_scaled = sm.add_constant(X_scaled)

# Fit OLS model
model = sm.OLS(y, X_scaled).fit()

# Print summary of model results
print(model.summary())

In [None]:
# Check VIFs
from statsmodels.stats.outliers_influence import variance_inflation_factor
# X has no intercept column at this point. variance_inflation_factor regresses
# each column on the other columns exactly as given, so without a constant the
# auxiliary regressions are uncentered and the VIFs are not comparable with the
# ones computed on a design matrix that includes 'const'. The constant is added
# to a separate frame; X itself is not modified.
X_vif = sm.add_constant(X)
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]
vif["features"] = X_vif.columns
print(vif)

In [None]:
# Print the dimensions of the predictor matrix and of the response.
print(X.shape)
print(y.shape)
# Replace y's index with a fresh 0..n-1 range, in place (the old index is discarded).
# NOTE(review): y was taken from a DataFrame column; depending on the pandas
# version this in-place change may also be visible through that frame's
# column. Confirm before relying on it.
y.reset_index(drop=True, inplace=True)

In [None]:
# create independent variable matrix X and dependent variable vector y
X = df_copy[['Reviewer_Rating','Popularity', 'Budget',  'Vote_Count', 'Tomato_Meter',
       'April', 'August', 'December', 'February', 'January', 'July', 'June',
       'March', 'May', 'November', 'October', 'September', 'Adventure',
       'Animation', 'Comedy', 'Crime', 'Documentary', 'Action', 
       'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
       'Romance', 'Science Fiction', 'TV Movie', 'Thriller',
       'War', 'Western']]
# Removed Vote_Average: VIF of 40
# Removed Drama: VIF of 37
# Removed Runtime: VIF of 14

y = df_copy['Revenue']

# Standardize predictor variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create a new DataFrame with standardized X and original column names.
# X and y hold the same df_copy rows in the same order, so the scaled frame
# reuses y's index. With the default 0..n-1 index, statsmodels rejects the pair
# ("indices for endog and exog are not aligned") whenever y's index differs.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=y.index)

# Forward stepwise regression function
def forward_selection(X, y, threshold_in=0.01, verbose=True):
    """Greedy forward stepwise selection of OLS predictors by p-value.

    Starts with no predictors. Each round fits one OLS model per column that
    is not yet selected (selected columns + that candidate + a constant) and
    adds the candidate with the smallest p-value, as long as that p-value is
    strictly below ``threshold_in``. Stops when no candidate qualifies or no
    candidates are left.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one per column.
    y : array-like
        Response, passed to ``sm.OLS`` as the dependent variable.
    threshold_in : float, default 0.01
        Upper bound (exclusive) on the p-value of a feature to be added.
    verbose : bool, default True
        If true, print each feature and its p-value when it is added.

    Returns
    -------
    The fitted ``sm.OLS`` results for the selected columns plus a constant.
    """
    included = []
    while True:
        # Keep X's column order. A set difference iterates in hash order,
        # which changes between kernel restarts and would decide which
        # feature wins when p-values tie (e.g. several underflow to 0.0).
        excluded = [column for column in X.columns if column not in included]
        if not excluded:
            break
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(X[included+[new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        # Written as `not <` so a NaN best p-value also ends the search.
        if not best_pval < threshold_in:
            break
        best_feature = new_pval.idxmin()
        included.append(best_feature)
        if verbose:
            print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))
    model = sm.OLS(y, sm.add_constant(X[included])).fit()
    return model

# Run forward stepwise selection on the standardized predictors and print the
# summary of the model it returns.
# Note: this rebinds `model`, replacing any model stored under that name.
model = forward_selection(X_scaled_df, y)
print(model.summary())


In [None]:
# Check VIFs
# X has no intercept column here. variance_inflation_factor regresses each
# column on the other columns exactly as given, so without a constant the
# auxiliary regressions are uncentered and the VIFs are not comparable with
# those computed on a design matrix that includes 'const'. The constant is
# added to a separate frame; X itself is not modified.
X_vif = sm.add_constant(X)
vifs = pd.Series([variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
                 index=X_vif.columns)
print("VIFs:")
print(vifs)

In [None]:
# Vote_Average model: predictors grouped by kind; the column order is unchanged.
numeric_predictors = ['Budget', 'Runtime', 'Popularity', 'Revenue',
                      'Vote_Count', 'Reviewer_Rating', 'Tomato_Meter']
month_dummies = ['April', 'August', 'December', 'February', 'January', 'July',
                 'June', 'March', 'May', 'November', 'October', 'September']
genre_dummies = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
                 'Documentary', 'Drama', 'Family', 'Fantasy', 'History',
                 'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction',
                 'TV Movie', 'Thriller', 'War', 'Western']

# Design matrix with an intercept column, and the response.
X = sm.add_constant(df_copy[numeric_predictors + month_dummies + genre_dummies])
y = df_copy['Vote_Average']

# Fit ordinary least squares and show the regression table.
model2 = sm.OLS(y, X).fit()
print(model2.summary())

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# One VIF per column of the current design matrix, labelled by column name.
vif_values = []
for position in range(X.shape[1]):
    vif_values.append(variance_inflation_factor(X.values, position))
vifs = pd.Series(vif_values, index=X.columns)
print("VIFs:")
print(vifs)

In [None]:
# Print the dimensions of the predictor matrix and of the response.
print(X.shape)
print(y.shape)
# Replace y's index with a fresh 0..n-1 range, in place (the old index is discarded).
# NOTE(review): y was taken from a DataFrame column; depending on the pandas
# version this in-place change may also be visible through that frame's
# column. Confirm before relying on it.
y.reset_index(drop=True, inplace=True)

In [None]:
# create independent variable matrix X and dependent variable vector y
X = df_copy[[  'Popularity', 'Budget', 'Vote_Count',
        'Reviewer_Rating', 'Tomato_Meter',
       'April', 'August', 'December', 'February', 'January', 'July', 'June',
       'March', 'May', 'November', 'October', 'September', 'Action',
       'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 
       'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
       'Romance', 'Science Fiction', 'TV Movie', 'Thriller',
       'War', 'Western']]

# Removed Drama: VIF of 36
# Removed Revenue to bring every VIF from 6 down to under 5
# Removed Runtime: VIF of 13

y = df_copy['Vote_Average']

# Standardize predictor variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create a new DataFrame with standardized X and original column names.
# X and y hold the same df_copy rows in the same order, so the scaled frame
# reuses y's index. With the default 0..n-1 index, statsmodels rejects the pair
# ("indices for endog and exog are not aligned") whenever y's index differs.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=y.index)

# Forward stepwise regression function
def forward_selection(X, y, threshold_in=0.01, verbose=True):
    """Greedy forward stepwise selection of OLS predictors by p-value.

    Starts with no predictors. Each round fits one OLS model per column that
    is not yet selected (selected columns + that candidate + a constant) and
    adds the candidate with the smallest p-value, as long as that p-value is
    strictly below ``threshold_in``. Stops when no candidate qualifies or no
    candidates are left.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one per column.
    y : array-like
        Response, passed to ``sm.OLS`` as the dependent variable.
    threshold_in : float, default 0.01
        Upper bound (exclusive) on the p-value of a feature to be added.
    verbose : bool, default True
        If true, print each feature and its p-value when it is added.

    Returns
    -------
    The fitted ``sm.OLS`` results for the selected columns plus a constant.
    """
    included = []
    while True:
        # Keep X's column order. A set difference iterates in hash order,
        # which changes between kernel restarts and would decide which
        # feature wins when p-values tie (e.g. several underflow to 0.0).
        excluded = [column for column in X.columns if column not in included]
        if not excluded:
            break
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(X[included+[new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        # Written as `not <` so a NaN best p-value also ends the search.
        if not best_pval < threshold_in:
            break
        best_feature = new_pval.idxmin()
        included.append(best_feature)
        if verbose:
            print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))
    model = sm.OLS(y, sm.add_constant(X[included])).fit()
    return model

# Run forward stepwise selection on the standardized predictors and print the
# summary of the model it returns.
# Note: this rebinds `model`, replacing any model stored under that name.
model = forward_selection(X_scaled_df, y)
print(model.summary())


In [None]:
# Check VIFs
from statsmodels.stats.outliers_influence import variance_inflation_factor
# X has no intercept column here. variance_inflation_factor regresses each
# column on the other columns exactly as given, so without a constant the
# auxiliary regressions are uncentered and the VIFs are not comparable with
# those computed on a design matrix that includes 'const'. The constant is
# added to a separate frame; X itself is not modified.
X_vif = sm.add_constant(X)
vifs = pd.Series([variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
                 index=X_vif.columns)
print("VIFs:")
print(vifs)

In [None]:
# Vote_Count model: predictors grouped by kind; the column order is unchanged.
numeric_predictors = ['Budget', 'Runtime', 'Popularity', 'Revenue',
                      'Vote_Average', 'Reviewer_Rating', 'Tomato_Meter']
month_dummies = ['April', 'August', 'December', 'February', 'January', 'July',
                 'June', 'March', 'May', 'November', 'October', 'September']
genre_dummies = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
                 'Documentary', 'Drama', 'Family', 'Fantasy', 'History',
                 'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction',
                 'TV Movie', 'Thriller', 'War', 'Western']

# Design matrix with an intercept column, and the response.
X = sm.add_constant(df_copy[numeric_predictors + month_dummies + genre_dummies])
y = df_copy['Vote_Count']

# Fit ordinary least squares and show the regression table.
model3 = sm.OLS(y, X).fit()
print(model3.summary())

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# One VIF per column of the current design matrix, labelled by column name.
vif_values = []
for position in range(X.shape[1]):
    vif_values.append(variance_inflation_factor(X.values, position))
vifs = pd.Series(vif_values, index=X.columns)
print("VIFs:")
print(vifs)

In [None]:
# Print the dimensions of the predictor matrix and of the response.
print(X.shape)
print(y.shape)
# Replace y's index with a fresh 0..n-1 range, in place (the old index is discarded).
# NOTE(review): y was taken from a DataFrame column; depending on the pandas
# version this in-place change may also be visible through that frame's
# column. Confirm before relying on it.
y.reset_index(drop=True, inplace=True)

In [None]:
# create independent variable matrix X and dependent variable vector y
X = df_copy[[ 'Popularity', 'Revenue',
        'Reviewer_Rating', 'Tomato_Meter',
       'April', 'August', 'December', 'February', 'January', 'July', 'June',
       'March', 'May', 'November', 'October', 'September', 'Action',
       'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 
       'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
       'Romance', 'Science Fiction', 'TV Movie', 'Thriller',
       'War', 'Western']]
# Removed Vote_Average: VIF of 40
# Removed Drama: VIF of 36
# Removed Budget: VIF of 6
# Removed Runtime: VIF of 13
y = df_copy['Vote_Count']

# Standardize predictor variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create a new DataFrame with standardized X and original column names.
# X and y hold the same df_copy rows in the same order, so the scaled frame
# reuses y's index. With the default 0..n-1 index, statsmodels rejects the pair
# ("indices for endog and exog are not aligned") whenever y's index differs.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=y.index)

# Forward stepwise regression function
def forward_selection(X, y, threshold_in=0.01, verbose=True):
    """Greedy forward stepwise selection of OLS predictors by p-value.

    Starts with no predictors. Each round fits one OLS model per column that
    is not yet selected (selected columns + that candidate + a constant) and
    adds the candidate with the smallest p-value, as long as that p-value is
    strictly below ``threshold_in``. Stops when no candidate qualifies or no
    candidates are left.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one per column.
    y : array-like
        Response, passed to ``sm.OLS`` as the dependent variable.
    threshold_in : float, default 0.01
        Upper bound (exclusive) on the p-value of a feature to be added.
    verbose : bool, default True
        If true, print each feature and its p-value when it is added.

    Returns
    -------
    The fitted ``sm.OLS`` results for the selected columns plus a constant.
    """
    included = []
    while True:
        # Keep X's column order. A set difference iterates in hash order,
        # which changes between kernel restarts and would decide which
        # feature wins when p-values tie (e.g. several underflow to 0.0).
        excluded = [column for column in X.columns if column not in included]
        if not excluded:
            break
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(X[included+[new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        # Written as `not <` so a NaN best p-value also ends the search.
        if not best_pval < threshold_in:
            break
        best_feature = new_pval.idxmin()
        included.append(best_feature)
        if verbose:
            print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))
    model = sm.OLS(y, sm.add_constant(X[included])).fit()
    return model

# Run forward stepwise selection on the standardized predictors and print the
# summary of the model it returns.
# Note: this rebinds `model`, replacing any model stored under that name.
model = forward_selection(X_scaled_df, y)
print(model.summary())


In [None]:
# Check VIFs
from statsmodels.stats.outliers_influence import variance_inflation_factor
# X has no intercept column here. variance_inflation_factor regresses each
# column on the other columns exactly as given, so without a constant the
# auxiliary regressions are uncentered and the VIFs are not comparable with
# those computed on a design matrix that includes 'const'. The constant is
# added to a separate frame; X itself is not modified.
X_vif = sm.add_constant(X)
vifs = pd.Series([variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
                 index=X_vif.columns)
print("VIFs:")
print(vifs)

In [None]:
# Mean and standard deviation of Vote_Count
vote_counts = df_copy['Vote_Count']
mean_vote_count = vote_counts.mean()
std_vote_count = vote_counts.std()

# Report both statistics and the values one standard deviation either side of the mean.
report_lines = (
    f'Mean Vote_Count: {mean_vote_count:.2f}',
    f'Standard Deviation of Vote_Count: {std_vote_count:.2f}',
    f'One Standard Deviation Above Mean Vote_Count: {mean_vote_count + std_vote_count:.2f}',
    f'One Standard Deviation Below Mean Vote_Count: {mean_vote_count - std_vote_count:.2f}',
)
for report_line in report_lines:
    print(report_line)