In [None]:
import pandas as pd
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import statistics
import pandas as pd
from collections import defaultdict
from collections import Counter
import numpy as np
import plotly.express as px 
import plotly.graph_objects as go

In [None]:
# Load the raw dataset into `df`; every later cell reads from or modifies this frame.
# NOTE(review): this is an absolute, machine-specific path — the notebook cannot run on
# another machine until it is changed (consider a path relative to the project).
df = pd.read_csv('C:/Users/sdole/PycharmProjects/Movie_Rating_Project/Original Dataset.csv')

In [None]:
# Keep only the columns used in this analysis.
relevant_columns = ['Title','Genre','Tags', 'Languages','Series or Movie','Runtime','Director','Writer','Actors','Release Date','Summary', 'IMDb Score']
print(f'cols before: {df.columns}')
# .copy() makes the subset an independent frame, so the later in-place edits
# (dropna / drop_duplicates / added date columns) are not performed on a slice
# of the raw frame, which pandas may flag with SettingWithCopyWarning.
df = df[relevant_columns].copy()
print(f'After changing columns: {df.columns}')

# Data description

### Deleting instances where target variable is null

In [None]:
# How many instances?
print(f'How many instances: {df.shape[0]}')

# How many nulls at 'IMDb Score'
# The outer quotes are double quotes: reusing the same quote type inside an f-string
# replacement field is a SyntaxError on Python versions before 3.12.
print(f"How many nulls in IMDb Score col: {df['IMDb Score'].isnull().sum()}")

# Rows without a target value are removed in place.
df.dropna(subset=['IMDb Score'], inplace=True)
# Number of instances left after removing rows with a null IMDb Score
print(f'How many instances after dropping nulls in IMDb Score col: {df.shape[0]}')

# Target Variable 

In [None]:
import seaborn as sns

# Histogram of the target (bars are probabilities) with a KDE curve on top.
plt.figure(figsize=(10, 6))
score_ax = sns.histplot(
    data=df,
    x="IMDb Score",
    stat='probability',
    bins=87,
    kde=True,
)
score_ax.set_title("IMDb Score's Distribution")
# Optional export:
# plt.savefig('IMDb Score Distribution.png', facecolor='white', edgecolor='white')
plt.show()

In [None]:
# Distinct score values present in the data, in ascending order.
sorted_unique_scores = list(df['IMDb Score'].unique())
sorted_unique_scores.sort()
print(sorted_unique_scores)

In [None]:
# Summary statistics of the target variable, rendered as a Plotly table:
# mean, median, mode, lower/upper quartile, variance and standard deviation.
scores = df['IMDb Score']
score_modes = scores.mode()

mean = round(scores.mean(),3)
median = scores.median()
# First mode if there is one, NaN otherwise.
mode = float('nan') if score_modes.empty else score_modes.values[0]
q1 = scores.quantile(0.25)
q2 = scores.quantile(0.75)  # upper quartile; labelled 'Q3' in the table below
variance = round(scores.var(),3)
standard_deviation = round(scores.std(), 3)

# One row per statistic.
descriptive_stats = pd.DataFrame({
    'Statistic': ['Mean', 'Median', 'Mode', 'Q1', 'Q3', 'Variance', 'Standard deviation'],
    'Value': [mean, median, mode, q1, q2, variance, standard_deviation],
})

# Build the table figure.
stats_table = go.Table(
    header={
        'values': list(descriptive_stats.columns),
        'fill_color': '#636EFA',
        'align': 'left',
        'font': {'color': 'black', 'size': 15},
    },
    cells={
        'values': [descriptive_stats['Statistic'], descriptive_stats['Value']],
        'fill_color': 'lavender',
        'align': 'left',
        'height': 25,
        'font': {'color': 'black', 'size': 14},
    },
)
fig = go.Figure(data=[stats_table])
fig.update_layout(template='plotly_white',  width=500)

# File name used by the figure's "download as image" button.
config = {'toImageButtonOptions': {'filename': 'Descriptive Statistics - Rating'}}

fig.show(config=config)


In [None]:
# Two rows are treated as the same title when these three columns all match.
dedup_keys = ['Title', 'Release Date', 'Series or Movie']

# Every occurrence after the first one is a duplicate.
duplicate_mask = df.duplicated(subset=dedup_keys, keep='first')
duplicates = df[duplicate_mask]

num_duplicates = len(duplicates)
print(f'Number of duplicate rows identified: {num_duplicates}')

# Remove the duplicates in place (first occurrence kept) and renumber the rows.
df.drop_duplicates(subset=dedup_keys, keep='first', inplace=True)
df.reset_index(drop=True, inplace=True)

# Row count after de-duplication, for verification.
data_len_after = len(df)
print(f'Total number of rows after dropping duplicates: {data_len_after}')

In [None]:
# Sanity check: is any (title, release date, type) combination still repeated?
has_duplicates = df.duplicated(
    subset=['Title', 'Release Date', 'Series or Movie'],
    keep='first',
).any()

print('there are duplicates' if has_duplicates else 'there are no duplicates')

# describe predictors 

In [None]:
# Display the column index of the working frame.
df.columns

In [None]:
# Per-column overview: number of distinct values and share of missing values (in %).
total_rows = len(df)

column_info = [
    {
        'Column Name': col,
        'Unique Values': df[col].nunique(),
        'Null Percent': round((df[col].isnull().sum() / total_rows) * 100, 2),
    }
    for col in df.columns
]
column_info_df = pd.DataFrame(column_info)

# Render the overview as a Plotly table.
info_table = go.Table(
    header={
        'values': list(column_info_df.columns),
        'fill_color': '#636EFA',
        'align': 'left',
        'font': {'color': 'black', 'size': 15},
    },
    cells={
        'values': [
            column_info_df['Column Name'],
            column_info_df['Unique Values'],
            column_info_df['Null Percent'],
        ],
        'fill_color': 'lavender',
        'align': 'left',
        'height': 25,
        'font': {'color': 'black', 'size': 13},
    },
)
fig = go.Figure(data=[info_table])

# Fixed size and white theme.
fig.update_layout(template='plotly_white', width=550, height= 550)

fig.show()

In [None]:
# Number of missing values in every column, drawn as a bar chart and saved as a PNG.
missing_values = df.isnull().sum()

fig, ax = plt.subplots(figsize=(14, 8))  # wide figure so the column names fit
# White background for both the figure and the plotting area.
for surface in (fig.patch, ax):
    surface.set_facecolor('white')

missing_values.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Count of Missing Values in Each Column')
ax.set_xlabel('Columns')
ax.set_ylabel('Number of Missing Values')

# One tick per column, labelled with the column name at a 45 degree angle.
tick_positions = range(len(missing_values.index))
ax.set_xticks(tick_positions)
ax.set_xticklabels(missing_values.index, rotation=45)

ax.grid(axis='y', linestyle='--', alpha=0.7)

# Write the figure to disk, keeping the white background.
plt.savefig(
    'Missing_Values_Distribution.png',
    bbox_inches='tight',
    facecolor=fig.get_facecolor(),
)

plt.show()

# Numeric

### Release Date

In [None]:
# ----------------------------- #
# Extract day, month and year   #
# ----------------------------- #

# Parse 'Release Date' into datetimes so the .dt accessor can be used below.
df['Release Date'] = pd.to_datetime(df['Release Date'])

# Split the release date into its calendar components.
# The f-strings use double quotes on the outside: reusing the same quote type inside a
# replacement field is a SyntaxError on Python versions before 3.12.
df['released_day'] = df['Release Date'].dt.day
print(f"head day: {df['released_day'].head()}")

df['released_month'] = df['Release Date'].dt.month
print(f"head month: {df['released_month'].head()}")

df['released_year'] = df['Release Date'].dt.year
print(f"head year: {df['released_year'].head()}")


# dtypes before the conversion
print(df['released_year'].dtype)
print(df['released_month'].dtype)
print(df['released_day'].dtype)

# Convert to pandas' nullable integer dtype ('Int64'), which can hold missing values.
df['released_day'] = df['released_day'].astype('Int64')
df['released_month'] = df['released_month'].astype('Int64')
df['released_year'] = df['released_year'].astype('Int64')


# dtypes after the conversion
print(df['released_year'].dtype)
print(df['released_month'].dtype)
print(df['released_day'].dtype)

In [None]:
# Distinct release years in ascending order.
# Missing values are dropped on the Series *before* unique(): the array returned by unique()
# only has a dropna() method for extension dtypes, so the previous order failed whenever the
# column was still a plain NumPy-backed dtype. This is the same form the later
# "sorted_unique_years" cell uses.
print(sorted(df['released_year'].dropna().unique()))

In [None]:
# Plot avg rating by month 
# Mean IMDb score per release month, as a bar chart with the value printed above each bar.
monthly_data = df.groupby('released_month')['IMDb Score'].mean().reset_index()
fig = px.bar(monthly_data, x='released_month', y='IMDb Score', text='IMDb Score', title='Average Rating by Released Month')
fig.update_traces(
    textposition= 'outside',
    texttemplate='%{text:.2f}',
    textfont_size= 16,
    marker_color= '#636EFA',
)
# Distinct scores in the data; their min/max (+/- 0.5) define the y-axis range below.
unique_scores = sorted(df['IMDb Score'].unique())
# NOTE(review): the order of these keyword arguments is kept as-is (axis title before its font).
# NOTE(review): per the Plotly docs `tickvals` only takes effect with tickmode='array';
# with tickmode='linear' it is presumably ignored — confirm.
fig.update_layout(
    xaxis_title = 'Released Month',
    yaxis_title = 'Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    width = 650,
    height = 400,
    xaxis=dict(tickmode='linear', 
               tickfont=dict(size=14)),
    yaxis=dict(tickmode='linear',
               tickvals=unique_scores,
               range=[min(unique_scores)-0.5, max(unique_scores)+0.5],
               tickfont=dict(size=14)
               ),
    template='plotly_white',
)


# File name used by the figure's "download as image" button.
config = {
    'toImageButtonOptions': {
        'filename': 'Rating and Month'
    }
}

fig.show(config=config)

In [None]:
# Plot avg rating by year
yearly_data = df.groupby('released_year')['IMDb Score'].mean().reset_index()
# Keeps only the 15 most recent release years: the sort is on 'released_year' (descending),
# not on the rating.
yearly_data = yearly_data.sort_values(by='released_year', ascending=False).head(15)

fig = px.bar(yearly_data, x='released_year', y='IMDb Score', text='IMDb Score', title='Average Rating by Released Year')
fig.update_traces(
    textposition= 'outside',
    texttemplate='%{text:.2f}',
    textfont_size= 16,
    marker_color= '#636EFA',
)
# Distinct scores in the data; their min/max (+/- 0.5) define the y-axis range below.
unique_scores = sorted(df['IMDb Score'].unique())
fig.update_layout(
    xaxis_title = 'Released Year',
    yaxis_title = 'Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width = 650,
    height = 400,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(tickmode='linear', 
               tickfont=dict(size=14),
               tickangle=45
    ),
    yaxis=dict(tickmode='linear',
               tickvals=unique_scores,
               range=[min(unique_scores)-0.5, max(unique_scores)+0.5],
               tickfont=dict(size=14)
               ),
    template='plotly_white',
)

# File name used by the figure's "download as image" button.
config = {
    'toImageButtonOptions': {
        'filename': 'Rating and Year'
    }
}

fig.show(config=config)



In [None]:
# Distinct release years (missing values removed), in ascending order.
known_years = df['released_year'].dropna()
sorted_unique_years = sorted(known_years.unique())
print(sorted_unique_years)

In [None]:
# Exploratory: mean IMDb score per day of the month.
daily_data = df.groupby('released_day')['IMDb Score'].mean().reset_index()
# NOTE(review): leftover from the yearly plot, kept disabled (it refers to yearly_data, not daily_data):
# yearly_data = yearly_data.sort_values(by='released_day', ascending=False).head(15)

fig = px.bar(daily_data, x='released_day', y='IMDb Score', text='IMDb Score', title='Average Rating by Released Day')
fig.update_traces(
    textposition= 'outside',
    texttemplate='%{text:.2f}',
    textfont_size= 16,
    marker_color= '#636EFA',
)
# Distinct scores in the data; their min/max (+/- 0.5) define the y-axis range below.
unique_scores = sorted(df['IMDb Score'].unique())
fig.update_layout(
    xaxis_title = 'Released Day',
    yaxis_title = 'Avg IMDb Score',
    title_x=0.5,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(tickmode='linear', 
               tickfont=dict(size=14)),
    yaxis=dict(tickmode='linear',
               tickvals=unique_scores,
               range=[min(unique_scores)-0.5, max(unique_scores)+0.5],
               tickfont=dict(size=14)
               ),
    template='plotly_white',
)

# File name used by the figure's "download as image" button.
config = {
    'toImageButtonOptions': {
        'filename': 'Rating and Day'
    }
}

fig.show(config=config)



In [None]:
# One point per title: release year on x, IMDb score on y, colored by release month.
# The title previously read "Month and Day", which did not match the plotted columns.
fig = px.scatter(
    df,
    x='released_year',
    y='IMDb Score',
    color='released_month',
    title="Interactive Scatter Plot of IMDb Score by Release Year (Colored by Month)",
)
fig.show()

In [None]:
# Display the column index again (the released_day/month/year columns were added above).
df.columns

In [None]:
# TODO: Show interaction between month and year, with the last 5 years, bar height will be mean rating per month+year
# TODO: Show the interaction between month and day, e.g. a scatter plot with 12 different colors (one per month), where the X axis is the day and the Y axis is the mean rating per month+day

### Runtime

In [None]:
# Drop rows where 'Runtime' or 'IMDb Score' is NaN
# NOTE(review): df_clean is not referenced by any of the Runtime cells visible in this
# notebook section — they group and plot `df` directly. Confirm whether it is still needed.
df_clean = df.dropna(subset=['Runtime', 'IMDb Score'])

In [None]:
# Grouping data by Runtime: mean IMDb score for each distinct Runtime value
runtime_data = df.groupby('Runtime')['IMDb Score'].mean().reset_index()

# Creating the bar plot
fig = px.bar(runtime_data, x='Runtime', y='IMDb Score', text='IMDb Score', title='Average Rating by Runtime')

# Updating traces: value printed above each bar with two decimals
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)

# Unique scores and layout update
# (min/max of the distinct scores, +/- 0.5, define the y-axis range)
unique_scores = sorted(df['IMDb Score'].unique())
fig.update_layout(
    xaxis_title='Runtime',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    width=650,
    height=400,
    xaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14),
        tickangle=45  # Rotate the x-axis labels by 45 degrees if needed
    ),
    yaxis=dict(
        tickmode='linear',
        tickvals=unique_scores,
        range=[min(unique_scores) - 0.5, max(unique_scores) + 0.5],
        tickfont=dict(size=14)
    ),
    template='plotly_white',
)

# Configurations for exporting the plot (file name of the downloaded image)
config = {
    'toImageButtonOptions': {
        'filename': 'Rating and Runtime'
    }
}

# Display the plot
fig.show(config=config)

In [None]:
# Grouping data by both Runtime and Series or Movie: mean IMDb score per (Runtime, type) pair
runtime_series_data = df.groupby(['Runtime', 'Series or Movie'])['IMDb Score'].mean().reset_index()

# Creating the bar plot (one color per 'Series or Movie' value)
fig = px.bar(runtime_series_data, x='Runtime', y='IMDb Score', color='Series or Movie', 
             text='IMDb Score', title='Average Rating by Runtime and Series or Movie')

# Updating traces: value printed above each bar with two decimals
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}',
    textfont_size=16,
)

# Unique scores and layout update
# (min/max of the distinct scores, +/- 0.5, define the y-axis range)
unique_scores = sorted(df['IMDb Score'].unique())
fig.update_layout(
    xaxis_title='Runtime',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    barmode='group',  # Place bars next to each other instead of stacking
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    width=650,
    height=500,
    xaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14),
        tickangle=45  # Rotate the x-axis labels by 45 degrees if needed
    ),
    yaxis=dict(
        tickmode='linear',
        tickvals=unique_scores,
        range=[min(unique_scores) - 0.5, max(unique_scores) + 0.5],
        tickfont=dict(size=14)
    ),
    template='plotly_white',
)

# Configurations for exporting the plot (file name of the downloaded image)
config = {
    'toImageButtonOptions': {
        'filename': 'Rating_Runtime_Series'
    }
}

# Display the plot
fig.show(config=config)


### Series or Movie

In [None]:
# Display the column index of the working frame.
df.columns

In [None]:
# Avg Rating by Film Type: mean IMDb score per 'Series or Movie' value
film_type_mean_rating = df.groupby('Series or Movie')['IMDb Score'].mean().reset_index()
# NOTE(review): leftover from the yearly plot, kept disabled (unrelated to film type):
# yearly_data = yearly_data.sort_values(by='released_day', ascending=False).head(15)

fig = px.bar(film_type_mean_rating, x='Series or Movie', y='IMDb Score', text='IMDb Score', title='Average Rating by Film Type')
fig.update_traces(
    textposition= 'outside',
    texttemplate='%{text:.2f}',
    textfont_size= 16,
    marker_color= '#636EFA',
)
# Distinct scores in the data; their min/max (+/- 0.5) define the y-axis range below.
# This name is also read by later genre cells that do not recompute it.
unique_scores = sorted(df['IMDb Score'].unique())
fig.update_layout(
    xaxis_title = 'Film Type',
    yaxis_title = 'Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width = 550,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(tickmode='linear', 
               tickfont=dict(size=14)),
    yaxis=dict(tickmode='linear',
               tickvals=unique_scores,
               range=[min(unique_scores)-0.5, max(unique_scores)+0.5],
               tickfont=dict(size=14)
               ),
    template='plotly_white',
)

# File name used by the figure's "download as image" button.
config = {
    'toImageButtonOptions': {
        'filename': 'Rating and Film Type'
    }
}

fig.show(config=config)



In [None]:
# Overlaid, filled density curves of the IMDb score, one per film type; saved as a PNG.
sns.set_theme(style="white")
kde_ax = sns.kdeplot(
    data=df,
    x="IMDb Score",
    hue="Series or Movie",
    fill=True,
    alpha=0.4,
    linewidth=1.5,
)
# Title and axis labels (the x label is intentionally left empty).
kde_ax.set_title("Rating Distribution by Film Type")
kde_ax.set_xlabel("")
kde_ax.set_ylabel("Density")
plt.savefig("Rating Distribution by Film Type.png", dpi=300)
plt.show()

In [None]:
# Box plot of the raw IMDb scores for each film type.
fig = px.box(df, x="Series or Movie", y="IMDb Score", title="Box Plot of IMDb Ratings by Film's Type")
fig.update_layout(
        template='plotly_white',
        xaxis_title = "Film's Type",
        # A box plot shows the distribution of individual scores, not an average,
        # so the axis is labelled 'IMDb Score' (it previously read 'Avg IMDb Score').
        yaxis_title = 'IMDb Score',
        title_x=0.5,
        title_y=0.85,
        width=550,
        xaxis_title_font=dict(size=17),
        yaxis_title_font=dict(size=17),
        title_font=dict(size=22),
        xaxis=dict(tickfont=dict(size=14)),
        yaxis=dict(tickfont=dict(size=14)),
)
# File name used by the figure's "download as image" button.
config = {
    'toImageButtonOptions': {
        'filename': 'Box Plot - Rating and Film type'
    }
}

fig.show(config=config)

### Runtime

In [None]:
# Box plot of the raw IMDb scores for each Runtime value.
fig = px.box(df, x="Runtime", y="IMDb Score", title="Box Plot of IMDb Ratings by Runtime")
fig.update_layout(
        template='plotly_white',
        xaxis_title = "Film's Runtime",
        # A box plot shows the distribution of individual scores, not an average,
        # so the axis is labelled 'IMDb Score' (it previously read 'Avg IMDb Score').
        yaxis_title = 'IMDb Score',
        title_x=0.5,
        title_y=0.85,
        width=550,
        xaxis_title_font=dict(size=17),
        yaxis_title_font=dict(size=17),
        title_font=dict(size=22),
        xaxis=dict(tickfont=dict(size=14)),
        yaxis=dict(tickfont=dict(size=14)),
)
# File name used by the figure's "download as image" button.
config = {
    'toImageButtonOptions': {
        'filename': 'Box Plot - Rating and Runtime'
    }
}

fig.show(config=config)

### 

## Genre

### Plot 1: Most Frequent Genres and Avg Rating (After Sorting Genre)

In [None]:
# Independent working copy for the genre analysis, without the rows that have no genre.
genre_data = df.dropna(subset=['Genre']).copy()

In [None]:
# Display the raw 'Genre' column of the working copy.
genre_data['Genre']

In [None]:
def process_col(col):
    """Normalize a comma-separated cell and sort its entries alphabetically.

    Each entry is lower-cased and stripped of surrounding whitespace; the entries
    are then sorted and joined back together with ', '. Missing values (as detected
    by ``pd.isna``) are returned unchanged.
    """
    if pd.isna(col):
        return col
    entries = []
    for word in col.split(','):
        entries.append(word.lower().strip())
    entries.sort()
    return ', '.join(entries)


In [None]:
# Normalized, alphabetically sorted genre string per row (see process_col), stored in a new column.
genre_data['sorted_genres'] = genre_data['Genre'].apply(process_col)

In [None]:
# Share (in percent) of each sorted genre combination among the rows of genre_data.
genre_frequency = genre_data['sorted_genres'].value_counts() / len(genre_data) * 100 
# convert to dataframe
genre_frequency = genre_frequency.reset_index()
# Rename the two columns (the automatic names are printed before and after for reference).
print(genre_frequency.columns)
genre_frequency.columns = ['Genre', 'Percentage']
print(genre_frequency.columns)
# Order by share, largest first. A stable sort keeps tied combinations in their existing order.
genre_percentage = genre_frequency.sort_values(by='Percentage', ascending=False, kind='mergesort')
# Save the top 20 — taken from the explicitly sorted frame; it was previously taken from
# genre_frequency, which left the sort above unused for this selection.
top_20_genres = genre_percentage.head(20)

In [None]:
# Histogram-style bar chart of the 20 most frequent genre combinations (share in percent).
fig = px.histogram(top_20_genres, x='Genre', y='Percentage', text_auto=True, title='20 Most Frequent Genre Combinations')
fig.update_traces(
    texttemplate='%{y:.2f}%',
    textposition='outside',
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title='Genres',
    yaxis_title='Percentage',
    title_x=0.5,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(
        tickmode='linear', 
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        # tickformat takes a d3-format specifier; '.2f%%' (Python-style escaping) is not a
        # valid one, so the percent sign is appended with ticksuffix instead.
        tickformat=".2f",
        ticksuffix="%",
        tickfont=dict(size=14),
        range=[0, 100]
    ),
    template='plotly_white',
)

# File name used by the figure's "download as image" button.
config = {
    'toImageButtonOptions': {
        'filename': '20 Most frequent Genre Combinations'
    }
}
fig.show(config=config)

In [None]:
# Plotting the bar plot: same data as the histogram cell, with the share printed above each bar
fig = px.bar(top_20_genres, x='Genre', y='Percentage', text='Percentage', title='20 Most Frequent Genre Combinations')
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}%',
    textfont_size=16,
    marker_color='#636EFA',
)
# The y-axis is fixed to the full 0-100 percent range.
fig.update_layout(
    xaxis_title = 'Genres',
    yaxis_title = 'Percentage',
    title_x=0.5,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(tickmode='linear', 
               tickfont=dict(size=14)),
    yaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14),
        range=[0, 100]
    ),
    template='plotly_white',
)

# File name used by the figure's "download as image" button.
config = {
    'toImageButtonOptions': {
        'filename': '20 Most frequent Genre Combinations'
    }
}

fig.show(config=config)

In [None]:
# avg rating by most frequent genres 
# Mean IMDb score per sorted genre combination.
genre_mean_imdb = genre_data.groupby('sorted_genres')['IMDb Score'].mean().reset_index()
print(genre_mean_imdb.columns)
genre_mean_imdb.columns = ['Genre', 'Avg IMDb Score']

# Attach the mean score to each combination's frequency row (matched on 'Genre').
genre_stats = pd.merge(genre_percentage, genre_mean_imdb, on='Genre')
# NOTE(review): despite its name, top_30_genres holds the first 20 rows of genre_stats.
top_30_genres = genre_stats.head(20)

In [None]:
# Average IMDb score of the most frequent (sorted) genre combinations.
# Reads `unique_scores` from an earlier cell for the y-axis range.
fig = px.bar(top_30_genres, x='Genre', y='Avg IMDb Score', text='Avg IMDb Score', title='20 Most Frequent Genre Combinations and Avg Rating')
fig.update_traces(
    textposition='outside',
    # The bar labels are IMDb scores, not percentages, so no '%' is appended
    # (the template previously ended in '%').
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title = 'Genres',
    yaxis_title = 'Avg IMDb Score',
    title_x=0.5,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(tickmode='linear', 
               tickfont=dict(size=14)),
    yaxis=dict(tickmode='linear',
               tickvals=unique_scores,
               range=[min(unique_scores)-0.5, max(unique_scores)+0.5],
               tickfont=dict(size=14)
               ),
    template='plotly_white',
)

# File name used by the figure's "download as image" button.
# NOTE(review): same export name as the frequency plots — consider a distinct one.
config = {
    'toImageButtonOptions': {
        'filename': '20 Most frequent Genre Combinations'
    }
}

fig.show(config=config)

### Plot 2: Avg Rating and most Frequent Genres (Without Sorting)

In [None]:
# Fresh working copy for the unsorted-genre analysis, without the rows that have no genre.
genre_data = df.dropna(subset=['Genre']).copy()

In [None]:
def process_col_without_sorting(col):
    """Normalize a comma-separated cell while keeping the original entry order.

    Each entry is lower-cased and stripped of surrounding whitespace, then the
    entries are joined back together with ', ' in the order they appeared (no sorting).
    Missing values (as detected by ``pd.isna``) are returned unchanged.
    """
    if pd.isna(col):
        return col
    cleaned = map(lambda word: word.lower().strip(), col.split(','))
    return ', '.join(cleaned)

In [None]:
# Overwrite 'Genre' with its normalized form, keeping the original genre order within each row.
genre_data['Genre'] = genre_data['Genre'].apply(process_col_without_sorting)

In [None]:
# Share (in percent) of each genre combination (original order) among the rows of genre_data.
genre_frequency = genre_data['Genre'].value_counts() / len(genre_data) * 100 
# convert to dataframe
genre_frequency = genre_frequency.reset_index()
# Rename the two columns (the automatic names are printed before and after for reference).
print(genre_frequency.columns)
genre_frequency.columns = ['Genre', 'Percentage']
print(genre_frequency.columns)
# Order by share, largest first. A stable sort keeps tied combinations in their existing order.
genre_percentage = genre_frequency.sort_values(by='Percentage', ascending=False, kind='mergesort')
# Save the top 20 — taken from the explicitly sorted frame; it was previously taken from
# genre_frequency, which left the sort above unused for this selection.
top_20_genres = genre_percentage.head(20)

In [None]:
# Most frequent genre combinations (genre order within a row preserved), as a share in percent.
fig = px.bar(top_20_genres, x='Genre', y='Percentage', text='Percentage', title='20 Most Frequent Genre Combinations')
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}%',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title = 'Genres',
    yaxis_title = 'Percentage',
    title_x=0.5,
    title_y=0.85,
    # width=850,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(tickmode='linear', 
               tickfont=dict(size=14)),
    yaxis=dict(
        # This axis shows percentages. It previously reused the IMDb-score ticks and range
        # (`unique_scores`), which cut off the base of the bars and could clip tall ones.
        # The range now starts at zero and leaves headroom for the labels above the bars.
        range=[0, top_20_genres['Percentage'].max() * 1.2],
        tickfont=dict(size=14)
    ),
    template='plotly_white',
)

# File name used by the figure's "download as image" button.
config = {
    'toImageButtonOptions': {
        'filename': '20 Most frequent Genre Combinations'
    }
}

fig.show(config=config)

In [None]:
# Avg Rating by Genre (without sorting genre): data preparation for the plot in the next cell
# Mean IMDb score per genre combination.
genre_mean_imdb = genre_data.groupby('Genre')['IMDb Score'].mean().reset_index()
print(genre_mean_imdb.columns)
genre_mean_imdb.columns = ['Genre', 'Avg IMDb Score']

# Attach the mean score to each combination's frequency row (matched on 'Genre').
genre_stats = pd.merge(genre_percentage, genre_mean_imdb, on='Genre')
# NOTE(review): despite its name, top_30_genres holds the first 20 rows of genre_stats.
top_30_genres = genre_stats.head(20)

In [None]:
# Plotting the bar plot: average IMDb score of the most frequent genre combinations
# (genre order preserved). Reads `unique_scores` from an earlier cell for the y-axis range.
fig = px.bar(top_30_genres, x='Genre', y='Avg IMDb Score', text='Avg IMDb Score', title='20 Most Frequent Genre Combinations and Avg Rating')
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title = 'Genres',
    yaxis_title = 'Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width=1000,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(tickmode='linear', 
               tickfont=dict(size=14)),
    yaxis=dict(tickmode='linear',
               tickvals=unique_scores,
               range=[min(unique_scores)-0.5, max(unique_scores)+0.5],
               tickfont=dict(size=14)
               ),
    template='plotly_white',
)

# File name used by the figure's "download as image" button.
# NOTE(review): same export name as the frequency plots — consider a distinct one.
config = {
    'toImageButtonOptions': {
        'filename': '20 Most frequent Genre Combinations'
    }
}

fig.show(config=config)

### Plot 3: Genres Frequency (Only First Genre)

In [None]:
# Fresh working copy without the rows that have no genre, with 'Genre' normalized
# (lower-cased, trimmed, original order kept).
genre_data = df.dropna(subset=['Genre']).copy()
normalized_genres = genre_data['Genre'].apply(process_col_without_sorting)
genre_data['Genre'] = normalized_genres

In [None]:
def save_first_val(col):
    """Return the part of a comma-separated string that comes before the first comma.

    The whole string is returned when it contains no comma.
    """
    first, _, _ = col.partition(',')
    return first

# Keep only the first listed genre of every row.
genre_data['Genre'] = genre_data['Genre'].apply(save_first_val)


# calculate the percentage of each value 
genre_frequency = genre_data['Genre'].value_counts() / len(genre_data) * 100 
# convert to dataframe
genre_frequency = genre_frequency.reset_index()
# change the name of columns (the automatic names are printed before and after for reference)
print(genre_frequency.columns)
genre_frequency.columns = ['Genre', 'Percentage']
print(genre_frequency.columns)
# Order by share, largest first; all first genres are kept (no top-N cut in this section).
genre_percentage = genre_frequency.sort_values(by='Percentage', ascending=False)
# Disabled top-20 selection from the previous sections:
# top_20_genres = genre_frequency.head(20)

In [None]:
# Histogram-style bar chart of how often each first-listed genre occurs (share in percent).
fig = px.histogram(genre_percentage, x='Genre', y='Percentage', text_auto = True, title='Most Frequent Genre Combinations')
fig.update_traces(
    texttemplate='%{y:.2f}%',
    textfont=dict(size=50),
    textposition='outside',
    marker_color='#636EFA',
    insidetextfont=dict(size=30),
    outsidetextfont=dict(size=30),
)
fig.update_layout(
    xaxis_title='Genres',
    yaxis_title='Percentage',
    title_x=0.5,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(
        tickmode='linear', 
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        # tickformat takes a d3-format specifier; '.2f%%' (Python-style escaping) is not a
        # valid one, so the percent sign is appended with ticksuffix instead.
        tickformat=".2f",
        ticksuffix="%",
        tickfont=dict(size=14),
    ),
    template='plotly_white',
)

# File name used by the figure's "download as image" button.
config = {
    'toImageButtonOptions': {
        'filename': 'Most frequent Genre Combinations'
    }
}
fig.show(config=config)

In [None]:
# Frequency of each first-listed genre, as a bar chart with the share printed above each bar.
fig = px.bar(genre_percentage, x='Genre', y='Percentage', text='Percentage', title='Genre frequencies (showed as percentage)')
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}',
    textfont_size= 30,
    marker_color='#636EFA',
)

fig.update_layout(
    height = 600,
    xaxis_title = 'Genres',
    yaxis_title = 'Percentage',
    title_x=0.5,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(tickmode='linear', 
               tickfont=dict(size=14)),
    yaxis=dict(tickmode='linear',
               tickfont=dict(size=14)
               ),
    template='plotly_white',
)

# File name used by the figure's "download as image" button.
# NOTE(review): the name says "20 Most frequent Genre Combinations" but this plot shows
# all first-listed genres — consider a distinct export name.
config = {
    'toImageButtonOptions': {
        'filename': '20 Most frequent Genre Combinations'
    }
}

fig.show(config=config)

### Plot 4: Avg Rating by Genre (Only First Genre)

In [None]:
# Avg rating by genre table
# Mean IMDb score per first-listed genre.
genre_mean_imdb = genre_data.groupby('Genre')['IMDb Score'].mean().reset_index()
print(genre_mean_imdb.columns)
genre_mean_imdb.columns = ['Genre', 'Avg IMDb Score']
# Attach the mean score to each genre's frequency row (matched on 'Genre').
genre_stats = pd.merge(genre_percentage, genre_mean_imdb, on='Genre')

In [None]:
# Average IMDb score per first-listed genre (all genres in genre_stats are plotted).
# Reads `unique_scores` from an earlier cell for the y-axis range.
fig = px.bar(genre_stats, x='Genre', y='Avg IMDb Score', text='Avg IMDb Score', title='Avg Rating by Genre')
fig.update_traces(
    textposition='outside',
    # The bar labels are IMDb scores, not percentages, so no '%' is appended
    # (the template previously ended in '%').
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title='Genres',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(
        tickmode='linear',
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        tickmode='linear',
        tickvals=unique_scores,
        range=[min(unique_scores)-0.5, max(unique_scores)+0.5],
        tickfont=dict(size=14)
    ),
    template='plotly_white',
)

# File name used by the figure's "download as image" button.
# NOTE(review): same export name as the frequency plots — consider a distinct one.
config = {
    'toImageButtonOptions': {
        'filename': '20 Most frequent Genre Combinations'
    }
}

fig.show(config=config)


## Director

In [None]:
# Independent working copy for the director analysis, without the rows that have no director.
director_data = df.dropna(subset=['Director']).copy()
# Preview of the raw 'Director' values.
print(director_data['Director'].head(15))

In [None]:
# 1. pre-process category by keeping the first value in each row
def pre_process_category(value):
    """Reduce a comma-separated cell to its first entry, lower-cased and trimmed.

    Parameters
    ----------
    value : str or any
        Cell content such as ``"Name A, Name B"``. Non-string values (e.g. NaN or
        None) are passed through unchanged.

    Returns
    -------
    str or any
        The first comma-separated entry in lower case without surrounding
        whitespace, or ``value`` itself when it is not a string.
    """
    if not isinstance(value, str):
        return value
    # str.lower() returns a new string; the result was previously discarded, so the
    # intended lower-casing never took effect.
    value = value.lower()
    # Only the first entry of the comma-separated list is kept.
    return value.split(',')[0].strip()
    
# Replace each Director value with the output of pre_process_category and
# preview the first 15 results.
director_data = director_data.assign(
    Director=director_data['Director'].apply(pre_process_category)
)
print(director_data['Director'].iloc[:15])

In [None]:
# Share of titles (in percent) attributed to each director value.
director_data_freq = (
    director_data['Director'].value_counts() / len(director_data) * 100
).reset_index()
# reset_index() auto-names the columns; show them, then assign explicit names.
print(director_data_freq.columns)
director_data_freq.columns = ['Director', 'Percentage']
print(director_data_freq.columns)
# Order from most to least frequent and keep the first 50 rows.
director_data_percentage = director_data_freq.sort_values('Percentage', ascending=False)
top_50_directors = director_data_percentage.iloc[:50]

In [None]:
# Histogram (one bar per director) of the percentage of titles for the
# 50 most frequent directors.
fig = px.histogram(top_50_directors, x='Director', y='Percentage', text_auto=True, title='Most Frequent Directors Combinations')
fig.update_traces(
    # Bar labels: percentage with two decimals and a literal percent sign.
    texttemplate='%{y:.2f}%',
    textfont=dict(size=50),
    textposition='outside',
    marker_color='#636EFA',
    insidetextfont=dict(size=30),
    outsidetextfont=dict(size=30),
)
fig.update_layout(
    xaxis_title='Director',
    yaxis_title='Percentage',
    title_x=0.5,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22),
    xaxis=dict(
        # On a categorical axis this forces a tick label for every director.
        tickmode='linear',
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        # d3-format has no "%%" escape (and its "%" type multiplies by 100),
        # so format the number with '.2f' and append the sign via ticksuffix.
        tickformat='.2f',
        ticksuffix='%',
        tickfont=dict(size=14),
    ),
    template='plotly_white',
)

# Name the image exported from the modebar after this chart (the previous
# value referred to genre combinations).
config = {
    'toImageButtonOptions': {
        'filename': 'Most Frequent Directors'
    }
}
fig.show(config=config)

### Director: plotting avg rating by director

In [None]:
# Mean IMDb score per director, joined onto the 50 most frequent directors.
director_mean_imdb = director_data.groupby('Director', as_index=False)['IMDb Score'].mean()
print(director_mean_imdb.columns)
# Give the aggregated column an explicit name before merging.
director_mean_imdb.columns = ['Director', 'Avg IMDb Score']
director_stats = pd.merge(left=top_50_directors, right=director_mean_imdb, on='Director')

In [None]:
# Bar chart of the average IMDb score for each director in director_stats.
fig = px.bar(director_stats, x='Director', y='Avg IMDb Score', text='Avg IMDb Score', title='Avg Rating by Director')
fig.update_traces(
    textposition='outside',
    # The labels are IMDb scores, not percentages, so no '%' suffix is added.
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title='Director',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width=650,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(
        # On a categorical axis this forces a tick label for every director.
        tickmode='linear',
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        tickmode='linear',
        # NOTE(review): Plotly documents tickvals as only taking effect when
        # tickmode='array'; with 'linear' it is ignored. Left unchanged so the
        # rendered axis stays the same - confirm which behaviour is wanted.
        tickvals=unique_scores,
        range=[min(unique_scores)-0.5, max(unique_scores)+0.5],
        tickfont=dict(size=14)
    ),
    template='plotly_white',
)

# Name the image exported from the modebar after this chart (the previous
# value was copied from the genre-combinations plot).
config = {
    'toImageButtonOptions': {
        'filename': 'Avg Rating by Director'
    }
}

fig.show(config=config)


## Writer 

In [None]:
# Independent copy of df restricted to rows that have a Writer value.
writer_data = df.dropna(subset=['Writer']).copy()
# Preview the raw values, run them through pre_process_category, preview again.
print(writer_data['Writer'].head(15))
writer_data = writer_data.assign(
    Writer=writer_data['Writer'].apply(pre_process_category)
)
print(writer_data['Writer'].head(15))

In [None]:
# Share of titles (in percent) attributed to each writer value.
# The denominator must be the writer subset itself: nulls are dropped per
# column, so writer_data and director_data can have different lengths.
writer_data_freq = writer_data['Writer'].value_counts() / len(writer_data) * 100
# convert to dataframe
writer_data_freq = writer_data_freq.reset_index()
# reset_index() auto-names the columns; show them, then assign explicit names
print(writer_data_freq.columns)
writer_data_freq.columns = ['Writer', 'Percentage']
print(writer_data_freq.columns)
writer_data_percentage = writer_data_freq.sort_values(by='Percentage', ascending=False)
# Keep the 50 most frequent writers
top_50_writers = writer_data_percentage.head(50)

# Mean IMDb score per writer, joined onto the 50 most frequent writers
writer_mean_imdb = writer_data.groupby('Writer')['IMDb Score'].mean().reset_index()
print(writer_mean_imdb.columns)
writer_mean_imdb.columns = ['Writer', 'Avg IMDb Score']
writer_stats = pd.merge(top_50_writers, writer_mean_imdb, on='Writer')

In [None]:
# Bar chart of the average IMDb score for each writer in writer_stats.
fig = px.bar(writer_stats, x='Writer', y='Avg IMDb Score', text='Avg IMDb Score', title='Avg Rating by Writer')
fig.update_traces(
    textposition='outside',
    # The labels are IMDb scores, not percentages, so no '%' suffix is added.
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title='Writer',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width=650,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(
        # On a categorical axis this forces a tick label for every writer.
        tickmode='linear',
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        tickmode='linear',
        # NOTE(review): Plotly documents tickvals as only taking effect when
        # tickmode='array'; with 'linear' it is ignored. Left unchanged so the
        # rendered axis stays the same - confirm which behaviour is wanted.
        tickvals=unique_scores,
        range=[min(unique_scores)-0.5, max(unique_scores)+0.5],
        tickfont=dict(size=14)
    ),
    template='plotly_white',
)

# Name the image exported from the modebar after this chart (the previous
# value was copied from the genre-combinations plot).
config = {
    'toImageButtonOptions': {
        'filename': 'Avg Rating by Writer'
    }
}

fig.show(config=config)


## Actors

In [None]:
# Display the column labels of df (cell output only; nothing is assigned).
df.columns

In [None]:
# Independent copy of df restricted to rows that have an Actors value.
actor_data = df.dropna(subset=['Actors']).copy()
# Preview the raw values, run them through pre_process_category, preview again.
print(actor_data['Actors'].head(15))
actor_data = actor_data.assign(
    Actors=actor_data['Actors'].apply(pre_process_category)
)
print(actor_data['Actors'].head(15))

In [None]:
# Share of titles (in percent) attributed to each actor value.
# The denominator must be the actor subset itself: nulls are dropped per
# column, so actor_data and director_data can have different lengths.
actor_data_freq = actor_data['Actors'].value_counts() / len(actor_data) * 100
# convert to dataframe
actor_data_freq = actor_data_freq.reset_index()
# reset_index() auto-names the columns; show them, then assign explicit names
print(actor_data_freq.columns)
actor_data_freq.columns = ['Actors', 'Percentage']
print(actor_data_freq.columns)
actor_data_percentage = actor_data_freq.sort_values(by='Percentage', ascending=False)
# Keep the 50 most frequent actors
top_50_actors = actor_data_percentage.head(50)

# Mean IMDb score per actor, joined onto the 50 most frequent actors
actor_mean_imdb = actor_data.groupby('Actors')['IMDb Score'].mean().reset_index()
print(actor_mean_imdb.columns)
actor_mean_imdb.columns = ['Actors', 'Avg IMDb Score']
actor_stats = pd.merge(top_50_actors, actor_mean_imdb, on='Actors')

In [None]:
# Bar chart of the average IMDb score for each actor in actor_stats.
fig = px.bar(actor_stats, x='Actors', y='Avg IMDb Score', text='Avg IMDb Score', title='Avg Rating by Actor')
fig.update_traces(
    textposition='outside',
    # The labels are IMDb scores, not percentages, so no '%' suffix is added.
    texttemplate='%{text:.2f}',
    textfont_size=16,
    marker_color='#636EFA',
)
fig.update_layout(
    xaxis_title='Actors',
    yaxis_title='Avg IMDb Score',
    title_x=0.5,
    title_y=0.85,
    width=650,
    xaxis_title_font=dict(size=17),
    yaxis_title_font=dict(size=17),
    title_font=dict(size=22, family='Arial Black'),
    xaxis=dict(
        # On a categorical axis this forces a tick label for every actor.
        tickmode='linear',
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        tickmode='linear',
        # NOTE(review): Plotly documents tickvals as only taking effect when
        # tickmode='array'; with 'linear' it is ignored. Left unchanged so the
        # rendered axis stays the same - confirm which behaviour is wanted.
        tickvals=unique_scores,
        range=[min(unique_scores)-0.5, max(unique_scores)+0.5],
        tickfont=dict(size=14)
    ),
    template='plotly_white',
)

# Name the image exported from the modebar after this chart (the previous
# value was copied from the genre-combinations plot).
config = {
    'toImageButtonOptions': {
        'filename': 'Avg Rating by Actor'
    }
}

fig.show(config=config)
