In [2]:
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

from wordcloud import WordCloud
from itertools import chain
from collections import Counter

import statsmodels.api as stm
import statsmodels.formula.api as smf

import networkx as nx
from IPython.display import Image, display

from PIL import Image

In [3]:
def dict_to_list(data, column):
    """Turn a column of dictionary-like strings into lists of their values.

    Rows where `column` is null are dropped. For the remaining rows, every
    substring matched by the pattern `": "(.*?)"` (i.e. the text between the
    quotes that follow a `": ` separator) is collected into a list that
    replaces the original string.

    Parameters:
    - data (pd.DataFrame): The input DataFrame; it is not modified.
    - column (str): Name of the column holding the strings to parse.

    Returns:
    - pd.DataFrame: A new DataFrame without the null rows, with `column`
      holding lists of extracted strings.
    """
    # Take an explicit copy: assigning a column on a filtered slice triggers
    # SettingWithCopyWarning and is not guaranteed to behave consistently.
    filtered_data = data[data[column].notnull()].copy()

    # Extract every quoted value from each string
    filtered_data[column] = filtered_data[column].apply(lambda x: re.findall(r'": "(.*?)"', x))

    return filtered_data

def extract_first_item(genre_list):
    """Return the first element of `genre_list`, or None.

    None is returned when the argument is not a `list` instance or when the
    list is empty.
    """
    has_items = isinstance(genre_list, list) and len(genre_list) > 0
    return genre_list[0] if has_items else None
    
def _split_compound_genre(genres, compound, first, second):
    """Replace one compound label in `genres` (in place) by its two components.

    Only the first occurrence of `compound` is handled. Afterwards the list
    contains both `first` and `second`; a component that was already present
    is not added a second time.
    """
    if compound not in genres:
        return

    has_first = first in genres
    has_second = second in genres

    if has_first and has_second:
        # Both components already listed: the compound label is redundant
        genres.remove(compound)
    elif has_first:
        # Reuse the compound's slot for the missing component
        genres[genres.index(compound)] = second
    elif has_second:
        genres[genres.index(compound)] = first
    else:
        genres.remove(compound)
        genres.extend([first, second])


def clean_genres(movies):
    """Normalise compound and redundant genre labels in the 'Genres' column.

    Compound labels ('Comedy-drama', 'Romantic comedy', 'Romantic drama',
    'Action/Adventure') are split into their two base genres, and
    'Comedy film' is folded into 'Comedy'. The genre lists are modified in
    place, so the passed DataFrame itself is updated.

    Parameters:
    - movies (pd.DataFrame): DataFrame with a 'Genres' column holding lists of
      genre names. Cells that are not lists are left untouched.

    Returns:
    - pd.DataFrame: The same `movies` object.
    """
    # (compound label, first component, second component); the order matters
    # because a later rule can see components added by an earlier one.
    compound_rules = (
        ('Comedy-drama', 'Comedy', 'Drama'),
        ('Romantic comedy', 'Comedy', 'Romance Film'),
        ('Romantic drama', 'Drama', 'Romance Film'),
        ('Action/Adventure', 'Action', 'Adventure'),
    )

    for genres in movies['Genres']:
        if not isinstance(genres, list):
            continue

        for compound, first, second in compound_rules:
            _split_compound_genre(genres, compound, first, second)

        # Redundant genres
        if 'Comedy film' in genres:
            genres.remove('Comedy film')
            if 'Comedy' not in genres:
                # append, not extend: extend('Comedy') would add the six
                # single characters of the string instead of the genre
                genres.append('Comedy')

    return movies

In [None]:
def replace_spaces_with_underscores(string_list):
    """Return a new list in which every space of every string is replaced by '_'."""
    underscored = []
    for text in string_list:
        underscored.append(text.replace(' ', '_'))
    return underscored

In [None]:
def keep_common_genres(df, n=20):
    """Keep only the `n` most frequent genres in each movie's genre list.

    The function works in place: `df['Genres']` is overwritten with the
    filtered lists and nothing is returned. Rows whose list becomes empty are
    kept.

    Parameters:
    - df (pd.DataFrame): DataFrame with a 'Genres' column holding lists of
      genre names.
    - n (int): Number of most frequent genres to keep (default 20).
    """
    # Count every genre occurrence across all rows
    genre_counts = Counter(chain.from_iterable(df['Genres']))

    # A set gives constant-time membership checks in the filter below
    top_genres = {genre for genre, _ in genre_counts.most_common(n)}

    # Filter each list, preserving the original order of its genres
    df['Genres'] = df['Genres'].apply(lambda genres: [genre for genre in genres if genre in top_genres])

In [None]:
def create_wordcloud(data, mask, color):
    """Build a word cloud of genre frequencies.

    Parameters:
    - data: DataFrame whose 'Genres' column holds iterables of genre names.
    - mask: Passed to WordCloud as `mask`.
    - color: Passed to WordCloud as `contour_color`.

    Returns:
    - The WordCloud object generated from the genre frequencies.
    """
    # Gather every genre occurrence from all rows into one flat list
    all_genres = []
    for genres in data['Genres']:
        for genre in genres:
            all_genres.append(genre)

    # Number of occurrences per genre
    frequencies = pd.Series(all_genres).value_counts()

    # Configure the cloud first, then fill it from the frequencies
    cloud = WordCloud(
        background_color='white',
        mask=mask,
        height=800,
        width=600,
        contour_width=3,
        contour_color=color,
    )
    return cloud.generate_from_frequencies(frequencies)

In [None]:
def transform_mask(imagename):
    """Load an image and convert it into a 2-D integer mask.

    The first channel of the image is used (a 2-D image is used as is), and
    every pixel equal to 0 is replaced by 255; all other values are kept.

    Parameters:
    - imagename: Path (or file object) handed to `Image.open`. `Image` must be
      PIL's module here; the file imports it after IPython's `Image`, so the
      PIL one is the name in effect.

    Returns:
    - np.ndarray: 2-D array of dtype int32 with the same height and width as
      the image.
    """
    # The context manager releases the underlying file once the pixels are read
    with Image.open(imagename) as image:
        mask = np.array(image)

    # Transform 3d image to 2d; images that are already 2-D have no channel axis
    if mask.ndim == 3:
        mask = mask[:, :, 0]

    # Widen first so that 255 always fits, then replace the zero pixels in one
    # vectorised step instead of looping over every pixel in Python.
    mask = mask.astype(np.int32)
    return np.where(mask == 0, 255, mask).astype(np.int32)

In [None]:
def prepare_regression_df(original_df):
    """Reshape per-role rows into one row per movie for regression.

    Parameters:
    - original_df (pd.DataFrame): DataFrame with the columns 'Wiki ID',
      'Role', 'Actor gender', 'Revenue' and 'IMDb rating'. Each
      ('Wiki ID', 'Role') pair must be unique (required by `pivot`), and the
      roles are expected to be 'Primary' and 'Secondary'.

    Returns:
    - pd.DataFrame: Columns 'Wiki ID', 'Revenue', 'IMDb rating',
      'Primary gender' and 'Secondary gender', with 'M' encoded as 1 and 'F'
      as 0. The input is not modified.
    """
    # Pivot the DataFrame to separate primary and secondary roles
    pivot_df = original_df.pivot(index='Wiki ID', columns='Role', values=['Actor gender'])

    # Flatten the MultiIndex columns, e.g. ('Actor gender', 'Primary') -> 'Actor gender_Primary'
    pivot_df.columns = ['_'.join(col).strip() for col in pivot_df.columns.values]

    # Make 'Wiki ID' a regular column again
    pivot_df = pivot_df.reset_index()

    # original_df repeats the movie-level values once per role; drop the exact
    # repeats so the merge does not duplicate every movie in the result.
    movie_df = original_df[['Wiki ID', 'Revenue', 'IMDb rating']].drop_duplicates()
    final_df = pd.merge(movie_df, pivot_df, on='Wiki ID')

    # Rename columns for clarity
    final_df = final_df.rename(columns={'Actor gender_Primary': 'Primary gender', 'Actor gender_Secondary': 'Secondary gender'})

    # Change genders to numerical values
    gender_codes = {'M': 1, 'F': 0}
    for gender_column in ('Primary gender', 'Secondary gender'):
        final_df[gender_column] = final_df[gender_column].replace(gender_codes)

    return final_df

In [None]:
def linear_regression(data, input_columns, output_column):
    """
    Fit an ordinary least squares model on columns of a DataFrame and print its summary.

    Parameters:
    - data (pd.DataFrame): The input DataFrame.
    - input_columns (list): Names of the columns used as input features.
    - output_column (str): Name of the column used as the output variable.

    Returns:
    - results: The fitted results object returned by `stm.OLS(...).fit()`.
    """
    target = data[output_column]

    # The intercept is modelled through an explicit constant column
    design = stm.add_constant(data[input_columns])

    results = stm.OLS(target, design).fit()

    # Show the full regression table
    print(results.summary())

    return results

In [None]:
def filter_n_common(data, n, column):
    """Restrict a DataFrame to the `n` most frequent values of a column.

    Parameters:
    - data (pd.DataFrame): The input DataFrame; it is not modified.
    - n (int): Number of most frequent values to keep.
    - column (str): Column to filter on. 'Genres' is treated as a column of
      lists; any other column is treated as holding one value per row.

    Returns:
    - pd.DataFrame: For 'Genres', a frame whose genre lists contain only the
      top `n` genres, without the rows whose list became empty. Otherwise,
      the rows whose value is among the top `n` values (which are also
      printed).
    """
    # Algorithm to perform if the column is a column of genres
    if column == 'Genres':
        # Count genre occurrences, one count per list element
        genre_counts = data['Genres'].explode().value_counts()

        # A set gives constant-time membership checks in the filter below
        top_n_genres = set(genre_counts.head(n).index)

        def filter_genres(genre_list):
            return [genre for genre in genre_list if genre in top_n_genres]

        # assign() builds a new frame, so the caller's DataFrame keeps its
        # original genre lists instead of being overwritten as a side effect.
        filtered_df = data.assign(Genres=data['Genres'].apply(filter_genres))

        # Remove rows with empty 'Genres' lists (an empty list is falsy)
        return filtered_df[filtered_df['Genres'].astype(bool)]

    # Algorithm to perform if the column contains other values e.g. countries or languages
    top_values = data[column].value_counts().nlargest(n).index.tolist()
    print(top_values)

    # Keep the rows holding one of the top values
    return data[data[column].isin(top_values)]

In [None]:
def plot_gender_revenue_difference(df, column):
    """
    Plot, for each genre, the average of `column` for the 'Male' and 'Female'
    groups as bars, with the male-minus-female difference of the averages
    overlaid on a secondary y-axis.

    Parameters:
    - df: DataFrame containing the columns 'Genres', 'Gender' (with the values
      'Male' and 'Female') and `column`. 'Genres' is exploded, so it is
      presumably a column of lists - TODO confirm against the caller.
    - column: Name of the column to average. 'Revenue' selects the
      revenue-scale axis limits; any other name selects the small-scale limits.

    Returns nothing; the figure is displayed with plt.show().
    """

    # Explode the genres to have one row per genre (so that movies with several count for all of their genres)
    df_duplicated = df.explode('Genres')

    # Group by genre and gender and compute the mean and standard deviation of `column`
    # NOTE: the 'std' aggregate is computed here but not used anywhere below.
    grouped_df = df_duplicated.groupby(['Genres', 'Gender']).agg({column: ['mean', 'std']}).reset_index()

    # Pivot the DataFrame to have Gender as columns
    pivot_df = grouped_df.pivot(index='Genres', columns='Gender').reset_index()

    # Calculate the difference between the male and female averages
    pivot_df['Difference'] = pivot_df[column]['mean']['Male'] - pivot_df[column]['mean']['Female']

    # Genre order shared by both bar plots, taken from the exploded rows
    genre_order = df_duplicated['Genres'].unique()

    # Create the figure and the primary axis
    fig, ax1 = plt.subplots(figsize=(12, 8))

    # Plot bars for the male and female groups on the primary y-axis, from the
    # exploded per-movie rows (seaborn aggregates them per genre and gender)
    sns.barplot(x='Genres', y=column, hue='Gender', data=df_duplicated, ax=ax1, palette={'Male': 'blue', 'Female': 'pink'}, order=genre_order)

    # Create a secondary y-axis for the difference bars
    ax2 = ax1.twinx()

    # Plot the precomputed difference of the averages (one value per genre) on the secondary y-axis
    sns.barplot(x='Genres', y='Difference', data=pivot_df, ax=ax2, color='gray', alpha=0.5, order=genre_order)

    # Set the correct bounds for axes depending on column
    # NOTE(review): these limits are hard-coded; the non-'Revenue' branch looks
    # tuned for a rating-like column - confirm before reusing with other columns.
    if column == 'Revenue':
        ax1.set_ylim(bottom=0, top=20e7)
        ax2.set_ylim(bottom=-10e7, top=6e7)
        ax2.grid(False)
    else:
        ax1.set_ylim(bottom=0, top=20)
        ax2.set_ylim(bottom=-1.25, top=.75)
        ax2.grid(False)

    # Move the legends to the top left corner
    ax1.legend(loc='upper left')

    # Set legend for the secondary y-axis manually, using its first bar as the handle
    ax2.legend([ax2.patches[0]], ['Difference'], loc='upper right')

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=45, ha='right')

    # Add labels and title
    ax1.set_xlabel('Genres')
    ax1.set_ylabel('Average ' + column + ' (Male/Female)', color='black')
    ax2.set_ylabel('Difference in Average ' + column, color='gray')
    ax1.set_title('Average ' + column + ' by Genre and Gender')

    # Disabled debugging aid kept as a bare string literal; it is never executed
    '''
    # Print the heights of the bars on the primary y-axis
    for p in ax1.patches:
        print(f"Height of bar at {p.get_x() + p.get_width() / 2}: {p.get_height()}")
    print(" ")
    for p in ax2.patches:
        print(f"Height of bar at {p.get_x() + p.get_width() / 2}: {p.get_height()}")
    '''

    plt.show()

In [None]:
# Useful functions
def number_similarity(score1, score2):
    """Return 1 minus the absolute difference between the two scores."""
    gap = np.abs(score1 - score2)
    return 1 - gap

def standardise(col):
    """Return `col` centred on its mean and divided by its standard deviation.

    Both statistics come from `col.mean()` and `col.std()`.
    """
    centre = col.mean()
    spread = col.std()
    return (col - centre) / spread

In [None]:
def matching_algorithm(data, treatment, control):
    """Match treatment and control instances on their 'Score' and return the matched rows.

    A graph is built with one edge per (control, treatment) pair, weighted by
    `number_similarity` of the two scores, and a maximum weight matching is
    computed on it.

    Parameters:
    - data (pd.DataFrame): DataFrame from which the matched rows are taken.
    - treatment (pd.DataFrame): Treated instances, with a 'Score' column.
    - control (pd.DataFrame): Control instances, with a 'Score' column.
      The index labels of `treatment` and `control` identify the instances
      and must be labels of `data`.

    Returns:
    - pd.DataFrame: The rows of `data` that belong to a matched pair.
    """
    # Create an empty undirected graph
    G = nx.Graph()

    # Only 'Score' is needed, so iterate over that column directly instead of
    # materialising a full row Series for every pair of instances.
    treatment_scores = list(treatment['Score'].items())

    for control_id, control_score in control['Score'].items():
        # One edge per pair, weighted by the similarity between the two scores
        G.add_weighted_edges_from(
            (control_id, treatment_id, number_similarity(treatment_score, control_score))
            for treatment_id, treatment_score in treatment_scores
        )

    # Compute the maximum weight matching on the generated graph
    matching = nx.max_weight_matching(G)

    matched = [i[0] for i in matching] + [i[1] for i in matching]

    # The node ids are index labels, so select by label (.loc); positional
    # .iloc only gave the right rows when the index happened to be 0..n-1.
    return data.loc[matched]

In [None]:
def plot_boxplots(df1, df2, group_column, output_1, output_2):
    """
    Plot a 2x2 grid of boxplots comparing two DataFrames on two output columns.

    The left column of panels shows df1 (titled 'Unmatched data'), the right
    column shows df2 (titled 'Matched data'); the top row shows `output_1`
    and the bottom row shows `output_2`.

    Parameters:
    - df1: First DataFrame (unmatched data)
    - df2: Second DataFrame (matched data)
    - group_column: Column used to split each boxplot into groups
    - output_1: Column plotted in the top row
    - output_2: Column plotted in the bottom row

    Returns nothing; the figure is displayed with plt.show().
    """

    # Create subplots
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

    # One column of panels per DataFrame, one row of panels per output column
    panels = [(df1, 'Unmatched data'), (df2, 'Matched data')]
    outputs = [output_1, output_2]

    for col_idx, (frame, title) in enumerate(panels):
        for row_idx, output in enumerate(outputs):
            ax = axes[row_idx, col_idx]
            sns.boxplot(x=group_column, y=output, data=frame, ax=ax)
            ax.set_title(title)
            # NOTE(review): the labels assume the first plotted group is male
            # and the second female - confirm against group_column's encoding.
            ax.set_xticklabels(['Male', 'Female'])

    # Adjust layout
    plt.tight_layout()

    plt.show()