In [63]:
import pandas as pd
import numpy as np
import plotly
import dataset
import statsmodels as sts
import seaborn as sns
import matplotlib.pyplot as plt

# Global matplotlib/seaborn styling: white grid background, notebook-sized fonts,
# and thin grid lines.
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.0)
plt.rcParams['grid.linewidth'] = 0.5

# Remaining:

[ ] Aspect radar plots for each community <br>
[ ] Polarity Discussion <br>
[ ] Hypothesis Testing <br>

# Dataset Loading

Keep only necessary dataset(s)

In [64]:
# Load the two base tables through the project-local `dataset` helpers.
# NOTE(review): running this cell emits a pandas DtypeWarning ("Columns (4) have mixed
# types"); presumably one of these loaders reads a CSV without dtype= / low_memory=False — confirm.
augmented_cmu = dataset.get_augmented_cmu()
imdb_df = dataset.get_imdb_dataset()


Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.



### Community Labels

In [65]:
# Human-readable label for every community id found in 'hard_assignment'
community_dict = {
    0: 'Romantic Movies & Social Commentary (C0)',
    1: 'Diverse Drama & Action (C1)',
    2: 'Lighthearted Entertainment (C2)',
    3: 'Dark & Suspenseful Fiction (C3)',
    4: 'Historical & Cultural Narratives (C4)',
}

# Per-movie community assignment, without the unnamed first column stored in the CSV
movie_community = pd.read_csv('Output/cmu_community_assignment.csv')
movie_community = movie_community.drop(columns=['Unnamed: 0'])

# Ids that are not a key of community_dict get None as their label
movie_community['community_label'] = movie_community['hard_assignment'].apply(community_dict.get)

# Keep only movies present in both tables
cmu_community = augmented_cmu.merge(movie_community, how='inner', on='movie_wikipedia_id')

Get community assignments and labels

### Customize Dataset

To prevent merge errors later, we should always initialize the dataset used in this way. Then, everything below this cell is "guaranteed" to run.

### Reviews Dataset

In [66]:
from scipy.interpolate import interp1d
import pandas as pd
import numpy as np
import ast

# Load the data
# The CSV carries an unnamed first column (presumably an index saved by an earlier
# to_csv call); it is dropped right away.
reviews = pd.read_csv('Output/reviews_processed.csv').drop(columns=['Unnamed: 0'])

# Function to convert string representation of a list to an actual list
def string_to_list(string):
    """Parse the text form of a Python literal (e.g. ``"[0.1, -0.3]"``) into the object it denotes.

    Parameters
    ----------
    string : Any
        Cell value to parse. Only ``str`` values are parsed.

    Returns
    -------
    object or float
        The value produced by ``ast.literal_eval`` when the text is a valid literal;
        ``np.nan`` for non-string input or for text that cannot be parsed.
    """
    if not isinstance(string, str):
        return np.nan
    try:
        return ast.literal_eval(string)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval raises SyntaxError for malformed text (e.g. a truncated "[1, 2"),
        # ValueError for well-formed text that is not a literal, and TypeError for some
        # invalid literals; all of them mean "unparsable" here, so one bad cell must not
        # abort the whole Series.apply.
        return np.nan

# Function to normalize sentiment trajectory to a fixed number of points
def normalize_sentiment_trajectory(sentiment_list, num_points=100):
    """Resample a sentiment trajectory to ``num_points`` values by linear interpolation.

    Parameters
    ----------
    sentiment_list : sequence of numbers, or a missing value
        The trajectory to resample. Its first and last values are preserved.
    num_points : int, default 100
        Length of the resampled trajectory.

    Returns
    -------
    numpy.ndarray or float
        Array of length ``num_points``; ``np.nan`` when the input has no length
        (e.g. the float NaN that ``string_to_list`` returns for unparsable cells)
        or has fewer than two points, since interpolation needs at least two.
    """
    try:
        n_observed = len(sentiment_list)
    except TypeError:
        # Missing trajectories arrive as a float NaN (or None), which has no len().
        return np.nan
    if n_observed < 2:
        return np.nan

    # Positions of the observed points and of the resampled points on the same axis
    original_indices = np.linspace(0, n_observed - 1, num=n_observed)
    new_indices = np.linspace(0, n_observed - 1, num=num_points)

    # Interpolate the sentiment_list to the new number of points
    interp_func = interp1d(original_indices, sentiment_list, kind='linear')
    return interp_func(new_indices)


# Parse the stringified trajectories in place, then resample each one to a common
# length (100 points, the default of normalize_sentiment_trajectory).
reviews['sentiment_trajectories'] = reviews['sentiment_trajectories'].apply(string_to_list)
reviews['normalized_trajectories'] = reviews['sentiment_trajectories'].apply(normalize_sentiment_trajectory)


#### Aspect Based Sentiment Analysis Dataset

In [67]:
import pandas as pd
import numpy as np
import dataset
import os

# @Function Aspect Based Sentiment Analysis
def get_absa_dataset():
    """Build the aspect-based sentiment (ABSA) table from the three raw OpenAI output folders.

    Inputs are read from disk (paths relative to the working directory):
      - 'Output/openai_absa/' and 'Output/openai_absa2/' through dataset.concatenate_folder;
        their 'answer' column holds the raw response text (lines separated by a newline
        character in the first folder, by a literal backslash + 'n' in the second).
      - 'Output/openai_absa3/': text files parsed line by line.

    Returns:
        DataFrame with columns 'review_id', 'plot', 'cast', 'production', 'visuals',
        'soundplay', 'dialogue', 'originality' and a fresh 0..n-1 index. Rows with no
        aspect label at all are dropped; remaining missing values become 'not discussed'.
    """
    # Load concatenated datasets -> two folders due to csv formatting
    absa_reviews1 = dataset.concatenate_folder('Output/openai_absa/')
    absa_reviews2 = dataset.concatenate_folder('Output/openai_absa2/')

    # @Function Structure raw response data
    def extract_aspect_sentiments(response, bool_):
        """Parse one raw response into {aspect: sentiment}, both lower-cased.

        bool_ picks the line separator: True -> newline character, False -> the
        two-character sequence backslash + 'n'. A line contributes only when splitting
        it on double quotes yields exactly five pieces (i.e. two quoted strings).
        """
        separator = '\n' if bool_ else '\\n'
        quoted = (line.split('"') for line in response.split(separator))
        # A repeated aspect keeps its last sentiment, exactly as successive dict.update calls would
        return {parts[1].lower(): parts[3].lower() for parts in quoted if len(parts) == 5}

    # Process raw response from OpenAI API
    absa_reviews1['aspect_sentiments'] = absa_reviews1['answer'].apply(lambda x: extract_aspect_sentiments(x, True))
    absa_reviews2['aspect_sentiments'] = absa_reviews2['answer'].apply(lambda x: extract_aspect_sentiments(x, False))

    def get_absa_from_txt(root):
        """Parse every file in `root` into a DataFrame with 'review_id' and 'aspect_sentiments'."""
        raw_data = []
        for file in os.listdir(root):
            with open(os.path.join(root, file), 'r') as f:
                raw_data.extend(f.readlines())

        output = []
        for line in raw_data:
            # Each line is colon-separated: field 1 carries the quoted review id,
            # field 2 the response text.
            c = line.split(':')
            review_id = c[1].split("'")[1]
            fragments = c[2].split('\\n')
            if len(fragments) <= 1:
                continue

            combined_dict = {}
            for fragment in fragments:
                # Isolate the '[...]' span. The closing bracket is searched from the
                # opening one, so the slice ends exactly at ']' (the previous code
                # applied an offset measured on the unsliced string).
                start = fragment.find('[')
                end = fragment.find(']', start + 1) if start != -1 else -1
                if end == -1:
                    continue
                parts = fragment[start:end + 1].split('"')
                # Need two quoted strings (aspect, sentiment); skip malformed fragments
                # instead of failing, like extract_aspect_sentiments does.
                if len(parts) >= 4:
                    combined_dict[parts[1].lower()] = parts[3].lower()

            if combined_dict:
                output.append({'review_id': review_id, 'aspect_sentiments': combined_dict})

        return pd.DataFrame(output)

    absa_reviews3 = get_absa_from_txt('Output/openai_absa3/')

    absa_reviews = pd.concat(objs=[absa_reviews1, absa_reviews2, absa_reviews3], axis=0)
    # Drop raw-column and normalize json
    absa_reviews = absa_reviews.drop(columns='answer')
    aspect_sentiments = pd.json_normalize(absa_reviews['aspect_sentiments'])
    # Reset index before concatenation
    absa_reviews = absa_reviews.reset_index(drop=True)
    aspect_sentiments = aspect_sentiments.reset_index(drop=True)
    # Concatenate dataframes and keep relevant columns
    processed_absa = pd.concat([absa_reviews.drop('aspect_sentiments', axis=1), aspect_sentiments], axis=1)
    processed_absa = processed_absa[['review_id', 'plot', 'cast', 'production', 'visuals', 'soundplay', 'dialogue', 'originality']]
    # Manual corrections
    aspects = list(processed_absa.columns)
    aspects.remove('review_id')
    processed_absa = processed_absa.dropna(subset=aspects, how='all')

    def correct_label_mistake(label):
        """Map a noisy label onto the first canonical label it contains; NaN stays NaN."""
        correct_labels = ['mixed', 'negative', 'neutral', 'not discussed', 'positive']
        if pd.isnull(label):
            return np.nan
        if label in correct_labels:
            return label
        # Unrecognised labels are returned unchanged
        return next((c for c in correct_labels if label.find(c) != -1), label)

    for a in aspects:
        processed_absa[a] = processed_absa[a].apply(correct_label_mistake)

    processed_absa = processed_absa.replace(np.nan, 'not discussed') # Correcting response formatting issue (9 rows)
    processed_absa = processed_absa.replace('satisfactory', 'neutral') # Correcting label mistake
    return processed_absa.reset_index(drop=True)

# Call the function to get the dataset
# (reads the three 'Output/openai_absa*' folders from disk on every run)
processed_absa = get_absa_dataset()

In [68]:
# Movie-level columns taken from the community-annotated CMU table
review_df = cmu_community[[
    'imdb_id', 'hard_assignment', 'runtime', 'languages', 'topic', 'mood',
    'target_audience', 'temporal_setting', 'location_setting',
]].copy()
review_df['community_labels'] = review_df['hard_assignment'].apply(community_dict.get)

# MERGE FOR IMDB DATASET
review_df = review_df.merge(imdb_df, how='inner', on='imdb_id')

# MERGE FOR REVIEWS
merge_df = reviews.merge(processed_absa, how='inner', on='review_id')
review_df = (
    review_df
    .merge(merge_df, how='inner', on=['imdb_id'])
    .sort_values(by='hard_assignment', ascending=True)
)

review_df['release_year'] = review_df['release_year'].astype(int)

# Keep releases from 1960 through 2014, both bounds included
review_df = review_df[review_df['release_year'].between(1960, 2014)]

In [69]:
# Inspect the schema of the merged review table
review_df.columns

Index(['imdb_id', 'hard_assignment', 'runtime', 'languages', 'topic', 'mood',
       'target_audience', 'temporal_setting', 'location_setting',
       'community_labels', 'title_type', 'movie_name', 'is_adult',
       'release_year', 'runtime_minutes', 'genres', 'avg_rating', 'num_votes',
       'review_id', 'rating', 'review_title', 'review_body', 'found_helpful',
       'reactions', 'title_word_count', 'body_word_count', 'review_text',
       'helpfulness_ration', 'subjectivity_score', 'readability',
       'flair_sentiment', 'sentiment_trajectories', 'normalized_trajectories',
       'plot', 'cast', 'production', 'visuals', 'soundplay', 'dialogue',
       'originality'],
      dtype='object')

### Use this for data loading:

In [70]:
# Persist the merged review table, presumably so later analysis can load it directly.
# The DataFrame index is written as well (pandas default), so it comes back as an
# 'Unnamed: 0' column when the file is read with read_csv.
review_df.to_csv('Output/level_analysis/review_data.csv')

# Basic Visualizations

Some basic dataviz to illustrate distributions, label counts, etc. Just to describe the dataset

##### Reviews per Community (by Sentiment)

In [71]:
import plotly.express as px

# Number of reviews per (community, Flair sentiment): one row per community,
# one column per sentiment, POSITIVE first
sentiment_count = (
    review_df
    .groupby(['hard_assignment', 'flair_sentiment'])
    .size()
    .unstack(fill_value=0)
    .loc[:, ['POSITIVE', 'NEGATIVE']]
)

# Stacked bars: one bar per community, one segment per sentiment
fig = px.bar(
    sentiment_count,
    title="POSITIVE and NEGATIVE Reviews per Community",
    labels={'value': 'Number of Reviews', 'hard_assignment': 'Community'},
    text_auto=True,
)
fig.update_layout(
    barmode='stack',
    xaxis_title="Community",
    yaxis_title="Number of Reviews",
    legend_title="Sentiment",
)

# Show the figure
fig.show()

In [72]:
# Review counts per (release year, sentiment), restricted to the POSITIVE / NEGATIVE labels
yearly_sentiment_count = (
    review_df
    .groupby(['release_year', 'flair_sentiment'])
    .size()
    .reset_index(name='count')
    .loc[lambda counts: counts['flair_sentiment'].isin(['POSITIVE', 'NEGATIVE'])]
)

# Wide layout: one row per year, one column per sentiment; absent combinations count as 0
yearly_sentiment_pivot = (
    yearly_sentiment_count
    .pivot(index='release_year', columns='flair_sentiment', values='count')
    .fillna(0)
)

# Stacked bar chart of the yearly counts
fig = px.bar(
    yearly_sentiment_pivot,
    x=yearly_sentiment_pivot.index,
    y=['POSITIVE', 'NEGATIVE'],
    labels={'value': 'Number of Reviews', 'release_year': 'Release Year'},
    title='Number of Positive and Negative Reviews per Year',
)
fig.update_layout(barmode='stack')

# Show the plot
fig.show()

In [81]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import scipy
import numpy as np

# Create subplots for each community
# One row of five panels, titled with the community_dict labels for ids 0-4.
fig = make_subplots(rows=1, cols=5, subplot_titles=[f"{community_dict[i]}" for i in range(5)])

# Mean and confidence band computed along the first axis of `data`
def mean_confidence_interval(data, confidence=0.95):
    """Return the mean over axis 0 together with a two-sided Student-t confidence band.

    Parameters
    ----------
    data : array-like
        Observations stacked along axis 0 (one observation per row).
    confidence : float, default 0.95
        Two-sided confidence level.

    Returns
    -------
    tuple
        ``(mean, lower, upper)`` where ``lower``/``upper`` are the mean minus/plus the
        standard error scaled by the t critical value with ``len(data) - 1`` degrees
        of freedom.
    """
    sample_count = len(data)
    center = np.mean(data, axis=0)
    std_err = scipy.stats.sem(data, axis=0)
    t_critical = scipy.stats.t.ppf((1 + confidence) / 2., sample_count - 1)
    half_width = std_err * t_critical
    return center, center - half_width, center + half_width

# One panel per community: mean normalized trajectory of POSITIVE and NEGATIVE reviews,
# each with a shaded 95% confidence band.
# Expects 'normalized_trajectories' to hold equal-length lists/arrays of numbers.
for community in sorted(review_df['hard_assignment'].unique()):
    community_name = community_dict.get(community)
    panel = dict(row=1, col=community + 1)

    # Reviews of this community that carry a normalized trajectory, split by sentiment
    usable = (review_df['hard_assignment'] == community) & review_df['normalized_trajectories'].notnull()
    positive_reviews = review_df[usable & (review_df['flair_sentiment'] == 'POSITIVE')]
    negative_reviews = review_df[usable & (review_df['flair_sentiment'] == 'NEGATIVE')]

    # Stack the per-review trajectories into 2-D arrays (one row per review)
    pos_data = np.array(positive_reviews['normalized_trajectories'].tolist())
    neg_data = np.array(negative_reviews['normalized_trajectories'].tolist())

    # Point-wise mean and 95% confidence bounds
    pos_mean, pos_ci_lower, pos_ci_upper = mean_confidence_interval(pos_data)
    neg_mean, neg_ci_lower, neg_ci_upper = mean_confidence_interval(neg_data)

    # Mean curves first: positive, then negative
    for prefix, mean_curve, line_color in (
        ('Positive', pos_mean, 'blue'),
        ('Negative', neg_mean, 'red'),
    ):
        fig.add_trace(
            go.Scatter(
                x=np.arange(len(mean_curve)), y=mean_curve,
                mode='lines', name=f'{prefix} {community_name}',
                line=dict(color=line_color),
            ),
            **panel,
        )

    # Confidence bands. The invisible upper bound is added right before the lower
    # bound because fill='tonexty' shades towards the previously added trace.
    for tag, mean_curve, upper_bound, lower_bound, band_color in (
        ('Pos', pos_mean, pos_ci_upper, pos_ci_lower, 'rgba(68, 68, 255, 0.3)'),
        ('Neg', neg_mean, neg_ci_upper, neg_ci_lower, 'rgba(255, 68, 68, 0.3)'),
    ):
        steps = np.arange(len(mean_curve))
        fig.add_trace(
            go.Scatter(
                x=steps, y=upper_bound,
                mode='lines', name=f'Upper CI {tag} {community_name}',
                marker=dict(color="#444"), line=dict(width=0),
                showlegend=False,
            ),
            **panel,
        )
        fig.add_trace(
            go.Scatter(
                x=steps, y=lower_bound,
                mode='lines', name=f'Lower CI {tag} {community_name}',
                marker=dict(color="#444"), line=dict(width=0),
                fillcolor=band_color, fill='tonexty',
                showlegend=False,
            ),
            **panel,
        )

# Update layout
fig.update_layout(height=400, width=1600, title_text="Normalized Sentiment Trajectories by Community",
                  legend_title_text='Sentiment', showlegend=False)


In [111]:
# Review-level aspect labels plus the ids that tie each review to its movie and community.
aspect_analysis = review_df[['imdb_id','hard_assignment', 'review_id', 'plot', 'cast', 'production', 'visuals', 'soundplay',  'dialogue', 'originality']].reset_index(drop=True)
# Id columns only (not used further in this cell).
aspect_assignment_info = aspect_analysis[['imdb_id','hard_assignment', 'review_id']]
# Aspect columns keyed by review.
aspect_details = aspect_analysis[['review_id', 'plot', 'cast', 'production', 'visuals', 'soundplay',  'dialogue', 'originality']]

# Long format: one row per (review, aspect) with its sentiment label, then the number of
# reviews per (aspect, sentiment). Counts are pooled over all communities.
melted_aspect = aspect_details.melt(id_vars=['review_id'], var_name='aspect', value_name='sentiment')
frequency_df = melted_aspect.groupby(['aspect', 'sentiment']).size().reset_index(name='frequency')

import plotly.express as px

# Radar chart: one closed line per sentiment label, one spoke per aspect.
fig = px.line_polar(frequency_df, r='frequency', theta='aspect', color='sentiment', 
                    line_close=True, 
                    title='Aspect-Based Sentiment Analysis in IMDb Reviews')
fig.show()


# Plot Feature Analysis

Linear Regression can possibly go here

In [74]:
# Insert code here

# Map Plotting

Define and describe plot

In [75]:
# Insert code here

# Hypothesis Testing

Define and complete hypothesis testing

### Scatterplots for First Impressions

In [76]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

def create_synced_scatterplots(df, selected_assignment=None):
    """
    Create three side-by-side scatterplots of revenue against avg_rating, num_votes and runtime.
    Points are colored by 'hard_assignment', with the option to highlight a specific 'hard_assignment'.

    Parameters:
    df (DataFrame): A DataFrame containing 'hard_assignment', 'revenue', 'avg_rating',
        'num_votes' and 'runtime' columns.
    selected_assignment (int, optional): The 'hard_assignment' value to highlight. When given,
        every other assignment is drawn at 10% opacity; when None, all are fully opaque.

    Returns:
    fig (Figure): A Plotly figure with one row of three subplots sharing one legend entry
        per assignment.
    """

    # Define the color sequence to use for the plots
    color_sequence = px.colors.qualitative.Plotly

    # Map hard_assignment to a color; the palette is cycled so that assignments beyond
    # its length are still plotted instead of being silently dropped.
    assignment_colors = {
        assignment: color_sequence[position % len(color_sequence)]
        for position, assignment in enumerate(df['hard_assignment'].unique())
    }

    # (y column, y-axis title) of each panel, left to right; x is always revenue
    panels = [
        ('avg_rating', 'Average Rating'),
        ('num_votes', 'Number of Votes'),
        ('runtime', 'Runtime'),
    ]

    # Create subplots
    fig = make_subplots(rows=1, cols=3, subplot_titles=("Revenue vs. Average Rating", "Revenue vs. Number of Votes", "Revenue vs. Runtime"))

    # Loop through each hard_assignment value and create one trace per panel
    for assignment, color in assignment_colors.items():
        # Define the opacity based on whether this assignment is selected or not
        opacity = 1.0 if selected_assignment is None or selected_assignment == assignment else 0.1

        # Filter the DataFrame for the current assignment
        df_filtered = df[df['hard_assignment'] == assignment]

        for column_index, (y_column, _) in enumerate(panels, start=1):
            fig.add_trace(
                go.Scatter(
                    x=df_filtered['revenue'],
                    y=df_filtered[y_column],
                    mode='markers',
                    marker=dict(color=color, opacity=opacity),
                    name=community_dict.get(assignment),
                    # Traces of one assignment share a legend group, so a single
                    # legend entry (shown for the first panel only) toggles all three.
                    legendgroup=f'group{assignment}',
                    showlegend=(column_index == 1)
                ),
                row=1, col=column_index
            )

    # Update x-axis and y-axis labels of every panel
    for column_index, (_, y_title) in enumerate(panels, start=1):
        fig.update_xaxes(title_text="Revenue", row=1, col=column_index)
        fig.update_yaxes(title_text=y_title, row=1, col=column_index)

    # Update layout to show discrete color legend
    fig.update_layout(
        title_text="Revenue Comparisons with Highlighting 'hard_assignment'",
        showlegend=True,
        legend_title_text='Hard Assignment'
    )

    return fig

# NOTE(review): `revenue_df` is not defined in any cell shown above, so this call fails
# with NameError on a fresh kernel (see the saved output); build or load it before this point.
create_synced_scatterplots(revenue_df)

NameError: name 'revenue_df' is not defined

**Comments:** Interesting, but we're unable to see what is actually happening in the range 0-1B in revenue... Let's apply a log-transform on revenue, number of votes and runtime to dampen the effects of extreme values. 

In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import numpy as np

def create_synced_scatterplots(df, selected_assignment=None):
    """
    Create three side-by-side scatterplots of log10(revenue) against avg_rating,
    log10(num_votes) and log10(runtime).
    Points are colored by 'hard_assignment', with the option to highlight a specific 'hard_assignment'.

    Parameters:
    df (DataFrame): A DataFrame containing 'hard_assignment', 'revenue', 'avg_rating',
        'num_votes' and 'runtime' columns.
    selected_assignment (int, optional): The 'hard_assignment' value to highlight. When given,
        every other assignment is drawn at 10% opacity; when None, all are fully opaque.

    Returns:
    fig (Figure): A Plotly figure with one row of three subplots sharing one legend entry
        per assignment.
    """

    # Define the color sequence to use for the plots
    color_sequence = px.colors.qualitative.Plotly

    # Map hard_assignment to a color; the palette is cycled so that assignments beyond
    # its length are still plotted instead of being silently dropped.
    assignment_colors = {
        assignment: color_sequence[position % len(color_sequence)]
        for position, assignment in enumerate(df['hard_assignment'].unique())
    }

    # Create subplots
    fig = make_subplots(rows=1, cols=3, subplot_titles=("Log Revenue vs. Average Rating", "Log Revenue vs. Log Number of Votes", "Log Revenue vs. Log Runtime"))

    # Loop through each hard_assignment value and create one trace per panel
    for assignment, color in assignment_colors.items():
        # Define the opacity based on whether this assignment is selected or not
        opacity = 1.0 if selected_assignment is None or selected_assignment == assignment else 0.1

        # Filter the DataFrame for the current assignment
        df_filtered = df[df['hard_assignment'] == assignment]

        # The rating panel only shows movies with a positive rating. x and y are taken
        # from the SAME filtered rows: Plotly pairs the two sequences by position, so
        # filtering only y would attach ratings to the wrong revenues.
        rated = df_filtered[df_filtered['avg_rating'] > 0.0]

        # (x, y) of each panel, left to right
        panel_points = [
            (np.log10(rated['revenue']), rated['avg_rating']),
            (np.log10(df_filtered['revenue']), np.log10(df_filtered['num_votes'])),
            (np.log10(df_filtered['revenue']), np.log10(df_filtered['runtime'])),
        ]

        for column_index, (x_values, y_values) in enumerate(panel_points, start=1):
            fig.add_trace(
                go.Scatter(
                    x=x_values,
                    y=y_values,
                    mode='markers',
                    marker=dict(color=color, opacity=opacity),
                    name=community_dict.get(assignment),
                    # Traces of one assignment share a legend group, so a single
                    # legend entry (shown for the first panel only) toggles all three.
                    legendgroup=f'group{assignment}',
                    showlegend=(column_index == 1)
                ),
                row=1, col=column_index
            )

    # Update x-axis and y-axis labels
    y_titles = ("Average Rating", "Log Number of Votes", "Log Runtime")
    for column_index, y_title in enumerate(y_titles, start=1):
        fig.update_xaxes(title_text="Log Revenue", row=1, col=column_index)
        fig.update_yaxes(title_text=y_title, row=1, col=column_index)

    # Update layout to show discrete color legend
    fig.update_layout(
        title_text="Revenue Comparisons with Highlighting 'hard_assignment' (Log Scale)",
        showlegend=True,
        legend_title_text='Hard Assignment'
    )

    return fig

# NOTE(review): `revenue_df` must exist before this call, but no cell shown above defines it
# (the earlier, non-log call failed with NameError for the same reason). It needs the
# 'hard_assignment', 'revenue', 'avg_rating', 'num_votes' and 'runtime' columns.

create_synced_scatterplots(revenue_df)

## Community Analysis on Selected Features

Assumption testing framework:
1. Check for normality within each community (Shapiro-Wilk)
2. Test for homogeneity of variances across communities for the provided feature (Brown-Forsythe)
3. Kruskal-Wallis H-test
4. One-way ANOVA (F-test)

In [None]:
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import AnovaRM

def test_assumptions_and_anova(df, feature):
    """
    Check the ANOVA assumptions for a feature and compare it across communities.

    Runs, on the rows where both 'hard_assignment' and the feature are present:
      1. Shapiro-Wilk per community (normality),
      2. Brown-Forsythe, i.e. Levene's test centred on the median (equal variances),
      3. Kruskal-Wallis H-test,
      4. a classic one-way ANOVA (OLS fit + Type II table).

    The ANOVA in step 4 is the standard equal-variance one, not Welch's variant; for a
    heteroscedasticity-robust test see statsmodels' anova_oneway(..., use_var='unequal').

    Parameters:
    df (DataFrame): A DataFrame containing 'hard_assignment' and the feature of interest.
    feature (str): The name of the feature to test. It is interpolated into the model
        formula, so it must be usable as a formula term (e.g. an identifier-like column name).

    Returns:
    None: Prints the results of the tests.
    """

    # Use one complete-case sample for every test: the formula-based OLS fit drops rows
    # with missing values on its own, while the scipy tests do not, so without this the
    # tests would run on different data (or report NaN).
    complete = df.dropna(subset=['hard_assignment', feature])

    # Feature values of each community, in order of first appearance
    samples = {
        group: complete.loc[complete['hard_assignment'] == group, feature]
        for group in complete['hard_assignment'].unique()
    }

    # Test for normality within each community for the feature
    normality_results = {group: stats.shapiro(values) for group, values in samples.items()}

    # Test for homogeneity of variances (Brown-Forsythe Test)
    brown_forsythe_result = stats.levene(*samples.values(), center='median')

    # Perform Kruskal-Wallis H-test
    kruskal_result = stats.kruskal(*[values.values for values in samples.values()])

    # One-way ANOVA: OLS with the community as a categorical factor, Type II table
    model = ols(f'{feature} ~ C(hard_assignment)', data=complete).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)

    # Displaying results
    print(f'Assumption evaluation for {feature}')
    print(f'- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \n')
    print(f'Normality results (Shapiro-Wilk):')
    for key, value in normality_results.items():
        print(f'Community {key}: p-value = {value[1]}')
    print()

    bf_result = {'statistic': brown_forsythe_result[0], 'pvalue': brown_forsythe_result[1]}
    kw_result = {'statistic': kruskal_result[0], 'pvalue': kruskal_result[1]}
    print(f'Homogeneity results (Brown-Forsythe): {bf_result}\n')
    print(f'Variance H-Test (Kruskal-Wallis): {kw_result}\n')
    print(f'One-way ANOVA (OLS, Type II):')
    print(anova_table)
    print(f'\n- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')
    print(f'\nNOTE: Be cautious about ANOVA results if assumptions (normality and homogeneity) are violated.')
    print(f'\n- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')


In [None]:
def create_box_plots(df, feature, log_transform=False, log_floor=1e-4):
    """
    Create box plots for a specified feature with one box per community assignment
    and an additional box for the whole dataset. Optionally apply a log transform to the data.

    Parameters:
    df (DataFrame): A DataFrame containing 'hard_assignment' and the feature of interest.
        It is not modified.
    feature (str): The name of the feature to plot.
    log_transform (bool): If True, plot the natural logarithm of the feature.
    log_floor (float): Values below this are raised to it before taking the log, so that
        zeros and negatives do not produce -inf/NaN. Only used when log_transform is True.

    Returns:
    fig (Figure): A Plotly figure object containing the box plots.
    """
    communities = df['hard_assignment']
    values = df[feature]

    # Apply log transformation if requested (on a derived Series; df stays untouched)
    if log_transform:
        values = np.log(values.clip(lower=log_floor))

    # Create a box plot for each community
    fig = go.Figure()
    for assignment in sorted(communities.unique()):
        fig.add_trace(go.Box(
            y=values[communities == assignment],
            name=community_dict.get(assignment)
        ))

    # Add a box plot for the whole dataset
    fig.add_trace(go.Box(
        y=values,
        name='All Communities'
    ))

    shown_name = f'Log {feature.capitalize()}' if log_transform else feature.capitalize()

    # Update layout; the y-axis title states the scale that is actually plotted
    fig.update_layout(
        title=f"Box Plot of '{shown_name}' by Community Assignment",
        yaxis_title=f'log({feature})' if log_transform else feature,
        boxmode='group'  # Display boxes grouped together
    )

    return fig


In [None]:
import pandas as pd
import scikit_posthocs as sp
import plotly.graph_objects as go

def plot_dunns_test(df, feature, group_col='hard_assignment'):
    """
    Perform Dunn's test for pairwise comparisons and plot the results using Plotly.

    P-values are Bonferroni-adjusted. Each bar is one unordered pair of groups,
    and the y-axis is logarithmic.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    feature (str): The name of the feature for comparison.
    group_col (str): The name of the column containing group labels.

    Returns:
    Plotly Figure: A figure displaying the pairwise comparison results.
    """
    # Collect labels and samples in the same pass over the groupby so that
    # position i of the Dunn result matrix always corresponds to group_labels[i].
    # (groupby yields groups sorted by key, whereas df[group_col].unique() is in
    # order of first appearance; mixing the two can mislabel the comparisons.)
    group_labels = []
    samples = []
    for label, group in df.groupby(group_col):
        group_labels.append(label)
        samples.append(group[feature].values)

    # Perform Dunn's test
    dunn_result = sp.posthoc_dunn(samples, p_adjust='bonferroni')

    # Preparing data for plot: one entry per unordered pair (i < j)
    x_data = []
    y_data = []
    hover_text = []
    n_groups = len(group_labels)
    for i in range(n_groups):
        for j in range(i + 1, n_groups):
            p_value = dunn_result.iloc[i, j]
            x_data.append(f'{group_labels[i]} - {group_labels[j]}')
            y_data.append(p_value)
            hover_text.append(f'p-value: {p_value:.3f}')

    # Creating the plot
    fig = go.Figure(data=[go.Bar(x=x_data, y=y_data, text=hover_text, hoverinfo='text')])

    # Customizing the plot
    fig.update_layout(title=f"Dunn’s Test for '{feature}'",
                      xaxis_title='Pairwise Comparison',
                      yaxis_title='P-Value',
                      yaxis_type='log',
                      showlegend=False)
    return fig

# Example usage
# fig = plot_dunns_test(revenue_df, 'revenue', 'hard_assignment')
# fig.show()


### Revenue

In [None]:
# Box plots of log-transformed revenue: one box per community plus one for all communities pooled.
create_box_plots(revenue_df, 'revenue', log_transform=True)

NOTE: For visual purposes, a natural-log transform (`np.log`) was applied to the feature shown in this box plot.

Implications of Log-Transform in Boxplots:

- When a log-transform is applied to the data, the median values and the interquartile range depicted in the boxplot represent the median and spread of the logarithm of the original data.
- The differences between the medians of any two communities on the log scale translate to multiplicative differences on the original scale. Because the natural logarithm is used here, a difference of 1 on the log scale corresponds to a factor of $e \approx 2.72$ on the original scale (it would be a tenfold difference only if the log were base 10).
- The spread of the data (the interquartile range) shown in the boxplot reflects relative variation, not absolute variation. This means that a similar visual spread between two different communities can represent very different absolute spreads in terms of the original revenue values.
- Outliers in log-transformed data can indicate that the original data have a heavy-tailed distribution, with extreme values far from the median.

The remaining analysis is done in linear space.

#### Evaluating Significance

In [None]:
# Print the assumption checks (normality, homogeneity of variance) and the omnibus tests for revenue by community.
test_assumptions_and_anova(revenue_df, 'revenue')

Assumption evaluation for revenue
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

Normality results (Shapiro-Wilk):
Community 0: p-value = 0.0
Community 1: p-value = 0.0
Community 2: p-value = 0.0
Community 3: p-value = 0.0
Community 4: p-value = 5.490147727810003e-26

Homogeneity results (Brown-Forsythe): {'statistic': 45.00922111100122, 'pvalue': 2.120025847716452e-37}

Variance H-Test (Kruskal-Wallis): {'statistic': 311.1763154210369, 'pvalue': 4.2041729288328156e-66}

Welch’s ANOVA:
                          sum_sq      df         F        PR(>F)
C(hard_assignment)  2.500228e+18     4.0  49.33496  5.031865e-41
Residual            9.291925e+19  7334.0       NaN           NaN

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

NOTE: Be cautious about ANOVA results if assumptions (normality and homogeneity) are violated.

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


The results from the assumption tests for the ‘revenue’ feature across different communities indicate significant deviations from the assumptions required for a traditional ANOVA. The Shapiro-Wilk test shows p-values effectively at 0 for all communities, strongly suggesting that the revenue data are not normally distributed in any of the groups. The Brown-Forsythe test yields a very low p-value, indicating that the variances across different communities are not homogeneous. Given these violations of ANOVA assumptions, the Welch’s ANOVA results, despite showing a significant F-statistic, should be interpreted with caution. The Kruskal-Wallis test, a non-parametric alternative, shows a highly significant result, indicating that there are differences in revenue distributions across the communities. Based on these findings, it is recommended to rely on the Kruskal-Wallis test for analyzing differences in revenue across communities and to proceed with a non-parametric post hoc analysis, such as Dunn’s test, for detailed pairwise comparisons. Additionally, calculating effect sizes, like epsilon-squared, will be beneficial to understand the magnitude of these differences.

#### Effect Size and Post Hoc Analysis

In [None]:
# Pairwise Dunn's test (Bonferroni-adjusted) on revenue, grouped by the default 'hard_assignment' column.
plot_dunns_test(revenue_df, 'revenue')

**Observations:** The comparison between groups 0 and 1 has a Bonferroni-adjusted p-value of 1.000, which is far above the common alpha level of 0.05. This suggests that there is no statistically significant difference in median revenue between these two groups. All other pairwise comparisons show p-values that round to 0.000 (i.e., below 0.0005), which is strong evidence against the null hypothesis. This indicates statistically significant differences in median revenue between the groups being compared.

**Conclusions:** Based on this test, we can conclude that there's no evidence to suggest a difference in the median revenue between groups 0 and 1. For all other group comparisons, the results suggest significant differences in median revenue. The exact nature of these differences (e.g., which group has higher revenue) would require further investigation or descriptive statistics to clarify.

**Considerations:** 
- The more comparisons you make, the greater the chance of a Type I error (false positive). A Bonferroni correction has been applied, which is good practice to control for multiple comparisons.
- The statistical significance needs to be interpreted in the context of the domain. For instance, while there might be a significant difference between groups 2 and 3, it is important to understand what these groups represent and the practical implications of this difference.

### Number of Votes

In [None]:
# Box plots of log-transformed num_votes: one box per community plus one for all communities pooled.
create_box_plots(revenue_df, 'num_votes', log_transform=True)

#### Evaluating Assumptions

In [None]:
# Print the assumption checks (normality, homogeneity of variance) and the omnibus tests for num_votes by community.
test_assumptions_and_anova(revenue_df, 'num_votes')

Assumption evaluation for num_votes
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

Normality results (Shapiro-Wilk):
Community 0: p-value = 0.0
Community 1: p-value = 0.0
Community 2: p-value = 0.0
Community 3: p-value = 0.0
Community 4: p-value = 1.5075766142941333e-23

Homogeneity results (Brown-Forsythe): {'statistic': 23.489657438570287, 'pvalue': 2.5256484981790333e-19}

Variance H-Test (Kruskal-Wallis): {'statistic': 285.2470775363061, 'pvalue': 1.6466820320107903e-60}

Welch’s ANOVA:
                          sum_sq      df          F        PR(>F)
C(hard_assignment)  2.476549e+12     4.0  26.348447  1.001803e-21
Residual            1.723347e+14  7334.0        NaN           NaN

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

NOTE: Be cautious about ANOVA results if assumptions (normality and homogeneity) are violated.

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


The results from the assumption tests for the 'num_votes' feature across different communities suggest significant deviations from the normality and homogeneity of variances required for a traditional ANOVA. The Shapiro-Wilk test yields p-values effectively at zero for all communities but one, where it is still significantly low, indicating that the 'num_votes' data are not normally distributed within these groups. Furthermore, the Brown-Forsythe test returns a very low p-value, pointing to non-homogeneous variances across the communities. These violations of ANOVA assumptions necessitate caution in interpreting the Welch's ANOVA results, which, while indicating a significant F-statistic, may not be reliable under these circumstances. On the other hand, the Kruskal-Wallis test, a robust non-parametric alternative, provides a highly significant result. This indicates that there are indeed differences in the distribution of 'num_votes' across the communities. Therefore, it is advisable to depend on the findings of the Kruskal-Wallis test when evaluating disparities in 'num_votes' across communities. For subsequent detailed pairwise comparisons, a non-parametric post hoc analysis, such as Dunn's test, should be employed.

#### Post Hoc Analysis

In [None]:
# Pairwise Dunn's test (Bonferroni-adjusted) on num_votes, grouped by the default 'hard_assignment' column.
plot_dunns_test(revenue_df, 'num_votes')

**Observations:** The Dunn's test for num_votes reveals a high p-value (0.299) for the comparison between groups 0 and 1. This p-value is well above the commonly accepted alpha level of 0.05, indicating that there is no statistically significant difference in the number of votes between these two groups. In contrast, the p-values for comparisons involving groups 0-2, 0-3, 0-4, 1-3, 1-4, 2-3, 2-4, and 3-4 are effectively 0.000. This implies a very strong rejection of the null hypothesis, suggesting significant differences in the number of votes between these groups. A notable exception is the comparison between groups 1 and 2, which has a p-value of 0.011, still below the alpha level of 0.05, indicating a significant difference, albeit less pronounced than in the other comparisons.

**Conclusions:** The test provides no evidence of a difference in the median number of votes between groups 0 and 1; that is, we fail to reject the null hypothesis for this pair, which is not the same as showing that the two groups are equal. For the rest of the pairwise comparisons, with the exception of 1-2, the tests suggest robust significant differences in the number of votes. These findings suggest that certain groups have a markedly different number of votes when compared to others. The comparison between groups 1 and 2, while statistically significant, shows a higher p-value than the others, which could imply a smaller magnitude of difference.

**Considerations:** Further statistical examination, such as the calculation of effect sizes, could offer more detail on the magnitude of the observed differences, adding depth to the purely statistical significance reported by the p-values.

### Average Rating

In [None]:
# Box plots of avg_rating on the raw (untransformed) scale: one box per community plus one for all communities pooled.
create_box_plots(revenue_df, 'avg_rating')

#### Evaluating Assumptions

In [None]:
# Print the assumption checks (normality, homogeneity of variance) and the omnibus tests for avg_rating by community.
test_assumptions_and_anova(revenue_df, 'avg_rating')

Assumption evaluation for avg_rating
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

Normality results (Shapiro-Wilk):
Community 0: p-value = 2.026485548886654e-20
Community 1: p-value = 8.73140122207747e-24
Community 2: p-value = 3.5397899889044737e-12
Community 3: p-value = 4.0590752981017886e-10
Community 4: p-value = 4.7671769824860974e-14

Homogeneity results (Brown-Forsythe): {'statistic': 17.14014532424138, 'pvalue': 5.309114638109637e-14}

Variance H-Test (Kruskal-Wallis): {'statistic': 379.6413520039993, 'pvalue': 6.959103429524262e-81}

Welch’s ANOVA:
                         sum_sq      df          F        PR(>F)
C(hard_assignment)   365.589074     4.0  84.963249  1.185607e-70
Residual            7889.382437  7334.0        NaN           NaN

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

NOTE: Be cautious about ANOVA results if assumptions (normality and homogeneity) are violated.

- - - - - - - - - - - - - - - - - - - - -

#### Post Hoc Analysis

In [None]:
# Pairwise Dunn's test (Bonferroni-adjusted) on avg_rating, grouped by the default 'hard_assignment' column.
plot_dunns_test(revenue_df, 'avg_rating')

### Financial Analysis
Test 1: movies with higher budgets generate higher revenue

Test 2: there is a significant difference in average budget between movies with high rating and those with a low rating

Test 3: movies with higher revenue are more likely to achieve higher ratings