In [1]:
import os
import subprocess
from pathlib import Path

# Ask git for the top-level directory of the working tree (the folder that
# holds .git) and make it the current working directory, so that relative
# paths resolve from the repository root wherever the notebook was started.
project_root = Path(
    subprocess.check_output(
        ['git', 'rev-parse', '--show-toplevel'],
        text=True,
    ).strip()
)
os.chdir(project_root)

In [4]:
import pandas as pd
from src.questions.question2.utils.cohenD import cohen_d
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.questions.question1.utils.prepData import prep_data
from src.questions.question1.utils.basicPlots import plot_cohenD
from src.questions.question1.utils.buildGroups import build_groups

In [None]:
# Load the US ratings table (path is relative to the project root).
US_ratings = pd.read_csv('data/USData/BA_US_states_all.csv')

# CSV listing every state together with its neighbouring states.
# States that have no neighbours (e.g. Alaska, Hawaii) carry an empty list.
# Both columns are read with the pandas 'string' dtype.
neighbours_df = pd.read_csv(
    'data/additionalData/bordering_states.csv',
    dtype={'state': 'string', 'neighbours': 'string'},
)

# Prepare the ratings, run plot_cohenD with plotting disabled, and turn the
# groups returned by build_groups into a DataFrame for the cells below.
ratings_df = prep_data(US_ratings, neighbours_df)
final_cohen_df = plot_cohenD(ratings_df, plot=False)
state_groups = build_groups(final_cohen_df)
state_groups_df = pd.DataFrame(state_groups)



      region  rating user_state rating_type
0  Wisconsin    4.04  Wisconsin    In-State
1  Wisconsin    4.00  Wisconsin    In-State
State Groups:
--------------------------------------------------
States: West Virginia, Ohio, Delaware, New York
--------------------------------------------------
States: Pennsylvania, Maryland, Virginia, Kentucky, New Jersey
--------------------------------------------------
States: South Dakota, Wyoming
--------------------------------------------------
States: Utah, Montana, Washington
--------------------------------------------------
States: Missouri, Iowa, Minnesota, New Mexico, Nevada, Idaho
--------------------------------------------------
States: Kansas, Nebraska
--------------------------------------------------
States: Texas, Oklahoma, Louisiana, Arizona, Oregon, Colorado, California
--------------------------------------------------
States: Illinois, Indiana
--------------------------------------------------
States: Wisconsin, Michigan, North

  final_cohen_df = pd.concat(all_cohen_results, ignore_index=True)


In [8]:
def regions_cohenD_Q1(US_ratings, state_groups_df, plot=True):
    """Compute Cohen's d of in-region vs. out-of-region ratings for each state group.

    For every row of ``state_groups_df`` the ratings in ``US_ratings`` are split
    by whether ``user_state`` belongs to the row's ``'States'`` collection, and
    the two samples are handed to ``cohen_d``.

    Parameters
    ----------
    US_ratings : pandas.DataFrame
        Must provide the columns ``'user_state'`` and ``'rating'``.
    state_groups_df : pandas.DataFrame
        Must provide a ``'States'`` column whose cells are collections of state
        names. The frame's index labels identify the groups in the result.
    plot : bool, default True
        When true, draw a seaborn bar chart of the effect sizes.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Group name'`` (index label taken from ``state_groups_df``)
        and ``'Cohen_d'``, sorted by ``'Cohen_d'`` in descending order. Groups
        with fewer than two ratings on either side hold NaN and sort last.
    """
    cohen_results_by_region = {}

    for group_id, states in state_groups_df['States'].items():
        # Build the membership mask once and use it for both sides of the split.
        in_region = US_ratings['user_state'].isin(states)
        in_region_ratings = US_ratings.loc[in_region, 'rating']
        out_of_region_ratings = US_ratings.loc[~in_region, 'rating']

        # With fewer than two observations on either side cohen_d is not
        # called and the group is recorded as NaN instead.
        if len(in_region_ratings) < 2 or len(out_of_region_ratings) < 2:
            print("Warning nan")
            cohen_results_by_region[group_id] = np.nan
            continue

        cohen_results_by_region[group_id] = cohen_d(in_region_ratings, out_of_region_ratings)

    # One row per group, largest effect size first.
    cohen_by_region_df = pd.DataFrame.from_dict(cohen_results_by_region, orient='index', columns=['Cohen_d'])
    cohen_by_region_df.index.name = 'Group name'
    cohen_by_region_df = cohen_by_region_df.reset_index()
    cohen_by_region_df = cohen_by_region_df.sort_values(by='Cohen_d', ascending=False)

    if plot:
        plt.figure(figsize=(14, 8))
        sns.barplot(
            data=cohen_by_region_df,
            x='Group name',
            y='Cohen_d',
            # Without an explicit order seaborn sorts numeric category labels
            # by value, which would discard the Cohen_d ordering of the bars.
            order=cohen_by_region_df['Group name'].tolist(),
            palette='viridis',
        )
        plt.title("Cohen's D for in-region ratings compared to out-of-region ratings for the region's users")
        plt.xlabel("Center State of each Region")
        plt.ylabel("Cohen's D value")
        plt.xticks(rotation=90)

        plt.axhline(y=0, color='black', linewidth=1)

        # Symmetric guides at the small and medium effect sizes; only the
        # positive line of each pair is labelled so the legend has one entry per size.
        plt.axhline(y=0.2, color='#FFA07A', linestyle=':', linewidth=2, label='Small effect (d=0.2)')
        plt.axhline(y=-0.2, color='#FFA07A', linestyle=':', linewidth=2)
        plt.axhline(y=0.5, color='#FF8C00', linestyle=':', linewidth=2, label='Medium effect (d=0.5)')
        plt.axhline(y=-0.5, color='#FF8C00', linestyle=':', linewidth=2)

        # The labels above are only rendered once a legend is requested.
        plt.legend()

        plt.tight_layout()
        plt.show()

    return cohen_by_region_df



In [89]:
import plotly.express as px
import pandas as pd

def regions_cohenD_Q1_plotly(US_ratings, state_groups_df, plot=True):
    """Interactive (Plotly) variant of the per-group Cohen's d computation.

    Each row of ``state_groups_df`` supplies a collection of state names in its
    ``'States'`` column. Ratings from ``US_ratings`` whose ``'user_state'`` is
    in that collection are compared, via ``cohen_d``, with all other ratings.

    Parameters
    ----------
    US_ratings : pandas.DataFrame
        Needs the columns ``'user_state'`` and ``'rating'``.
    state_groups_df : pandas.DataFrame
        Needs a ``'States'`` column; its index labels become ``'Group name'``.
    plot : bool, default True
        Show a Plotly bar chart of the groups ranked by effect size.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Group name'``, ``'Cohen_d'`` and ``'Group States'`` (the
        comma-joined member states), in the row order of ``state_groups_df``.
        The ascending-sorted copy is only printed and plotted, not returned.
    """
    effect_sizes = {}
    hover_labels = {}

    for group_id, members in state_groups_df['States'].items():
        # Text shown on hover; recorded even when the group ends up as NaN.
        hover_labels[group_id] = ", ".join(members)

        belongs = US_ratings['user_state'].isin(members)
        inside = US_ratings.loc[belongs, 'rating']
        outside = US_ratings.loc[~belongs, 'rating']

        if len(inside) >= 2 and len(outside) >= 2:
            effect_sizes[group_id] = cohen_d(inside, outside)
        else:
            # Too few observations on one side: flag it and store NaN.
            print("Warning: Insufficient data, setting Cohen's d to NaN for group", group_id)
            effect_sizes[group_id] = np.nan

    # One row per group, keyed by the original index label.
    cohen_by_region_df = pd.DataFrame.from_dict(effect_sizes, orient='index', columns=['Cohen_d'])
    cohen_by_region_df.index.name = 'Group name'
    cohen_by_region_df = cohen_by_region_df.reset_index()
    cohen_by_region_df['Group States'] = cohen_by_region_df['Group name'].map(hover_labels)

    # Ranked copy (smallest d first) with a fresh 0..n-1 index used as the x position.
    ranked = cohen_by_region_df.sort_values(by='Cohen_d', ascending=True).reset_index(drop=True)
    print(ranked.head())

    if plot:
        fig = px.bar(
            ranked,
            x=ranked.index,
            y="Cohen_d",
            custom_data=["Group States"],
            title="Cohen's D for in-region ratings compared to out-of-region ratings",
        )

        # Zero baseline, then a dotted pair per effect size; only the
        # positive line of each pair carries the annotation.
        fig.add_hline(y=0, line_dash="solid", line_color="black")
        for level, colour, caption in (
            (0.2, "#FFA07A", "Small effect (d=0.2)"),
            (0.5, "#FF8C00", "Medium effect (d=0.5)"),
        ):
            fig.add_hline(
                y=level,
                line_dash="dot",
                line_color=colour,
                annotation_text=caption,
                annotation_position="top left",
            )
            fig.add_hline(y=-level, line_dash="dot", line_color=colour)

        # Hover shows the member states; bars are coloured by their d value
        # on a Viridis scale clipped to [-0.2, 0.2].
        fig.update_traces(
            hovertemplate="<b>States:</b> %{customdata[0]}<extra></extra>",
            marker=dict(
                color=ranked['Cohen_d'],
                colorscale='Viridis',
                cmin=-0.2,
                cmax=0.2,
                colorbar=dict(
                    title="Cohen's D",
                    tickvals=[-0.2, 0, 0.2],
                    ticktext=["-0.2", "0", "0.2"],
                ),
            ),
        )

        # Fixed canvas and y-range; x tick labels are hidden because the
        # x values are just rank positions.
        fig.update_layout(
            xaxis_title="Region Group",
            yaxis_title="Cohen's D Value",
            title_font=dict(size=20),
            width=900,
            height=600,
            xaxis=dict(showticklabels=False),
            yaxis=dict(range=[-0.6, 0.6]),
        )

        fig.show()

    return cohen_by_region_df


In [90]:
# Render the interactive chart; the table returned by the call is not kept.
regions_cohenD_Q1_plotly(
    US_ratings=US_ratings,
    state_groups_df=state_groups_df,
    plot=True,
)

# Last expression of the cell: display the group definitions themselves.
state_groups_df

   Group name   Cohen_d                                       Group States
0           2 -0.195341                              South Dakota, Wyoming
1           9 -0.108604  Alabama, Arkansas, South Carolina, Tennessee, ...
2           5 -0.095658                                   Kansas, Nebraska
3           3 -0.070777                          Utah, Montana, Washington
4           1 -0.063232  Pennsylvania, Maryland, Virginia, Kentucky, Ne...


Unnamed: 0,States
0,"[West Virginia, Ohio, Delaware, New York]"
1,"[Pennsylvania, Maryland, Virginia, Kentucky, N..."
2,"[South Dakota, Wyoming]"
3,"[Utah, Montana, Washington]"
4,"[Missouri, Iowa, Minnesota, New Mexico, Nevada..."
5,"[Kansas, Nebraska]"
6,"[Texas, Oklahoma, Louisiana, Arizona, Oregon, ..."
7,"[Illinois, Indiana]"
8,"[Wisconsin, Michigan, North Dakota]"
9,"[Alabama, Arkansas, South Carolina, Tennessee,..."
