In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats
from scipy.stats import mannwhitneyu

In [3]:
# Load the ratings and keep only US-to-US rows: ratings where both the reviewer
# and the beer have a location of the form 'United States, <State>'.
# TODO (original author's note): this loading step still needs changing.
BA_merged = pd.read_csv('../../minimizedData/BA_ratings_small_plus.csv')

# Rows missing either location or the rating cannot be used in the comparison.
US_ratings = BA_merged.dropna(subset=['beer_location', 'user_location', 'rating'])

is_us_user = US_ratings['user_location'].str.startswith('United States,')
is_us_beer = US_ratings['beer_location'].str.startswith('United States,')
# Explicit copy: new columns are added below, and assigning onto a filtered
# view of BA_merged would otherwise risk a SettingWithCopyWarning.
US_ratings = US_ratings[is_us_user & is_us_beer].copy()

# The state is the second comma-separated field of the location string.
US_ratings['user_state'] = US_ratings['user_location'].str.split(',').str[1].str.strip()
US_ratings['beer_state'] = US_ratings['beer_location'].str.split(',').str[1].str.strip()
US_ratings = US_ratings.drop(columns=['user_location', 'beer_location'])

# Discard rows whose parsed state is literally 'United States'
# (presumably malformed locations - TODO confirm).
US_ratings = US_ratings[
    (US_ratings['user_state'] != 'United States')
    & (US_ratings['beer_state'] != 'United States')
]

US_ratings.head(3)

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,user_id,appearance,aroma,palate,taste,overall,rating,avg,user_state,beer_state
1008809,Kupfer Kolsch,289320.0,Copper State Brewing Company,49595.0,Kölsch,n2185.211743,2.5,4.0,4.0,3.75,3.75,3.76,3.76,North Carolina,Wisconsin
1008810,Northwestern Alt,289321.0,Copper State Brewing Company,49595.0,Altbier,n2185.211743,3.0,3.75,4.0,3.5,3.5,3.58,3.58,North Carolina,Wisconsin
1008811,One Cent Wheat,289319.0,Copper State Brewing Company,49595.0,Witbier,n2185.211743,3.75,3.25,3.75,3.5,3.5,3.48,3.48,North Carolina,Wisconsin


In [4]:
# CSV file listing every state together with its bordering states.
# The 'neighbours' column is a ';'-separated string in the file; it is turned
# into a Python list per row, with an empty list for states that have no
# neighbours (e.g. Alaska, Hawaii).
def _split_neighbours(cell):
    """Return the ';'-separated names in ``cell`` as a list ([] if empty)."""
    if cell:
        return cell.split(";")
    return []


neighbours_df = pd.read_csv(
    '../../additionalData/bordering_states.csv',
    dtype={'state': 'string', 'neighbours': 'string'},
)
neighbours_column = neighbours_df["neighbours"].fillna("")
neighbours_df["neighbours"] = neighbours_column.apply(_split_neighbours)

neighbours_df.head(3)

Unnamed: 0,state,neighbours,nb_neighbours
0,Alabama,"[Florida, Georgia, Mississippi, Tennessee]",4
1,Alaska,[],0
2,Arizona,"[California, Colorado, Nevada, New Mexico, Utah]",5


In [5]:
def gather_region_ratings(state, US_ratings, neighbours_df):
    """Split the ratings of beers from a state's region by where the rater lives.

    The region of ``state`` is the state itself plus the neighbours listed for
    it in ``neighbours_df``.

    Parameters
    ----------
    state : str
        State whose region is analysed.
    US_ratings : pandas.DataFrame
        Ratings with 'beer_state', 'user_state' and 'rating' columns.
    neighbours_df : pandas.DataFrame
        Table with a 'state' column and a 'neighbours' column holding a list
        of state names per row.

    Returns
    -------
    tuple of (list, list)
        ``(in_region_ratings, not_in_region_ratings)``: rating values for
        region beers given by users inside / outside the region.

    Raises
    ------
    IndexError
        If ``state`` has no row in ``neighbours_df``.
    """
    # Get the neighbours for the given state
    neighbours = neighbours_df.loc[neighbours_df['state'] == state, 'neighbours'].values[0]

    region_states = [state] + neighbours
    region_ratings = US_ratings[US_ratings['beer_state'].isin(region_states)]

    # Build the rater-membership mask once and pull only the 'rating' column,
    # instead of filtering the full frame twice and then selecting the column.
    rater_in_region = region_ratings['user_state'].isin(region_states)
    in_region_ratings = region_ratings.loc[rater_in_region, 'rating'].tolist()
    not_in_region_ratings = region_ratings.loc[~rater_in_region, 'rating'].tolist()

    return in_region_ratings, not_in_region_ratings

# Every state that appears as a beer origin defines one region.
states = US_ratings['beer_state'].unique()

# Column-wise accumulator for the long-format table (one entry per rating).
# NOTE: a later cell redefines gather_region_ratings and rebuilds ratings_df
# with an additional 'user_state' column, overwriting the result built here.
all_ratings = {'region': [], 'rating': [], 'rating_type': []}

for state in states:
    in_region, non_region = gather_region_ratings(state, US_ratings, neighbours_df)
    region_name = f"region_{state}"

    # In-Region ratings are appended first, then Non-Region ones.
    for rating_type, values in (('In-Region', in_region), ('Non-Region', non_region)):
        n_values = len(values)
        all_ratings['region'] += [region_name] * n_values
        all_ratings['rating'] += values
        all_ratings['rating_type'] += [rating_type] * n_values

# Long-format DataFrame suitable for plotting
ratings_df = pd.DataFrame(all_ratings)

In [6]:
def gather_region_ratings(state, US_ratings, neighbours_df):
    """Split the rating rows of beers from a state's region by rater location.

    The region of ``state`` is the state itself plus the neighbours listed for
    it in ``neighbours_df``.

    Parameters
    ----------
    state : str
        State whose region is analysed.
    US_ratings : pandas.DataFrame
        Ratings with at least 'beer_state' and 'user_state' columns.
    neighbours_df : pandas.DataFrame
        Table with a 'state' column and a 'neighbours' column holding a list
        of state names per row.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(in_region_ratings, not_in_region_ratings)``: rows of ``US_ratings``
        for region beers rated by users inside / outside the region.

    Raises
    ------
    IndexError
        If ``state`` has no row in ``neighbours_df``.
    """
    neighbours = neighbours_df.loc[neighbours_df['state'] == state, 'neighbours'].values[0]

    region_states = [state] + neighbours
    region_ratings = US_ratings[US_ratings['beer_state'].isin(region_states)]

    # Compute the rater-membership mask once and reuse it for both halves.
    rater_in_region = region_ratings['user_state'].isin(region_states)
    in_region_ratings = region_ratings[rater_in_region]
    not_in_region_ratings = region_ratings[~rater_in_region]

    return in_region_ratings, not_in_region_ratings

# Every state that appears as a beer origin defines one region.
states = US_ratings['beer_state'].unique()

# Column-wise accumulator: one entry per (region, rating) pair, keeping the
# rater's state alongside the In-Region / Non-Region label.
all_ratings = {'region': [], 'rating': [], 'rating_type': [], 'user_state': []}

for state in states:
    in_region, non_region = gather_region_ratings(state, US_ratings, neighbours_df)
    region_name = f"region_{state}"

    # In-Region rows are appended first, then Non-Region rows.
    for rating_type, subset in (('In-Region', in_region), ('Non-Region', non_region)):
        n_rows = len(subset)
        all_ratings['region'] += [region_name] * n_rows
        all_ratings['rating'] += subset['rating'].tolist()
        all_ratings['rating_type'] += [rating_type] * n_rows
        all_ratings['user_state'] += subset['user_state'].tolist()

ratings_df = pd.DataFrame(all_ratings)

ratings_df.head()


Unnamed: 0,region,rating,rating_type,user_state
0,region_Wisconsin,4.04,In-Region,Wisconsin
1,region_Wisconsin,4.0,In-Region,Wisconsin
2,region_Wisconsin,3.75,In-Region,Wisconsin
3,region_Wisconsin,3.9,In-Region,Illinois
4,region_Wisconsin,3.25,In-Region,Wisconsin


In [7]:
# Sanity check: for each state, print the first rows of the ratings whose beer
# comes from that state's region (the state plus its listed neighbours).
for state in states:
    is_state_row = neighbours_df['state'] == state
    neighbours = neighbours_df.loc[is_state_row, 'neighbours'].values[0]
    region_states = [state] + neighbours

    beer_in_region = US_ratings['beer_state'].isin(region_states)
    region_ratings = US_ratings[beer_in_region]

    print(state)
    print(region_ratings.head())

Wisconsin
                      beer_name   beer_id                  brewery_name  \
1008809           Kupfer Kolsch  289320.0  Copper State Brewing Company   
1008810        Northwestern Alt  289321.0  Copper State Brewing Company   
1008811          One Cent Wheat  289319.0  Copper State Brewing Company   
1008812  Platinum Coffee Blonde  289324.0  Copper State Brewing Company   
1008813  Platinum Coffee Blonde  289324.0  Copper State Brewing Company   

         brewery_id                style            user_id  appearance  \
1008809     49595.0               Kölsch       n2185.211743        2.50   
1008810     49595.0              Altbier       n2185.211743        3.00   
1008811     49595.0              Witbier       n2185.211743        3.75   
1008812     49595.0  American Blonde Ale       n2185.211743        3.25   
1008813     49595.0  American Blonde Ale  angerhaus.1049226        4.00   

         aroma  palate  taste  overall  rating   avg      user_state  \
1008809   4.00  

In [8]:
# Distinct labels present in the 'rating_type' column of ratings_df.
rating_type_column = ratings_df.loc[:, 'rating_type']
rat = rating_type_column.unique()
print(rat)

['In-Region' 'Non-Region']


In [9]:
# Show the first 200 rows of ratings_df.
ratings_df.iloc[:200]

Unnamed: 0,region,rating,rating_type,user_state
0,region_Wisconsin,4.04,In-Region,Wisconsin
1,region_Wisconsin,4.00,In-Region,Wisconsin
2,region_Wisconsin,3.75,In-Region,Wisconsin
3,region_Wisconsin,3.90,In-Region,Illinois
4,region_Wisconsin,3.25,In-Region,Wisconsin
...,...,...,...,...
195,region_Wisconsin,3.50,In-Region,Minnesota
196,region_Wisconsin,4.25,In-Region,Wisconsin
197,region_Wisconsin,3.75,In-Region,Minnesota
198,region_Wisconsin,4.15,In-Region,Minnesota


In [10]:
# Print a summary of ratings_df: index range, column dtypes and memory usage.
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34586648 entries, 0 to 34586647
Data columns (total 4 columns):
 #   Column       Dtype  
---  ------       -----  
 0   region       object 
 1   rating       float64
 2   rating_type  object 
 3   user_state   object 
dtypes: float64(1), object(3)
memory usage: 1.0+ GB


In [11]:
# Keep only the rows labelled 'In-Region'.
# Note: this rebinds ratings_df, so the 'Non-Region' rows are no longer
# available under this name until the table is rebuilt.
is_in_region = ratings_df['rating_type'].eq('In-Region')
ratings_df = ratings_df[is_in_region]

ratings_df.head()

Unnamed: 0,region,rating,rating_type,user_state
0,region_Wisconsin,4.04,In-Region,Wisconsin
1,region_Wisconsin,4.0,In-Region,Wisconsin
2,region_Wisconsin,3.75,In-Region,Wisconsin
3,region_Wisconsin,3.9,In-Region,Illinois
4,region_Wisconsin,3.25,In-Region,Wisconsin


In [None]:

# Bar chart of Cohen's d per region, ordered from lowest to highest value,
# with dotted guide lines at the +/-0.2, +/-0.5 and +/-0.8 thresholds.
# NOTE(review): cohen_df is not defined in any of the cells shown before this
# one - confirm that the cell building it runs first.
cohen_df = cohen_df.sort_values(by='Cohen_d')

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=cohen_df,
    x='region',  # region names label the x-axis
    y='Cohen_d',
    hue='region',
    palette='viridis',
    ax=ax,
)

# Zero baseline, then one pair of guide lines per threshold. Only the positive
# line of each pair carries a label, so each threshold appears once in the legend.
ax.axhline(y=0, color='black', linewidth=1)
effect_thresholds = [
    (0.2, '#FFA07A', 'Small effect (d=0.2)'),
    (0.5, '#FF8C00', 'Medium effect (d=0.5)'),
    (0.8, '#CD3700', 'Large effect (d=0.8)'),
]
for magnitude, colour, legend_label in effect_thresholds:
    ax.axhline(y=magnitude, color=colour, linestyle=':', linewidth=2, label=legend_label)
    ax.axhline(y=-magnitude, color=colour, linestyle=':', linewidth=2)

ax.set_xlabel('State')
ax.set_ylabel('Cohen’s D')
ax.set_title('Cohen’s D for In-State vs Out-of-State Ratings by State (Sorted by Cohen’s D)')
plt.xticks(rotation=90)
ax.legend(title='Effect Size Thresholds', loc='upper right')
fig.tight_layout()
plt.show()
