In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats
from scipy.stats import mannwhitneyu

In [3]:
# Load the ratings and keep only those where both the reviewer and the beer
# are located in a US state, exposing the state names as `user_state` and
# `beer_state`.
# TODO (original author's note): this loading step still "needs changing".
BA_merged = pd.read_csv('../../minimizedData/BA_ratings_small_plus.csv')

# Built as one non-mutating chain: every step returns a new frame, so no column
# is ever assigned onto a filtered slice of BA_merged (the pattern that triggers
# pandas' SettingWithCopyWarning).
US_ratings = (
    BA_merged
    .dropna(subset=['beer_location', 'user_location', 'rating'])
    .loc[lambda df: df['user_location'].str.startswith('United States,')
         & df['beer_location'].str.startswith('United States,')]
    # Locations look like "United States, <State>": the state is the second
    # comma-separated field. The prefix filter above guarantees it exists.
    .assign(
        user_state=lambda df: df['user_location'].str.split(',').str[1].str.strip(),
        beer_state=lambda df: df['beer_location'].str.split(',').str[1].str.strip(),
    )
    .drop(columns=['user_location', 'beer_location'])
    # Discard rows whose extracted "state" is just the country name again.
    .loc[lambda df: (df['user_state'] != 'United States')
         & (df['beer_state'] != 'United States')]
)

US_ratings.head(3)

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,user_id,appearance,aroma,palate,taste,overall,rating,avg,user_state,beer_state
1008809,Kupfer Kolsch,289320.0,Copper State Brewing Company,49595.0,Kölsch,n2185.211743,2.5,4.0,4.0,3.75,3.75,3.76,3.76,North Carolina,Wisconsin
1008810,Northwestern Alt,289321.0,Copper State Brewing Company,49595.0,Altbier,n2185.211743,3.0,3.75,4.0,3.5,3.5,3.58,3.58,North Carolina,Wisconsin
1008811,One Cent Wheat,289319.0,Copper State Brewing Company,49595.0,Witbier,n2185.211743,3.75,3.25,3.75,3.5,3.5,3.48,3.48,North Carolina,Wisconsin


In [4]:
# CSV listing every state together with its neighbouring states.
# `neighbours` is stored as a ';'-separated string and is turned into a Python
# list here; states without neighbours (e.g. Alaska, Hawaii) get an empty list.
neighbours_df = pd.read_csv(
    '../../additionalData/bordering_states.csv',
    dtype={'state': 'string', 'neighbours': 'string'},
)
neighbours_df["neighbours"] = (
    neighbours_df["neighbours"]
    .fillna("")
    .map(lambda joined: [] if not joined else joined.split(";"))
)

neighbours_df.head(3)

Unnamed: 0,state,neighbours,nb_neighbours
0,Alabama,"[Florida, Georgia, Mississippi, Tennessee]",4
1,Alaska,[],0
2,Arizona,"[California, Colorado, Nevada, New Mexico, Utah]",5


In [26]:
def gather_region_ratings(state, US_ratings, neighbours_df):
    """Split the ratings of beers from a state's region by reviewer origin.

    The region of ``state`` is the state itself plus the states listed for it
    in ``neighbours_df``. Only ratings of beers whose ``beer_state`` lies in
    that region are considered.

    Parameters
    ----------
    state : str
        State name; looked up in ``neighbours_df['state']``.
    US_ratings : pandas.DataFrame
        Ratings with at least the columns ``beer_state``, ``user_state`` and
        ``rating``.
    neighbours_df : pandas.DataFrame
        Table with a ``state`` column and a ``neighbours`` column holding a
        list of state names per row.

    Returns
    -------
    tuple of (list, list)
        ``(in_region_ratings, not_in_region_ratings)``: the ``rating`` values
        given by users whose ``user_state`` is inside the region, and those
        given by users from outside it.

    Raises
    ------
    IndexError
        If ``state`` has no row in ``neighbours_df``.
    """
    neighbour_rows = neighbours_df.loc[neighbours_df['state'] == state, 'neighbours']
    if neighbour_rows.empty:
        # Same exception type the bare `.values[0]` lookup would raise, but
        # with a message that names the offending state.
        raise IndexError(f"state {state!r} not found in neighbours_df['state']")
    neighbours = neighbour_rows.values[0]

    region_states = [state] + neighbours
    region_ratings = US_ratings[US_ratings['beer_state'].isin(region_states)]

    # Compute the reviewer-membership mask once and reuse it for both halves.
    rater_in_region = region_ratings['user_state'].isin(region_states)
    in_region_ratings = region_ratings.loc[rater_in_region, 'rating'].tolist()
    not_in_region_ratings = region_ratings.loc[~rater_in_region, 'rating'].tolist()

    return in_region_ratings, not_in_region_ratings

# Every state that appears as a beer origin defines one region
states = US_ratings['beer_state'].unique()

# Long-format accumulator: one entry per (region, rating, rating_type)
all_ratings = {'region': [], 'rating': [], 'rating_type': []}

for state in states:
    in_region, non_region = gather_region_ratings(state, US_ratings, neighbours_df)
    region_name = f"region_{state}"

    # Append the In-Region ratings first, then the Non-Region ones
    for group_label, group_ratings in (('In-Region', in_region), ('Non-Region', non_region)):
        group_size = len(group_ratings)
        all_ratings['region'] += [region_name] * group_size
        all_ratings['rating'] += group_ratings
        all_ratings['rating_type'] += [group_label] * group_size

# Convert the accumulated lists to a DataFrame suitable for plotting
ratings_df = pd.DataFrame(all_ratings)

In [27]:
# Show the distinct rating_type labels present in ratings_df
rat = ratings_df.loc[:, 'rating_type'].unique()
print(rat)

['In-Region' 'Non-Region']


In [28]:
# Preview the first 200 rows of the long-format ratings table
ratings_df.iloc[:200]

Unnamed: 0,region,rating,rating_type
0,region_Wisconsin,4.04,In-Region
1,region_Wisconsin,4.00,In-Region
2,region_Wisconsin,3.75,In-Region
3,region_Wisconsin,3.90,In-Region
4,region_Wisconsin,3.25,In-Region
...,...,...,...
195,region_Wisconsin,3.50,In-Region
196,region_Wisconsin,4.25,In-Region
197,region_Wisconsin,3.75,In-Region
198,region_Wisconsin,4.15,In-Region
