In [1]:
import pandas as pd

In [2]:
# Load the review table from a CSV path given relative to the notebook's
# working directory, then preview its first two rows.
BA_US_knn_text = pd.read_csv('knnData/BA_US_knn_text.csv')
BA_US_knn_text.head(2)

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,abv,user_id,appearance,aroma,palate,taste,overall,rating,text,avg,user_state,beer_state
0,Kupfer Kolsch,289320.0,Copper State Brewing Company,49595.0,Kölsch,4.4,n2185.211743,2.5,4.0,4.0,3.75,3.75,3.76,,3.76,North Carolina,Wisconsin
1,Northwestern Alt,289321.0,Copper State Brewing Company,49595.0,Altbier,4.6,n2185.211743,3.0,3.75,4.0,3.5,3.5,3.58,,3.58,North Carolina,Wisconsin


In [84]:
# Keep only the columns used further down: the style label, the score
# columns, `avg` and `user_state`.
df_reduced = BA_US_knn_text.loc[:, [
    'style',
    'appearance',
    'aroma',
    'palate',
    'taste',
    'overall',
    'avg',
    'user_state',
]]
df_reduced.head(2)

Unnamed: 0,style,appearance,aroma,palate,taste,overall,avg,user_state
0,Kölsch,2.5,4.0,4.0,3.75,3.75,3.76,North Carolina
1,Altbier,3.0,3.75,4.0,3.5,3.5,3.58,North Carolina


In [85]:
# Distinct `user_state` values as a plain Python list, in ascending order.
states = sorted(df_reduced['user_state'].unique().tolist())
states

['Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'Florida',
 'Georgia',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming']

In [86]:
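# Disabled debug override: uncommenting it replaces `states` with a two-entry list.
# Note: 'Ontario' is not among the user_state values listed above.
#states = ['Hawaii', 'Ontario']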

In [87]:
# Distinct `style` values as a plain Python list, in ascending order.
styles = sorted(df_reduced['style'].unique().tolist())
styles   # 103 styles

['Altbier',
 'American Adjunct Lager',
 'American Amber / Red Ale',
 'American Amber / Red Lager',
 'American Barleywine',
 'American Black Ale',
 'American Blonde Ale',
 'American Brown Ale',
 'American Dark Wheat Ale',
 'American Double / Imperial IPA',
 'American Double / Imperial Pilsner',
 'American Double / Imperial Stout',
 'American IPA',
 'American Malt Liquor',
 'American Pale Ale (APA)',
 'American Pale Lager',
 'American Pale Wheat Ale',
 'American Porter',
 'American Stout',
 'American Strong Ale',
 'American Wild Ale',
 'Baltic Porter',
 'Belgian Dark Ale',
 'Belgian IPA',
 'Belgian Pale Ale',
 'Belgian Strong Dark Ale',
 'Belgian Strong Pale Ale',
 'Berliner Weissbier',
 'Bière de Champagne / Bière Brut',
 'Bière de Garde',
 'Black & Tan',
 'Bock',
 'Braggot',
 'California Common / Steam Beer',
 'Chile Beer',
 'Cream Ale',
 'Czech Pilsener',
 'Doppelbock',
 'Dortmunder / Export Lager',
 'Dubbel',
 'Dunkelweizen',
 'Eisbock',
 'English Barleywine',
 'English Bitter',
 'En

In [88]:
def calc_features(style, df):
    """Summarise one subset of reviews as a single-row feature frame.

    Parameters
    ----------
    style : str
        Label used as the column prefix; every output column is named
        ``style + '_' + <feature name>``.
    df : pandas.DataFrame
        Rows to summarise. Must contain the columns ``appearance``,
        ``aroma``, ``palate``, ``taste``, ``overall`` and ``avg``.
        The frame is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        One row (index label ``0``) with eight float64 columns, in this
        order: the means of ``appearance``, ``aroma``, ``palate``, ``taste``
        and ``overall``; the mean and the standard deviation of ``avg``;
        and the number of rows in ``df``.

    Notes
    -----
    * For an empty ``df`` the means and the standard deviation are NaN and
      the row count is 0.
    * ``Series.std`` is pandas' default sample standard deviation, so it is
      also NaN when ``df`` has a single row.
    """
    rating = df['avg']

    # Insertion order defines the column order of the result.
    features = {
        'avg_appearance': df['appearance'].mean(),
        'avg_aroma': df['aroma'].mean(),
        'avg_palate': df['palate'].mean(),
        'avg_taste': df['taste'].mean(),
        'avg_overall': df['overall'].mean(),
        'avg_rating_per_style': rating.mean(),
        'std_per_style': rating.std(),
        'reviews_per_style': df.shape[0],
    }

    # Build the row in one step instead of enlarging an empty frame through
    # `.loc[0] = [...]`. The explicit float64 dtype keeps the row count stored
    # as a float (e.g. 90.0), exactly as the row-enlargement approach did.
    return pd.DataFrame(
        {style + '_' + name: [value] for name, value in features.items()},
        index=[0],
        dtype='float64',
    )
    

In [98]:
# Build one wide row per state: the state label followed by the
# calc_features() columns of every style, in the order given by `styles`.
# Pieces are collected in lists and concatenated once per state and once
# overall, rather than re-concatenating a growing frame on every iteration
# (which re-copies all accumulated data each time).
state_rows = []

for state in states:

    df_state = df_reduced[df_reduced['user_state'] == state]

    # The first piece carries the state label; every piece is a one-row frame
    # with index label 0, so the column-wise concat lines them up in one row.
    pieces = [pd.DataFrame({'user_state': [state]})]

    for style in styles:
        # May be empty for this state; calc_features is still called so
        # every state ends up with the same set of columns.
        df_style = df_state[df_state['style'] == style]

        df_features = calc_features(style, df_style)

        pieces.append(df_features)

    df_clustering = pd.concat(pieces, axis=1)
    state_rows.append(df_clustering)

# pd.concat raises on an empty list, so fall back to an empty frame when
# `states` is empty (the result the incremental version produced).
df_total_clustering = (
    pd.concat(state_rows, ignore_index=True) if state_rows else pd.DataFrame()
)

    

In [99]:
# Display the assembled frame: one row per entry of `states`, with a
# `user_state` column followed by the per-style feature columns.
df_total_clustering

Unnamed: 0,user_state,Altbier_avg_appearance,Altbier_avg_aroma,Altbier_avg_palate,Altbier_avg_taste,Altbier_avg_overall,Altbier_avg_rating_per_style,Altbier_std_per_style,Altbier_reviews_per_style,American Adjunct Lager_avg_appearance,...,Winter Warmer_std_per_style,Winter Warmer_reviews_per_style,Witbier_avg_appearance,Witbier_avg_aroma,Witbier_avg_palate,Witbier_avg_taste,Witbier_avg_overall,Witbier_avg_rating_per_style,Witbier_std_per_style,Witbier_reviews_per_style
0,Alabama,3.911765,3.727941,3.75,3.867647,3.889706,3.662556,0.234964,90.0,2.795,...,0.237897,229.0,3.644426,3.601351,3.529561,3.53125,3.605574,3.558156,0.303586,667.0
1,Alaska,3.76875,3.5375,3.61875,3.625,3.7125,3.672676,0.178445,71.0,2.938406,...,0.227813,116.0,3.810748,3.721963,3.738318,3.778037,3.824766,3.671274,0.269055,157.0
2,Arizona,3.667431,3.490826,3.552752,3.614679,3.62156,3.66801,0.218246,196.0,3.024648,...,0.230187,531.0,3.612069,3.605172,3.587356,3.589655,3.60977,3.578366,0.334341,820.0
3,Arkansas,3.759259,3.685185,3.763889,3.856481,3.833333,3.627432,0.247304,74.0,2.875,...,0.230005,156.0,3.580769,3.680769,3.601923,3.659615,3.728846,3.545768,0.30195,241.0
4,California,3.775442,3.640487,3.692109,3.720133,3.753687,3.689107,0.200937,1333.0,2.834459,...,0.224417,3385.0,3.678754,3.618131,3.629153,3.626198,3.675639,3.63309,0.315669,6330.0
5,Colorado,3.753106,3.572205,3.64441,3.668478,3.704193,3.660814,0.202352,516.0,2.874459,...,0.222892,1215.0,3.6742,3.6156,3.6344,3.6262,3.696,3.653564,0.288359,2082.0
6,Connecticut,3.795732,3.591463,3.703506,3.75,3.838415,3.681097,0.199377,611.0,2.762899,...,0.252691,827.0,3.737758,3.6858,3.678999,3.717084,3.758705,3.67713,0.308135,1878.0
7,Delaware,3.737179,3.551282,3.647436,3.692308,3.660256,3.645309,0.254603,81.0,3.007812,...,0.28847,131.0,3.685315,3.618881,3.575175,3.627622,3.645105,3.67929,0.28692,324.0
8,Florida,3.724242,3.556061,3.627273,3.690909,3.686364,3.666849,0.257904,365.0,2.669333,...,0.232043,1196.0,3.690498,3.627768,3.620541,3.656212,3.673893,3.604985,0.303931,3402.0
9,Georgia,3.80625,3.604687,3.729687,3.704687,3.782812,3.683918,0.206979,291.0,2.859163,...,0.228749,972.0,3.660819,3.633772,3.647661,3.648148,3.719055,3.635126,0.288622,1945.0
