## Make a cleaner DataFrame with as much detail as useful

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# The Before version, takes several secs to load (300+ MB)
# No dtype or date-parsing hints are passed, so pandas infers every column type.
df = pd.read_csv('bigframe.csv')
# Preview the first rows to see what the raw columns look like.
df.head()

Unnamed: 0.1,Unnamed: 0,checkin_id,beer.bid,user.uid,rating_score,brewery.brewery_name,beer.beer_name,beer.beer_style,brewery.brewery_id,brewery.brewery_type,...,venue.location.venue_country,venue.location.venue_state,venue.primary_category,venue.venue_id,checkin_comment,created_at,venue.type,beer.rating_score,beer.beer_description,abv
0,0,821797539,2095023,3340203,3.75,Stone Brewing,Stone Scorpion Bowl IPA,IPA - American,1204,Regional Brewery,...,,,,,"Easy-drinking, not too hoppy. Solid.","Sat, 26 Oct 2019 03:52:50 +0000",[],3.73789,To create a recipe so tropical and fruity with...,7.5
1,219,818949121,1709568,3340203,3.5,Ritual Brewing Company,Pale Ale,Pale Ale - American,39329,Micro Brewery,...,United States,CA,Nightlife Spot,376422.0,Solid.,"Sat, 19 Oct 2019 03:07:43 +0000",['brewery'],3.43165,,5.2
2,221,818856642,2734572,3340203,4.25,Ritual Brewing Company,Oil Rig,IPA - Imperial / Double,39329,Micro Brewery,...,United States,CA,Nightlife Spot,376422.0,Gooood stuff. Cool place too.,"Sat, 19 Oct 2019 00:37:37 +0000",['brewery'],3.87873,NEW TRADITIONS CALL FOR NEW HOLIDAYS AND NEW H...,9.0
3,226,815159720,1044097,3340203,4.25,Stone Brewing,Stone Ruination Double IPA 2.0,IPA - Imperial / Double,1204,Regional Brewery,...,,,,,Probably the best Stone beer I’ve had. I’m a fan.,"Thu, 10 Oct 2019 02:20:55 +0000",[],4.0129,Stone Ruination IPA was the first full-time br...,8.5
4,331,814916483,1070,3340203,3.75,Lagunitas Brewing Company,Imperial Stout,Stout - Russian Imperial,765,Macro Brewery,...,,,,,"A really nice, if unspectacular, stout.","Wed, 09 Oct 2019 04:25:53 +0000",[],3.9142,"Made with Highly roasted malted barley, and pl...",9.9


In [3]:
# (rows, columns) of the frame as loaded, before any cleaning
df.shape

(1571140, 28)

In [4]:
# Inspect the raw column names before they are renamed in the next cell
df.columns

Index(['Unnamed: 0', 'checkin_id', 'beer.bid', 'user.uid', 'rating_score',
       'brewery.brewery_name', 'beer.beer_name', 'beer.beer_style',
       'brewery.brewery_id', 'brewery.brewery_type', 'brewery.country_name',
       'brewery.location.brewery_city', 'brewery.location.brewery_state',
       'brewery.location.lat', 'brewery.location.lng', 'venue.location.lat',
       'venue.location.lng', 'venue.location.venue_city',
       'venue.location.venue_country', 'venue.location.venue_state',
       'venue.primary_category', 'venue.venue_id', 'checkin_comment',
       'created_at', 'venue.type', 'beer.rating_score',
       'beer.beer_description', 'abv'],
      dtype='object')

In [5]:
# Get rid of the useless leftover index column(s).
# read_csv labels a header-less index column 'Unnamed: 0' (then 'Unnamed: 0.1', ...
# if the file went through more than one save/load cycle). Dropping by name
# instead of slicing off position 0 keeps this cell safe to re-run: it can
# never remove a real column such as checkin_id.
leftover_index_cols = [col for col in df.columns if str(col).startswith('Unnamed:')]
df = df.drop(columns=leftover_index_cols)

# Rename the dotted source names to flat snake_case ones.
# rename() returns a new frame, so no frame is mutated in place; keys that are
# not present are silently skipped (pandas' default), so a re-run is a no-op.
df = df.rename(columns={'beer.bid':'beer_id', 'user.uid':'user_id', 'rating_score':'rating_user',
       'brewery.brewery_name':'brewery_name', 'beer.beer_name':'beer_name',
       'beer.beer_style':'beer_style', 'brewery.brewery_id':'brewery_id',
       'brewery.brewery_type':'brewery_type', 'brewery.country_name':'brewery_country',
       'brewery.location.brewery_city':'brewery_city',
       'brewery.location.brewery_state':'brewery_state',
       'brewery.location.lat':'brewery_lat', 'brewery.location.lng':'brewery_lon',
       'venue.location.lat':'venue_lat', 'venue.location.lng':'venue_lon',
       'venue.location.venue_city':'venue_city',
       'venue.location.venue_country':'venue_country',
       'venue.location.venue_state':'venue_state',
       'venue.primary_category':'venue_cat', 'venue.venue_id':'venue_id',
       'beer.rating_score':'rating_global',
       'beer.beer_description':'beer_description'})

In [6]:
df.columns

Index(['checkin_id', 'beer_id', 'user_id', 'rating_user', 'brewery_name',
       'beer_name', 'beer_style', 'brewery_id', 'brewery_type',
       'brewery_country', 'brewery_city', 'brewery_state', 'brewery_lat',
       'brewery_lon', 'venue_lat', 'venue_lon', 'venue_city', 'venue_country',
       'venue_state', 'venue_cat', 'venue_id', 'checkin_comment', 'created_at',
       'venue.type', 'rating_global', 'beer_description', 'abv'],
      dtype='object')

In [7]:
# Missed one dotted name in the previous rename.
# Rebind the result instead of using inplace=True, so the frame is never
# mutated behind the back of an earlier cell's displayed output.
df = df.rename(columns={'venue.type': 'venue_type'})
df.columns

Index(['checkin_id', 'beer_id', 'user_id', 'rating_user', 'brewery_name',
       'beer_name', 'beer_style', 'brewery_id', 'brewery_type',
       'brewery_country', 'brewery_city', 'brewery_state', 'brewery_lat',
       'brewery_lon', 'venue_lat', 'venue_lon', 'venue_city', 'venue_country',
       'venue_state', 'venue_cat', 'venue_id', 'checkin_comment', 'created_at',
       'venue_type', 'rating_global', 'beer_description', 'abv'],
      dtype='object')

In [8]:
# Per-column dtype and non-null count; shows which columns have missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1571140 entries, 0 to 1571139
Data columns (total 27 columns):
checkin_id          1571140 non-null int64
beer_id             1571140 non-null int64
user_id             1571140 non-null int64
rating_user         1571140 non-null float64
brewery_name        1571140 non-null object
beer_name           1571140 non-null object
beer_style          1571140 non-null object
brewery_id          1571140 non-null int64
brewery_type        1571140 non-null object
brewery_country     1571140 non-null object
brewery_city        1554436 non-null object
brewery_state       1546325 non-null object
brewery_lat         1571140 non-null float64
brewery_lon         1571140 non-null float64
venue_lat           1101509 non-null float64
venue_lon           1101509 non-null float64
venue_city          1052604 non-null object
venue_country       1101367 non-null object
venue_state         1087951 non-null object
venue_cat           1092671 non-null object
venue_

In [9]:
# Preview the frame again to confirm the renamed columns
df.head()

Unnamed: 0,checkin_id,beer_id,user_id,rating_user,brewery_name,beer_name,beer_style,brewery_id,brewery_type,brewery_country,...,venue_country,venue_state,venue_cat,venue_id,checkin_comment,created_at,venue_type,rating_global,beer_description,abv
0,821797539,2095023,3340203,3.75,Stone Brewing,Stone Scorpion Bowl IPA,IPA - American,1204,Regional Brewery,United States,...,,,,,"Easy-drinking, not too hoppy. Solid.","Sat, 26 Oct 2019 03:52:50 +0000",[],3.73789,To create a recipe so tropical and fruity with...,7.5
1,818949121,1709568,3340203,3.5,Ritual Brewing Company,Pale Ale,Pale Ale - American,39329,Micro Brewery,United States,...,United States,CA,Nightlife Spot,376422.0,Solid.,"Sat, 19 Oct 2019 03:07:43 +0000",['brewery'],3.43165,,5.2
2,818856642,2734572,3340203,4.25,Ritual Brewing Company,Oil Rig,IPA - Imperial / Double,39329,Micro Brewery,United States,...,United States,CA,Nightlife Spot,376422.0,Gooood stuff. Cool place too.,"Sat, 19 Oct 2019 00:37:37 +0000",['brewery'],3.87873,NEW TRADITIONS CALL FOR NEW HOLIDAYS AND NEW H...,9.0
3,815159720,1044097,3340203,4.25,Stone Brewing,Stone Ruination Double IPA 2.0,IPA - Imperial / Double,1204,Regional Brewery,United States,...,,,,,Probably the best Stone beer I’ve had. I’m a fan.,"Thu, 10 Oct 2019 02:20:55 +0000",[],4.0129,Stone Ruination IPA was the first full-time br...,8.5
4,814916483,1070,3340203,3.75,Lagunitas Brewing Company,Imperial Stout,Stout - Russian Imperial,765,Macro Brewery,United States,...,,,,,"A really nice, if unspectacular, stout.","Wed, 09 Oct 2019 04:25:53 +0000",[],3.9142,"Made with Highly roasted malted barley, and pl...",9.9


In [2]:
# Load the second checkin table; it is cleaned alongside df in the cells below
bdf = pd.read_csv('bigBeerFrame.csv')

In [3]:
# Inspect bdf's column names as loaded
bdf.columns

Index(['Unnamed: 0', 'checkin_id', 'date', 'checkin_comment', 'rating_user',
       'user_id', 'venue_lat', 'venue_lon', 'venue_city', 'venue_state',
       'venue_country', 'venue_cat', 'venue_type', 'venue_id', 'beer_id',
       'beer_name', 'abv', 'description', 'beer_style', 'rating_global',
       'brewery_id', 'brewery_name', 'brewery_type', 'brewery_country',
       'brewery_city', 'brewery_state', 'brewery_lat', 'brewery_lon'],
      dtype='object')

In [4]:
# Remove only the leftover index column(s) ('Unnamed: 0', ...) by name.
# Slicing off position 0 would silently discard checkin_id whenever the file
# has no index column, which is the case once this notebook's final cell has
# rewritten bigBeerFrame.csv with index=False.
bdf = bdf.drop(columns=[col for col in bdf.columns if str(col).startswith('Unnamed:')])

In [6]:
# Show the first date value together with its Python type
first_bdf_date = bdf['date'][0]
print(first_bdf_date, type(first_bdf_date))

Sat, 21 Dec 2019 05:52:07 +0000 <class 'str'>


In [8]:
# convert those stringy dates to datetimes
# This took a few minutes
# NOTE(review): no format= is given, so pandas has to work out the layout itself;
# an explicit format would presumably be faster - confirm all rows share one layout.
# The result is kept in the notebook-level name `date`, which the df cells also use.
date = pd.to_datetime(bdf['date'])

In [9]:
# Swap the string column for the parsed timestamps.
# bdf was itself produced by selecting columns from another frame, so writing
# into it with bdf['date'] = ... is chained assignment (SettingWithCopyWarning).
# assign() builds a new frame instead and keeps 'date' in its existing position.
bdf = bdf.assign(date=date)
# First 11 parsed values, to confirm the datetime dtype
bdf['date'].head(11)

0    2019-12-21 05:52:07+00:00
1    2019-12-20 04:26:42+00:00
2    2019-12-20 02:52:06+00:00
3    2019-12-03 21:53:09+00:00
4    2019-12-01 21:53:24+00:00
5    2019-11-25 04:32:51+00:00
6    2019-11-09 20:51:55+00:00
7    2019-09-27 19:37:58+00:00
8    2019-07-17 22:56:21+00:00
9    2019-06-29 16:16:31+00:00
10   2019-05-18 02:19:08+00:00
Name: date, dtype: datetime64[ns, UTC]

## Remove the checkins with no user rating, for the purposes of this project.

In [10]:
# Keep only the rows whose rating_user is strictly positive; per the section
# heading, these are the checkins that carry a user rating.
has_user_rating = df['rating_user'] > 0
df = df[has_user_rating]
df.shape

(1425447, 27)

In [11]:
# Series.min()/max() run vectorised and skip NaN; the Python builtins iterate
# element by element and give order-dependent answers if a NaN is present.
print(f'User ratings range from {df.rating_user.min()} to {df.rating_user.max()}')

User ratings range from 0.25 to 5.0


In [10]:
# Report the shape before and after discarding rows without a positive user rating
print(bdf.shape)
rated_mask = bdf['rating_user'] > 0
bdf = bdf[rated_mask]
print(bdf.shape)

(692291, 27)
(636518, 27)


### See what the brewery columns look like

In [12]:
# .loc slices by label and includes both endpoints: rows labelled up to 30,
# columns brewery_id through brewery_lon. The index was not reset after the
# rating filter, so labels of removed rows are simply absent.
df.loc[:30, 'brewery_id':'brewery_lon']

Unnamed: 0,brewery_id,brewery_type,brewery_country,brewery_city,brewery_state,brewery_lat,brewery_lon
0,1204,Regional Brewery,United States,Escondido,CA,33.1157,-117.12
1,39329,Micro Brewery,United States,Redlands,CA,34.0708,-117.237
2,39329,Micro Brewery,United States,Redlands,CA,34.0708,-117.237
3,1204,Regional Brewery,United States,Escondido,CA,33.1157,-117.12
4,765,Macro Brewery,United States,Petaluma,CA,38.2724,-122.662
5,1534,Regional Brewery,United States,Carlsbad,CA,33.1596,-117.348
6,68356,Regional Brewery,United States,San Diego,CA,32.7542,-117.206
7,908,Regional Brewery,United States,Holland,MI,42.8182,-86.1143
8,1620,Cidery,United States,Julian,CA,33.0978,-116.647
9,1620,Cidery,United States,Julian,CA,33.0978,-116.647


### Check the venue columns

In [13]:
# Same label-based preview for the venue columns (venue_lat through venue_id, inclusive)
df.loc[:30, 'venue_lat':'venue_id']

Unnamed: 0,venue_lat,venue_lon,venue_city,venue_country,venue_state,venue_cat,venue_id
0,,,,,,,
1,34.0707,-117.237,Redlands,United States,CA,Nightlife Spot,376422.0
2,34.0707,-117.237,Redlands,United States,CA,Nightlife Spot,376422.0
3,,,,,,,
4,,,,,,,
5,,,,,,,
6,34.0549,-116.965,,United States,CA,Food,4177151.0
7,,,,,,,
8,,,,,,,
9,,,,,,,


### And the rest of the columns

In [14]:
# Everything from checkin_comment to the last column, for rows labelled up to 30
df.loc[:30, 'checkin_comment':]

Unnamed: 0,checkin_comment,created_at,venue_type,rating_global,beer_description,abv
0,"Easy-drinking, not too hoppy. Solid.","Sat, 26 Oct 2019 03:52:50 +0000",[],3.73789,To create a recipe so tropical and fruity with...,7.5
1,Solid.,"Sat, 19 Oct 2019 03:07:43 +0000",['brewery'],3.43165,,5.2
2,Gooood stuff. Cool place too.,"Sat, 19 Oct 2019 00:37:37 +0000",['brewery'],3.87873,NEW TRADITIONS CALL FOR NEW HOLIDAYS AND NEW H...,9.0
3,Probably the best Stone beer I’ve had. I’m a fan.,"Thu, 10 Oct 2019 02:20:55 +0000",[],4.0129,Stone Ruination IPA was the first full-time br...,8.5
4,"A really nice, if unspectacular, stout.","Wed, 09 Oct 2019 04:25:53 +0000",[],3.9142,"Made with Highly roasted malted barley, and pl...",9.9
5,Can’t believe it took me this long to try this...,"Sat, 05 Oct 2019 00:54:32 +0000",[],3.84851,Note: Swami's IPA is brewed and distributed fr...,6.8
6,Damn good.,"Sat, 14 Sep 2019 22:48:11 +0000",['steakhouse'],4.00465,This deeply juicy stunner in the mold of City ...,6.7
7,MMMmmmmMmmmmmm.,"Thu, 05 Sep 2019 02:44:18 +0000",[],4.0611,A stout with roasty malt character intermingle...,11.0
8,"A nice, easy-drinking cider.","Mon, 02 Sep 2019 04:47:27 +0000",[],3.36631,Julian Harvest Apple Cider is lightly carbonat...,6.99
9,Also bomb. Delayed check-in here.,"Mon, 02 Sep 2019 04:44:11 +0000",[],3.67697,Refreshing raspberry cider. Tart and fruity,6.99


In [15]:
# Share of rows for which each optional beer field is populated.
# Series.sum() counts the True values in one vectorised pass; the Python builtin
# sum() would loop over every element of a ~1.4M-row Series one at a time.
n_rows = df.shape[0]

# What percent of abv's are listed?
abv_listed = (df.abv > 0).sum()
print(f'Percent of beers whose a.b.v. is listed: {abv_listed * 100 / n_rows}')
print()
# What percent of beers have a description?
description_missing = df.beer_description.isnull().sum()
print(f'Percent of beers with a description: {100 - description_missing * 100 / n_rows}')
print()
# What percent have a global rating?
globally_rated = (df.rating_global > 0).sum()
print(f'Percent of beers with a global rating: {globally_rated * 100 / n_rows}')

Percent of beers whose a.b.v. is listed: 98.89227729968214

Percent of beers with a description: 74.22043751889758

Percent of beers with a global rating: 84.87681408007452


In [16]:
# First 15 venue_type values.
# NOTE(review): the displayed values keep their quotes (e.g. ['brewery']), which
# suggests they are strings that look like lists rather than real list objects -
# confirm with type(df.venue_type[1]) before treating them as lists.
df.venue_type[:15]  # empty lists for NaN, and lists of varying length for the others

0                                                    []
1                                           ['brewery']
2                                           ['brewery']
3                                                    []
4                                                    []
5                                                    []
6                                        ['steakhouse']
7                                                    []
8                                                    []
9                                                    []
10                                                   []
11                                      ['zoo', 'park']
12    ['sandwich_place', 'salad_place', 'shopping_ma...
13                                                   []
14                                                   []
Name: venue_type, dtype: object

## The remaining column is the check-in timestamp (`created_at`), which is still stored as strings

In [17]:
# Show the first created_at value together with its Python type
first_created_at = df['created_at'][0]
print(first_created_at, type(first_created_at))

Sat, 26 Oct 2019 03:52:50 +0000 <class 'str'>


In [18]:
# This took several minutes
date = pd.to_datetime(df['created_at'])

# df is a boolean-filtered subset of the loaded frame, so df['date'] = ... would
# be chained assignment (SettingWithCopyWarning). assign() returns a new frame
# with the parsed timestamps appended as the last column.
df = df.assign(date=date)

# First 20 parsed values, to confirm the datetime dtype
df['date'].head(20)

In [20]:
# The parsed 'date' column replaces the raw 'created_at' strings.
# columns= already names the axis, so the extra axis=1 was redundant.
# errors='ignore' makes the cell safe to re-run (no KeyError once the column is
# gone), and rebinding the result avoids mutating the frame in place.
df = df.drop(columns=['created_at'], errors='ignore')
df.head()

Unnamed: 0,checkin_id,beer_id,user_id,rating_user,brewery_name,beer_name,beer_style,brewery_id,brewery_type,brewery_country,...,venue_country,venue_state,venue_cat,venue_id,checkin_comment,venue_type,rating_global,beer_description,abv,date
0,821797539,2095023,3340203,3.75,Stone Brewing,Stone Scorpion Bowl IPA,IPA - American,1204,Regional Brewery,United States,...,,,,,"Easy-drinking, not too hoppy. Solid.",[],3.73789,To create a recipe so tropical and fruity with...,7.5,2019-10-26 03:52:50+00:00
1,818949121,1709568,3340203,3.5,Ritual Brewing Company,Pale Ale,Pale Ale - American,39329,Micro Brewery,United States,...,United States,CA,Nightlife Spot,376422.0,Solid.,['brewery'],3.43165,,5.2,2019-10-19 03:07:43+00:00
2,818856642,2734572,3340203,4.25,Ritual Brewing Company,Oil Rig,IPA - Imperial / Double,39329,Micro Brewery,United States,...,United States,CA,Nightlife Spot,376422.0,Gooood stuff. Cool place too.,['brewery'],3.87873,NEW TRADITIONS CALL FOR NEW HOLIDAYS AND NEW H...,9.0,2019-10-19 00:37:37+00:00
3,815159720,1044097,3340203,4.25,Stone Brewing,Stone Ruination Double IPA 2.0,IPA - Imperial / Double,1204,Regional Brewery,United States,...,,,,,Probably the best Stone beer I’ve had. I’m a fan.,[],4.0129,Stone Ruination IPA was the first full-time br...,8.5,2019-10-10 02:20:55+00:00
4,814916483,1070,3340203,3.75,Lagunitas Brewing Company,Imperial Stout,Stout - Russian Imperial,765,Macro Brewery,United States,...,,,,,"A really nice, if unspectacular, stout.",[],3.9142,"Made with Highly roasted malted barley, and pl...",9.9,2019-10-09 04:25:53+00:00


In [21]:
# Final column list: created_at is gone and date is the last column
df.columns

Index(['checkin_id', 'beer_id', 'user_id', 'rating_user', 'brewery_name',
       'beer_name', 'beer_style', 'brewery_id', 'brewery_type',
       'brewery_country', 'brewery_city', 'brewery_state', 'brewery_lat',
       'brewery_lon', 'venue_lat', 'venue_lon', 'venue_city', 'venue_country',
       'venue_state', 'venue_cat', 'venue_id', 'checkin_comment', 'venue_type',
       'rating_global', 'beer_description', 'abv', 'date'],
      dtype='object')

In [22]:
# Persist the cleaned frame. index=False keeps the row index from being written
# as an extra unnamed first column (it reappears as 'Unnamed: 0' on the next
# read_csv), matching how bdf is saved.
# CSV is plain text either way: the datetimes and venue lists are stored as
# strings and have to be re-parsed after loading.
# NOTE: this overwrites the same 'bigframe.csv' that the notebook loads at the top.
df.to_csv('bigframe.csv', index=False)

In [20]:
# Put bdf's columns in the same sequence used for df. The names are spelled out
# by hand because df is not loaded in this session, so df.columns is unavailable.
# bdf calls the beer text column 'description' (df uses 'beer_description').
# NOTE(review): in the bdf.head() preview, venue_cat shows '[]' while venue_type
# shows e.g. 'Nightlife Spot' - the reverse of df. The two labels may be swapped
# in bigBeerFrame.csv; verify before combining the frames.
column_order = [
    'checkin_id', 'beer_id', 'user_id', 'rating_user',
    'brewery_name', 'beer_name', 'beer_style',
    'brewery_id', 'brewery_type', 'brewery_country', 'brewery_city',
    'brewery_state', 'brewery_lat', 'brewery_lon',
    'venue_lat', 'venue_lon', 'venue_city', 'venue_country',
    'venue_state', 'venue_cat', 'venue_id',
    'checkin_comment', 'venue_type', 'rating_global',
    'description', 'abv', 'date',
]
bdf = bdf[column_order]

In [21]:
# Preview bdf to confirm the new column order
bdf.head()

Unnamed: 0,checkin_id,beer_id,user_id,rating_user,brewery_name,beer_name,beer_style,brewery_id,brewery_type,brewery_country,...,venue_country,venue_state,venue_cat,venue_id,checkin_comment,venue_type,rating_global,description,abv,date
0,843971771,526725,1373368,3.75,Wicks Brewing,Battle Cry IPA,IPA - American,44236,Brew Pub,United States,...,United States,CA,[],3966054.0,,Nightlife Spot,3.63385,,6.8,2019-12-21 05:52:07+00:00
1,843407973,526725,3484155,3.5,Wicks Brewing,Battle Cry IPA,IPA - American,44236,Brew Pub,United States,...,United States,CA,[],9586831.0,Brewed for the House.,Arts & Entertainment,3.63385,,6.8,2019-12-20 04:26:42+00:00
2,843380508,526725,342359,4.0,Wicks Brewing,Battle Cry IPA,IPA - American,44236,Brew Pub,United States,...,United States,CA,[],3966054.0,,Nightlife Spot,3.63385,,6.8,2019-12-20 02:52:06+00:00
3,837180407,526725,6700685,4.5,Wicks Brewing,Battle Cry IPA,IPA - American,44236,Brew Pub,United States,...,,,[],,,,3.63385,,6.8,2019-12-03 21:53:09+00:00
4,836736462,526725,3094250,3.5,Wicks Brewing,Battle Cry IPA,IPA - American,44236,Brew Pub,United States,...,,,[],,,,3.63385,,6.8,2019-12-01 21:53:24+00:00


In [22]:
# Save the cleaned bdf back over the file it was loaded from.
# index=False leaves the row index out of the file, so no extra unnamed
# column appears on the next read_csv.
bdf.to_csv('bigBeerFrame.csv', index=False)