### Import Packages

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dask.dataframe as dd
import dask.array as da
import dask.bag as db

### Prep

In [51]:
#read in tables
countiesdf = pd.read_csv('./data/county.csv', index_col = 0)
pricesdf = pd.read_csv('./data/price.csv', index_col = 0)
productsdf = pd.read_csv('./data/product.csv', index_col = 0)
storesdf = pd.read_csv('./data/store.csv', index_col = 0)
vendorsdf = pd.read_csv('./data/vendor.csv', index_col = 0)
categoriesdf = pd.read_csv('./data/category.csv', index_col = 0)
transactionsdf = pd.read_csv('./data/transaction.csv', index_col = 0)

  mask |= (ar1 == a)


In [52]:
#creates lists of college towns and zip codes
college_towns = ['Ames', 'Iowa City', 'Waterloo', 'Cedar Falls', 'Waterloo-Cedar Falls']
college_zips = [50010, 50011, 50012, 50013, 50014, 52240, 52242, 52243, 52244, 52245, 
                52246, 50701, 50702, 50703, 50704, 50707, 50613, 50614]

##### Checking and converting data types

In [53]:
countiesdf.head()

Unnamed: 0,CountyNumber,County
0,68,Monroe
1,17,Cerro Gordo
2,1,Adair
3,85,Story
4,21,Clay


In [54]:
#checks data types in countiesdf
print(countiesdf.dtypes)

CountyNumber     int64
County          object
dtype: object


In [55]:
pricesdf.head()

Unnamed: 0,ItemNumber,StateBottleCost,StateBottleRetail,Date
28783,42723,7.49,11.24,2018-01-02
30030,54448,11.29,16.94,2018-01-02
26935,22151,12.45,18.68,2018-01-02
28770,34546,7.0,10.5,2018-01-02
26937,43316,11.99,17.99,2018-01-02


In [56]:
#checks data types in pricesdf
print(pricesdf.dtypes)

ItemNumber             int64
StateBottleCost      float64
StateBottleRetail    float64
Date                  object
dtype: object


In [57]:
#convert date column to date type
pricesdf['Date'] = pd.to_datetime(pricesdf['Date'], infer_datetime_format=True)

In [58]:
#checks that date column is now date type
print(pricesdf.dtypes)

ItemNumber                    int64
StateBottleCost             float64
StateBottleRetail           float64
Date                 datetime64[ns]
dtype: object


In [59]:
productsdf.head()

Unnamed: 0,ItemNumber,ItemDescription,Pack,BottleVolume(ml)
0,36308,Hawkeye Vodka,6,1750
1,45888,Sailor Jerry Spiced Navy Rum,6,1750
2,55084,Paramount Blackberry Brandy,24,375
3,67557,Kamora Coffee Liqueur,12,1000
4,77487,Tortilla Gold Dss,12,1000


In [60]:
#checks data types in productsdf
print(productsdf.dtypes)

ItemNumber           int64
ItemDescription     object
Pack                 int64
BottleVolume(ml)     int64
dtype: object


In [61]:
storesdf.head()

Unnamed: 0,StoreNumber,StoreName,City,ZipCode,CountyNumber,Date
24166,5224,Keystone Liquor & Wine / Coralville,Coralville,52241.0,52,2018-01-02
24130,4872,Casey's General Store #2644 / Earlham,Earlham,50072.0,61,2018-01-02
21828,5254,Discount Liquor,Cedar Rapids,52402.0,57,2018-01-02
23760,4485,DYNO'S 51 / SANBORN,Sanborn,51248.0,71,2018-01-02
24246,4900,Kwik Stop 4 / Waterloo,Waterloo,50703.0,7,2018-01-02


In [62]:
#checks data types in storesdf
print(storesdf.dtypes)

StoreNumber       int64
StoreName        object
City             object
ZipCode         float64
CountyNumber      int64
Date             object
dtype: object


In [63]:
#convert date column to date type
storesdf['Date'] = pd.to_datetime(storesdf['Date'], infer_datetime_format=True)

In [64]:
#checks that date column is now date type
print(storesdf.dtypes)

StoreNumber              int64
StoreName               object
City                    object
ZipCode                float64
CountyNumber             int64
Date            datetime64[ns]
dtype: object


In [65]:
#convert zip code to integer type and convert NA's to 99999
storesdf['ZipCode'] = storesdf['ZipCode'].fillna(99999).astype(int)

In [66]:
#checks that zip code column is now int type
print(storesdf.dtypes)

StoreNumber              int64
StoreName               object
City                    object
ZipCode                  int32
CountyNumber             int64
Date            datetime64[ns]
dtype: object


In [67]:
vendorsdf.head()

Unnamed: 0,VendorName,VendorNumber
0,Luxco-St Louis,434.0
1,"WILLIAM GRANT AND SONS, INC.",240.0
2,Jim Beam Brands,65.0
3,"Sazerac Co., Inc.",421.0
4,Phillips Beverage Company,380.0


In [68]:
print(vendorsdf.dtypes)

VendorName       object
VendorNumber    float64
dtype: object


In [69]:
#converts vendor numbers to int type
vendorsdf['VendorNumber'] = vendorsdf['VendorNumber'].astype(int)

In [70]:
#checks that vendor numbers are now int type
print(vendorsdf.dtypes)

VendorName      object
VendorNumber     int32
dtype: object


In [71]:
categoriesdf.head()

Unnamed: 0,Category,CategoryName
73,1031100,100 PROOF VODKA
17660,1022200,100% Agave Tequila
4478,1082010,AMARETTO - IMPORTED
419,1101100,AMERICAN ALCOHOL
199,1081010,AMERICAN AMARETTO


In [72]:
print(categoriesdf.dtypes)

Category         int64
CategoryName    object
dtype: object


In [73]:
transactionsdf.head()

Unnamed: 0_level_0,Date,StoreNumber,Category,VendorNumber,ItemNumber,BottlesSold,Sale
InvoiceNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
INV-09556300101,2018-01-02,5427,1032200,370,34117,2,44.98
INV-09554900093,2018-01-02,5054,1082000,192,65251,1,44.3
INV-09550400102,2018-01-02,2549,1031100,380,37348,6,68.4
INV-09553400013,2018-01-02,5060,1011400,85,26821,1,13.59
INV-09567800102,2018-01-02,2505,1081600,434,86833,2,13.98


In [74]:
print(transactionsdf.dtypes)

Date             object
StoreNumber       int64
Category          int64
VendorNumber      int64
ItemNumber        int64
BottlesSold       int64
Sale            float64
dtype: object


In [75]:
#convert date column to date type
transactionsdf['Date'] = pd.to_datetime(transactionsdf['Date'], infer_datetime_format=True)

In [76]:
#checks that date column is now date type
print(transactionsdf.dtypes)

Date            datetime64[ns]
StoreNumber              int64
Category                 int64
VendorNumber             int64
ItemNumber               int64
BottlesSold              int64
Sale                   float64
dtype: object


##### Modifying Categories DF

In [78]:
categoriesdf.head()

Unnamed: 0,Category,CategoryName
73,1031100,100 PROOF VODKA
17660,1022200,100% Agave Tequila
4478,1082010,AMARETTO - IMPORTED
419,1101100,AMERICAN ALCOHOL
199,1081010,AMERICAN AMARETTO


In [94]:
#add new general category column to categories df
categoriesdf['GenCategory'] = pd.NaT

In [98]:
#fills NA category names as none
categoriesdf['CategoryName'] = categoriesdf['CategoryName'].fillna('None')

In [162]:
#fill general category based on category name
categoriesdf.loc[categoriesdf['CategoryName'].str.contains('VODKA|vodka|Vodka', regex = True), 'GenCategory'] = 'Vodka'
categoriesdf.loc[categoriesdf['CategoryName'].str.contains('TEQUILA|tequila|Tequila', regex = True), 'GenCategory'] = 'Tequila'
categoriesdf.loc[categoriesdf['CategoryName'].str.contains('AMARETTO|amaretto|Amaretto|ANISETTE|SCHNAPPS|Schnapps|Liqueur|Liqueurs|LIQEURS|LIQUEURS|CREME DE ALMOND|CREME DE CACAO|GREEN CREME|TRIPLE SEC|Triple Sec|CREME DE MENTHE', regex = True), 'GenCategory'] = 'Liqueur'
categoriesdf.loc[categoriesdf['CategoryName'].str.contains('COCKTAILS|COCKTAIL|cocktails|cocktail|Cocktails|Cocktail', regex = True), 'GenCategory'] = 'Cocktials'
categoriesdf.loc[categoriesdf['CategoryName'].str.contains('GINS|GIN|gins|gin|Gins|Gin', regex = True), 'GenCategory'] = 'Gin'
categoriesdf.loc[categoriesdf['CategoryName'].str.contains('BRANDIES|BRANDY|brandies|brandy|Brandies|Brandy', regex = True), 'GenCategory'] = 'Brandy'
categoriesdf.loc[categoriesdf['CategoryName'].str.contains('RUM|rum|Rum', regex = True), 'GenCategory'] = 'Rum'
categoriesdf.loc[categoriesdf['CategoryName'].str.contains('WHISKIES|WHISKY|WHISKEY|whiskies|whisky|whiskey|Whiskies|Whisky|Whiskey|BOURBON|bourbon|Bourbon|SCOTCH|Scotch', regex = True), 'GenCategory'] = 'Whiskey'
categoriesdf.loc[categoriesdf['CategoryName'].str.contains('HIGH PROOF BEER', regex = True), 'GenCategory'] = 'High Proof Beer'
categoriesdf.loc[categoriesdf['CategoryName'].str.contains('Mezcal', regex = True), 'GenCategory'] = 'Mezcal'
categoriesdf.loc[categoriesdf['CategoryName'].str.contains('ROCK & RYE', regex = False), 'GenCategory'] = 'Liqueur'

In [165]:
#fills all other general categories as unknown
categoriesdf['GenCategory'] = categoriesdf['GenCategory'].fillna('Unknown')

In [167]:
categoriesdf.head(60)

Unnamed: 0,Category,CategoryName,GenCategory
73,1031100,100 PROOF VODKA,Vodka
17660,1022200,100% Agave Tequila,Tequila
4478,1082010,AMARETTO - IMPORTED,Liqueur
419,1101100,AMERICAN ALCOHOL,Unknown
199,1081010,AMERICAN AMARETTO,Liqueur
16,1071100,AMERICAN COCKTAILS,Cocktials
10,1041100,AMERICAN DRY GINS,Gin
23,1051010,AMERICAN GRAPE BRANDIES,Brandy
1759,1041200,AMERICAN SLOE GINS,Gin
5670,1081020,ANISETTE,Liqueur
