In [1]:
'''
Reference:
https://scotch.io/tutorials/an-introduction-to-regex-in-python
https://regex101.com/    # for testing regular expressions
https://towardsdatascience.com/how-to-show-all-columns-rows-of-a-pandas-dataframe-c49d4507fcf
https://pbpython.com/text-cleaning.html
https://dataprep.ai/    # data-cleaning library, still under development
https://github.com/thuynh323/Natural-language-processing/blob/master/FuzzyWuzzy%20-%20Ramen%20Rater%20List/Find%20similar%20strings%20with%20FuzzyWuzzy.ipynb    # for fuzzy matching
'''
import pandas as pd
import numpy as np

In [2]:
# Load the 2019 Iowa liquor sales export from the working directory.
# Download: https://data.iowa.gov/Sales-Distribution/2019-Iowa-Liquor-Sales/38x4-vs5h
df = pd.read_csv(filepath_or_buffer='2019_Iowa_Liquor_Sales.csv')

In [3]:
# Show every row when a frame or series is displayed (None = no limit).
# The fully qualified key is used on purpose: set_option treats its first
# argument as a pattern, and the bare 'max_rows' also matches
# 'styler.render.max_rows' on newer pandas releases, which makes the call
# raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_rows', None)

In [4]:
# Preview the first 300 rows of store name and sale amount. Several store
# names look like variants of the same chain; the goal below is to total the
# sales for each such group.
df.loc[:, ['Store Name', 'Sale (Dollars)']].head(300)

Unnamed: 0,Store Name,Sale (Dollars)
0,Sauce,224.64
1,Sauce,207.0
2,Sauce,115.68
3,Hy-Vee Food Store / Dubuque,75.12
4,Kum & Go #121 / Urbandale,66.96
5,Hy-Vee Food Store / Dubuque,56.64
6,Hy-Vee Food Store / Dubuque,680.16
7,Hy-Vee Food Store / Dubuque,218.88
8,IDA Liquor,85.5
9,Lake View Foods,148.68


In [5]:
# Count the distinct store names before any grouping. Called on a single
# column (a Series), nunique() returns one integer; missing values are
# excluded, which is also the default behaviour.
df['Store Name'].nunique(dropna=True)

1759

In [6]:
# Regex patterns that identify the stores of one chain, and the label each
# pattern is replaced with. The two lists are matched by position, so
# criteria[i] maps to new_name[i].
criteria = [
    r"Casey\'s General Store.*",
    r"Fareway Stores.*",
    r"Hy-Vee.*",
    r"Kum & Go.*",
    r"Walgreens.*",
]
new_name = [
    "Casey's General Store",
    "Fareway_Stores",
    "Hy-Vee",
    "Kum_&_Go",
    "Walgreens",
]

In [7]:
# Build a grouped store-name column: names matching a pattern in `criteria`
# get the corresponding label from `new_name`; all other names are kept
# unchanged. regex=True makes pandas interpret the patterns as regular
# expressions rather than literal values.
df['group_store_name'] = df['Store Name'].replace(
    regex=True,
    to_replace=criteria,
    value=new_name,
)

In [8]:
# Compare original and grouped store names side by side for the first 300 rows.
df.loc[:, ['Store Name', 'group_store_name', 'Sale (Dollars)']].head(300)

Unnamed: 0,Store Name,group_store_name,Sale (Dollars)
0,Sauce,Sauce,224.64
1,Sauce,Sauce,207.0
2,Sauce,Sauce,115.68
3,Hy-Vee Food Store / Dubuque,Hy-Vee,75.12
4,Kum & Go #121 / Urbandale,Kum_&_Go,66.96
5,Hy-Vee Food Store / Dubuque,Hy-Vee,56.64
6,Hy-Vee Food Store / Dubuque,Hy-Vee,680.16
7,Hy-Vee Food Store / Dubuque,Hy-Vee,218.88
8,IDA Liquor,IDA Liquor,85.5
9,Lake View Foods,Lake View Foods,148.68


In [9]:
# Count the distinct names that remain after grouping (missing values excluded).
df['group_store_name'].nunique(dropna=True)

931

In [10]:
# Grouping cut the number of distinct store names roughly in half (1759 -> 931).

In [11]:
# Move the grouped store name into the index so that sales can be totalled
# per company group. This modifies `df` in place.
df.set_index(keys=['group_store_name'], inplace=True)

In [12]:
# Total sale amount per grouped store name (the index level set above).
# Series.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0; the
# supported spelling is an explicit groupby on the index level.
# sort=False is intended to keep the groups in first-appearance order, as
# the old level-based sum did.
ndf = df['Sale (Dollars)'].groupby(level='group_store_name', sort=False).sum()

In [13]:
# Show the per-group sales totals as a one-column table, ordered by group name.
ndf.to_frame().sort_index()

Unnamed: 0_level_0,Sale (Dollars)
group_store_name,Unnamed: 1_level_1
'Da Booze Barn / West Bend,35002.38
10th Hole Inn & Suite / Gift Shop,4913.66
1st Stop Beverage Shop,396513.4
218 Fuel Express,174533.3
218 Fuel Express & Chubby's Liquor,58063.45
380BP / Swisher,64649.19
6 Corners Gas & Grub,16479.79
7 Rayos Liquor Store,455568.9
7Star Liquor & Tobacco Outlet,170870.6
A to Z Liquor,206039.2
