# TOC
1. Import Libraries & Data     
2. Inspecting filtered data   
3. Renaming columns to best practices (no capitalization, special characters)   
4. Converting date column to datetime format   
5. Checking for null values   
6. Separating 'zip_code' nulls into csv for updating   
7. Separating 'category' nulls into csv for updating  
8. Dropping 'store_location' and 'county_num' columns (not needed)
9. Dropping 'zip_code' nulls & 'category' nulls from main data (merge prep)  
10. Setting correct datatypes   
11. Merging dataframes via pd.concat & checking null values     
12. Resetting dataframe index   
13. Correcting minor issues (removing commas from address fields, correcting misspelled city names)   
14. Exporting final cleaned dataframe to pickle to begin analysis. 

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os
import datetime

In [2]:
# Importing filtered data
path = r'C:\Users\Ryzen RGB Madness!!!\Iowa Liquor Sales Analysis'

In [3]:
liquor_data = pd.read_pickle(os.path.join(path, '01 - Data', 'Original', 'iowa_liquor_sales_jan18_to_dec22_orig.pkl'))

In [4]:
# Inspecting filtered data
liquor_data.shape

(12537545, 24)

In [5]:
# Checking for null values
liquor_data.isnull().sum()

invoice_number               0
date                         0
store_number                 0
store_name                   0
address                   8578
city                      8578
zip_code                  8600
store_location         1208917
county_num             1253835
county                    8578
category                  5621
category_name             5621
vendor_number                7
vendor_name                  7
item_number                  0
item_description             0
pack                         0
bottle_volume_ml             0
state_bottle_cost            0
state_bottle_retail          0
bottles_sold                 0
sale_dollars                 0
volume_sold_liters           0
volume_sold_gallons          0
dtype: int64

In [6]:
# Separating zip code nulls into separate dataframe
address_issues = liquor_data[liquor_data['zip_code'].isnull()==True]

In [7]:
address_issues.shape

(8600, 24)

In [8]:
# Saving missing addresses to csv to impute data (check against records in main dataset)
address_issues.to_csv(os.path.join(path, '01 - Data', 'Original', 'iowa_liquor_sales_jan18_to_dec22_missing_addys2.csv'))

In [9]:
# Separating category nulls into separate dataframe
category_issues = liquor_data[liquor_data['category'].isnull()==True]

In [10]:
category_issues.shape

(5621, 24)

In [11]:
# Saving missing categories to csv to impute data (check against records in main dataset)
category_issues.to_csv(os.path.join(path, '01 - Data', 'Original', 'iowa_liquor_sales_jan18_to_dec22_missing_cats2.csv'))

In [12]:
# Removing nulls
liquor_data_updated = liquor_data.dropna(axis=0,subset=['address'])

In [13]:
liquor_data_updated.shape

(12528967, 24)

In [14]:
# Checking for null values
liquor_data_updated.isnull().sum()

invoice_number               0
date                         0
store_number                 0
store_name                   0
address                      0
city                         0
zip_code                    22
store_location         1200339
county_num             1245257
county                       0
category                  5610
category_name             5610
vendor_number                7
vendor_name                  7
item_number                  0
item_description             0
pack                         0
bottle_volume_ml             0
state_bottle_cost            0
state_bottle_retail          0
bottles_sold                 0
sale_dollars                 0
volume_sold_liters           0
volume_sold_gallons          0
dtype: int64

In [15]:
liquor_data_updated = liquor_data.dropna(axis=0,subset=['zip_code'])

In [16]:
liquor_data_updated.shape

(12528945, 24)

In [17]:
# Checking for null values
liquor_data_updated.isnull().sum()

invoice_number               0
date                         0
store_number                 0
store_name                   0
address                      0
city                         0
zip_code                     0
store_location         1200339
county_num             1245235
county                       0
category                  5610
category_name             5610
vendor_number                7
vendor_name                  7
item_number                  0
item_description             0
pack                         0
bottle_volume_ml             0
state_bottle_cost            0
state_bottle_retail          0
bottles_sold                 0
sale_dollars                 0
volume_sold_liters           0
volume_sold_gallons          0
dtype: int64

In [18]:
liquor_data_updated = liquor_data_updated.dropna(axis=0,subset=['category'])

In [19]:
liquor_data_updated.shape

(12523335, 24)

In [20]:
# Checking for null values
liquor_data_updated.isnull().sum()

invoice_number               0
date                         0
store_number                 0
store_name                   0
address                      0
city                         0
zip_code                     0
store_location         1199924
county_num             1245235
county                       0
category                     0
category_name                0
vendor_number                7
vendor_name                  7
item_number                  0
item_description             0
pack                         0
bottle_volume_ml             0
state_bottle_cost            0
state_bottle_retail          0
bottles_sold                 0
sale_dollars                 0
volume_sold_liters           0
volume_sold_gallons          0
dtype: int64

In [21]:
# Removing store_location & county_num from main dataframe
liquor_data_updated = liquor_data_updated.drop(columns=['store_location','county_num'])

In [22]:
# Importing updated csv data
updated_info = pd.read_csv(os.path.join(path, '01 - Data', 'Original', 'iowa_liquor_sales_updated_info.csv'))

In [23]:
updated_info.shape

(14210, 23)

In [24]:
liquor_data_updated.shape

(12523335, 22)

In [25]:
liquor_data_updated.describe()

Unnamed: 0,date,store_number,category,vendor_number,pack,bottle_volume_ml,state_bottle_cost,state_bottle_retail,bottles_sold,sale_dollars,volume_sold_liters,volume_sold_gallons
count,12523335,12523340.0,12523340.0,12523330.0,12523340.0,12523340.0,12523340.0,12523340.0,12523340.0,12523340.0,12523340.0,12523340.0
mean,2020-07-26 08:48:33.829837824,4040.714,1054589.0,278.4899,12.15238,849.5249,11.00302,16.50641,11.4668,154.4708,9.310626,2.454669
min,2018-01-02 00:00:00,2106.0,1011000.0,10.0,1.0,20.0,0.33,0.5,-360.0,-9720.0,-630.0,-166.42
25%,2019-05-08 00:00:00,2633.0,1012200.0,205.0,6.0,600.0,5.78,8.67,3.0,38.25,1.5,0.39
50%,2020-08-07 00:00:00,4106.0,1031200.0,260.0,12.0,750.0,8.74,13.11,6.0,81.0,4.8,1.26
75%,2021-10-19 00:00:00,5113.0,1062500.0,395.0,12.0,1000.0,13.5,20.25,12.0,161.64,10.5,2.77
max,2022-12-30 00:00:00,10106.0,1901200.0,978.0,120.0,6000.0,18436.0,27654.0,13200.0,279557.3,13200.0,3487.07
std,,1247.433,99828.48,142.9499,7.982139,524.3411,12.04768,18.07121,33.25281,544.1892,39.38463,10.4045


In [26]:
updated_info.dtypes

Unnamed: 0               int64
invoice_number          object
date                    object
store_number             int64
store_name              object
address                 object
city                    object
zip_code                 int64
county                  object
category                 int64
category_name           object
vendor_number            int64
vendor_name             object
item_number              int64
item_description        object
pack                     int64
bottle_volume_ml         int64
state_bottle_cost      float64
state_bottle_retail    float64
bottles_sold             int64
sale_dollars           float64
volume_sold_liters     float64
volume_sold_gallons    float64
dtype: object

In [27]:
# Converting updated_info 'date' column to datetime
updated_info['date'] = pd.to_datetime(updated_info['date'], format="%m/%d/%Y")

In [28]:
updated_info.dtypes

Unnamed: 0                      int64
invoice_number                 object
date                   datetime64[ns]
store_number                    int64
store_name                     object
address                        object
city                           object
zip_code                        int64
county                         object
category                        int64
category_name                  object
vendor_number                   int64
vendor_name                    object
item_number                     int64
item_description               object
pack                            int64
bottle_volume_ml                int64
state_bottle_cost             float64
state_bottle_retail           float64
bottles_sold                    int64
sale_dollars                  float64
volume_sold_liters            float64
volume_sold_gallons           float64
dtype: object

In [29]:
updated_info['date'].describe()

count                            14210
mean     2019-09-11 17:18:36.228008192
min                2018-01-02 00:00:00
25%                2018-12-12 00:00:00
50%                2019-07-09 00:00:00
75%                2019-11-26 00:00:00
max                2022-10-31 00:00:00
Name: date, dtype: object

In [30]:
# Removing index column
updated_info.dtypes

Unnamed: 0                      int64
invoice_number                 object
date                   datetime64[ns]
store_number                    int64
store_name                     object
address                        object
city                           object
zip_code                        int64
county                         object
category                        int64
category_name                  object
vendor_number                   int64
vendor_name                    object
item_number                     int64
item_description               object
pack                            int64
bottle_volume_ml                int64
state_bottle_cost             float64
state_bottle_retail           float64
bottles_sold                    int64
sale_dollars                  float64
volume_sold_liters            float64
volume_sold_gallons           float64
dtype: object

In [31]:
updated_info = updated_info.drop('Unnamed: 0', axis=1)

In [32]:
updated_info.columns

Index(['invoice_number', 'date', 'store_number', 'store_name', 'address',
       'city', 'zip_code', 'county', 'category', 'category_name',
       'vendor_number', 'vendor_name', 'item_number', 'item_description',
       'pack', 'bottle_volume_ml', 'state_bottle_cost', 'state_bottle_retail',
       'bottles_sold', 'sale_dollars', 'volume_sold_liters',
       'volume_sold_gallons'],
      dtype='object')

In [33]:
# Merging dataframes (test)
pd.concat([liquor_data_updated, updated_info], axis=0)

Unnamed: 0,invoice_number,date,store_number,store_name,address,city,zip_code,county,category,category_name,...,item_number,item_description,pack,bottle_volume_ml,state_bottle_cost,state_bottle_retail,bottles_sold,sale_dollars,volume_sold_liters,volume_sold_gallons
680506,INV-14210900001,2018-09-04,4617,LICKETY LIQUOR,2501 HUBBELL AVE,DES MOINES,50317,POLK,1031100.0,AMERICAN VODKAS,...,36904,MCCORMICK VODKA PET,24,375,1.80,2.70,48,129.60,18.00,4.75
1377843,INV-16812800013,2019-01-08,5151,IDA LIQUOR,"500, HWY 175",IDA GROVE,51445,IDA,1031100.0,AMERICAN VODKAS,...,38176,TITOS HANDMADE VODKA,12,750,9.64,14.46,12,173.52,9.00,2.37
1382870,INV-18087600031,2019-03-12,5151,IDA LIQUOR,"500, HWY 175",IDA GROVE,51445,IDA,1011600.0,STRAIGHT RYE WHISKIES,...,27102,TEMPLETON 4YR RYE,6,750,18.09,27.14,6,162.84,4.50,1.18
1385902,INV-15226900023,2018-10-23,5151,IDA LIQUOR,"500, HWY 175",IDA GROVE,51445,IDA,1031100.0,AMERICAN VODKAS,...,35918,FIVE O'CLOCK VODKA,6,1750,7.20,10.80,6,64.80,10.50,2.77
1385954,INV-17301000008,2019-02-01,2445,RUBACK'S FOOD CENTER,504 SOUTH HIGHWAY,OAKLAND,51560,POTTAWATTAMIE,1031100.0,AMERICAN VODKAS,...,36306,HAWKEYE VODKA,12,750,3.34,5.01,12,60.12,9.00,2.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14205,INV-52593300018,2022-10-31,10081,COLFAX TRAVEL CENTER / COLFAX,1405 NORTH WALNUT STREET,COLFAX,50054,JASPER,1011300.0,SINGLE BARREL BOURBON WHISKIES,...,27380,ANGELS ENVY RYE,6,750,44.99,67.49,1,67.49,0.75,0.19
14206,INV-52593300019,2022-10-31,10081,COLFAX TRAVEL CENTER / COLFAX,1405 NORTH WALNUT STREET,COLFAX,50054,JASPER,1081400.0,AMERICAN SCHNAPPS,...,84226,99 STRAWBERRIES MINI,10,50,5.16,7.74,1,7.74,0.05,0.01
14207,INV-52593300020,2022-10-31,10081,COLFAX TRAVEL CENTER / COLFAX,1405 NORTH WALNUT STREET,COLFAX,50054,JASPER,1081400.0,AMERICAN SCHNAPPS,...,84222,99 BLUE RASPBERRIES MINI,10,50,5.16,7.74,1,7.74,0.05,0.01
14208,INV-52593300021,2022-10-31,10081,COLFAX TRAVEL CENTER / COLFAX,1405 NORTH WALNUT STREET,COLFAX,50054,JASPER,1091300.0,NEUTRAL GRAIN SPIRITS FLAVORED,...,76067,MIDNIGHT MOON WATERMELON MINI,4,50,17.16,25.74,1,25.74,0.05,0.01


In [34]:
# Merging dataframes into new dataframe
liquor_data_complete = pd.concat([liquor_data_updated, updated_info], axis=0)

In [35]:
liquor_data_complete.shape

(12537545, 22)

In [36]:
liquor_data_complete['date'].describe()

count                         12537545
mean     2020-07-26 00:08:30.314642432
min                2018-01-02 00:00:00
25%                2019-05-08 00:00:00
50%                2020-08-06 00:00:00
75%                2021-10-19 00:00:00
max                2022-12-30 00:00:00
Name: date, dtype: object

In [38]:
# Resetting index
liquor_data_complete.reset_index(inplace=True, drop=True)

In [39]:
liquor_data_complete.dtypes

invoice_number                 object
date                   datetime64[ns]
store_number                    int64
store_name                     object
address                        object
city                           object
zip_code                       object
county                         object
category                      float64
category_name                  object
vendor_number                 float64
vendor_name                    object
item_number                    object
item_description               object
pack                            int64
bottle_volume_ml                int64
state_bottle_cost             float64
state_bottle_retail           float64
bottles_sold                    int64
sale_dollars                  float64
volume_sold_liters            float64
volume_sold_gallons           float64
dtype: object

In [40]:
liquor_data_complete.describe()

Unnamed: 0,date,store_number,category,vendor_number,pack,bottle_volume_ml,state_bottle_cost,state_bottle_retail,bottles_sold,sale_dollars,volume_sold_liters,volume_sold_gallons
count,12537545,12537540.0,12537540.0,12537540.0,12537540.0,12537540.0,12537540.0,12537540.0,12537540.0,12537540.0,12537540.0,12537540.0
mean,2020-07-26 00:08:30.314642432,4040.913,1054603.0,278.446,12.15107,849.5272,11.00411,16.50806,11.46606,154.4963,9.310477,2.45463
min,2018-01-02 00:00:00,2106.0,1011000.0,10.0,1.0,20.0,0.33,0.5,-360.0,-9720.0,-630.0,-166.42
25%,2019-05-08 00:00:00,2633.0,1012200.0,205.0,6.0,600.0,5.78,8.67,3.0,38.25,1.5,0.39
50%,2020-08-06 00:00:00,4106.0,1031200.0,260.0,12.0,750.0,8.74,13.11,6.0,81.0,4.8,1.26
75%,2021-10-19 00:00:00,5113.0,1062500.0,395.0,12.0,1000.0,13.5,20.25,12.0,161.64,10.5,2.77
max,2022-12-30 00:00:00,10106.0,1901200.0,978.0,120.0,6000.0,18436.0,27654.0,13200.0,279557.3,13200.0,3487.07
std,,1248.485,99844.52,142.962,7.980532,524.3446,12.04841,18.07231,33.25119,544.3291,39.3824,10.40391


In [42]:
# Correcting data types
convert_dict = {'invoice_number': str,
                'store_number': str,
                'store_name': str,
                'address': str,
                'city': str,
                'zip_code': str,
                'county': str,
                'category': str,
                'category_name': str,
                'vendor_number': str,
                'vendor_name': str,
                'item_number': str,
                'item_description': str                
                }

In [43]:
liquor_data_complete = liquor_data_complete.astype(convert_dict)

In [44]:
liquor_data_complete.dtypes

invoice_number                 object
date                   datetime64[ns]
store_number                   object
store_name                     object
address                        object
city                           object
zip_code                       object
county                         object
category                       object
category_name                  object
vendor_number                  object
vendor_name                    object
item_number                    object
item_description               object
pack                            int64
bottle_volume_ml                int64
state_bottle_cost             float64
state_bottle_retail           float64
bottles_sold                    int64
sale_dollars                  float64
volume_sold_liters            float64
volume_sold_gallons           float64
dtype: object

In [49]:
# Converting non-numerical number identifiers (store number, category, vendor number, item number) to string
liquor_data_complete['store_number'] = liquor_data_complete['store_number'].astype('string')

In [50]:
liquor_data_complete.dtypes

invoice_number                 object
date                   datetime64[ns]
store_number           string[python]
store_name                     object
address                        object
city                           object
zip_code                       object
county                         object
category                       object
category_name                  object
vendor_number                  object
vendor_name                    object
item_number                    object
item_description               object
pack                            int64
bottle_volume_ml                int64
state_bottle_cost             float64
state_bottle_retail           float64
bottles_sold                    int64
sale_dollars                  float64
volume_sold_liters            float64
volume_sold_gallons           float64
dtype: object

In [51]:
liquor_data_complete['category'] = liquor_data_complete['category'].astype('string')

In [52]:
liquor_data_complete['vendor_number'] = liquor_data_complete['vendor_number'].astype('string')

In [53]:
liquor_data_complete['item_number'] = liquor_data_complete['item_number'].astype('string')

In [54]:
liquor_data_complete.dtypes

invoice_number                 object
date                   datetime64[ns]
store_number           string[python]
store_name                     object
address                        object
city                           object
zip_code                       object
county                         object
category               string[python]
category_name                  object
vendor_number          string[python]
vendor_name                    object
item_number            string[python]
item_description               object
pack                            int64
bottle_volume_ml                int64
state_bottle_cost             float64
state_bottle_retail           float64
bottles_sold                    int64
sale_dollars                  float64
volume_sold_liters            float64
volume_sold_gallons           float64
dtype: object

In [55]:
# Removing commas from address column
liquor_data_complete['address'] = liquor_data_complete['address'].str.replace(","," ")

In [56]:
# Checking commas
liquor_data_complete.loc[liquor_data_complete['invoice_number'] == 'INV-14279100033']

Unnamed: 0,invoice_number,date,store_number,store_name,address,city,zip_code,county,category,category_name,...,item_number,item_description,pack,bottle_volume_ml,state_bottle_cost,state_bottle_retail,bottles_sold,sale_dollars,volume_sold_liters,volume_sold_gallons
12525824,INV-14279100033,2018-09-06,5251,NORTHSIDE LIQUOR,1303 NORTH FEDERAL,MASON CITY,50401,CERRO GORDO,1062500.0,FLAVORED RUM,...,43086,BACARDI COCO,12,750,8.25,12.38,2,24.76,1.5,0.39


In [57]:
# Correcting misspelled city
liquor_data_complete['city'] = liquor_data_complete['city'].str.replace("OTUMWA","OTTUMWA")

In [58]:
# Checking city error
liquor_data_complete.loc[liquor_data_complete['city'] == 'OTUMWA']

Unnamed: 0,invoice_number,date,store_number,store_name,address,city,zip_code,county,category,category_name,...,item_number,item_description,pack,bottle_volume_ml,state_bottle_cost,state_bottle_retail,bottles_sold,sale_dollars,volume_sold_liters,volume_sold_gallons


In [59]:
# Correcting misspelled city
liquor_data_complete['city'] = liquor_data_complete['city'].str.replace("ARNOLD'S PARK","ARNOLDS PARK")

In [60]:
# Checking city error
liquor_data_complete.loc[liquor_data_complete['city'] == "ARNOLD'S PARK"]

Unnamed: 0,invoice_number,date,store_number,store_name,address,city,zip_code,county,category,category_name,...,item_number,item_description,pack,bottle_volume_ml,state_bottle_cost,state_bottle_retail,bottles_sold,sale_dollars,volume_sold_liters,volume_sold_gallons


In [62]:
# Verifying dates
liquor_data_complete['date'].describe()

count                         12537545
mean     2020-07-26 00:08:30.314642432
min                2018-01-02 00:00:00
25%                2019-05-08 00:00:00
50%                2020-08-06 00:00:00
75%                2021-10-19 00:00:00
max                2022-12-30 00:00:00
Name: date, dtype: object

In [63]:
# Exporting complete cleaned dataframe
liquor_data_complete.to_pickle(os.path.join(path, '01 - Data', 'Cleaned', 'iowa_liquor_data_complete.pkl'))