```
% wc -l data/raw/*

 3000001 data/raw/Books_rating.csv
 19666764 data/raw/Liquor_Sales.csv
   18384 data/raw/Sales_April_2019.csv
   12012 data/raw/Sales_August_2019.csv
   25118 data/raw/Sales_December_2019.csv
   12037 data/raw/Sales_February_2019.csv
    9724 data/raw/Sales_January_2019.csv
   14372 data/raw/Sales_July_2019.csv
   13623 data/raw/Sales_June_2019.csv
   15227 data/raw/Sales_March_2019.csv
   16636 data/raw/Sales_May_2019.csv
   17662 data/raw/Sales_November_2019.csv
   20380 data/raw/Sales_October_2019.csv
   11687 data/raw/Sales_September_2019.csv
  212405 data/raw/books_data.csv

% head data/raw/Sales_January_2019.csv

Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
141234,iPhone,1,700,01/22/19 21:25,"944 Walnut St, Boston, MA 02215"
141235,Lightning Charging Cable,1,14.95,01/28/19 14:15,"185 Maple St, Portland, OR 97035"
141236,Wired Headphones,2,11.99,01/17/19 13:33,"538 Adams St, San Francisco, CA 94016"
141237,27in FHD Monitor,1,149.99,01/05/19 20:33,"738 10th St, Los Angeles, CA 90001"
141238,Wired Headphones,1,11.99,01/25/19 11:59,"387 10th St, Austin, TX 73301"
141239,AAA Batteries (4-pack),1,2.99,01/29/19 20:22,"775 Willow St, San Francisco, CA 94016"
141240,27in 4K Gaming Monitor,1,389.99,01/26/19 12:16,"979 Park St, Los Angeles, CA 90001"

% grep ',\s*,' data/raw/Sales_January_2019.csv -c
26
```

In [1]:
import numpy as np
import pandas as pd
import pandas_categorical as pdc

In [2]:
df = pd.read_csv(
        "data/raw/Sales_January_2019.csv",
        engine= 'python', # 'pyarrow'
        on_bad_lines='warn'
    )
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,141234,iPhone,1,700.0,01/22/19 21:25,"944 Walnut St, Boston, MA 02215"
1,141235,Lightning Charging Cable,1,14.95,01/28/19 14:15,"185 Maple St, Portland, OR 97035"
2,141236,Wired Headphones,2,11.99,01/17/19 13:33,"538 Adams St, San Francisco, CA 94016"
3,141237,27in FHD Monitor,1,149.99,01/05/19 20:33,"738 10th St, Los Angeles, CA 90001"
4,141238,Wired Headphones,1,11.99,01/25/19 11:59,"387 10th St, Austin, TX 73301"


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9723 entries, 0 to 9722
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Order ID          9697 non-null   object
 1   Product           9697 non-null   object
 2   Quantity Ordered  9697 non-null   object
 3   Price Each        9697 non-null   object
 4   Order Date        9697 non-null   object
 5   Purchase Address  9697 non-null   object
dtypes: object(6)
memory usage: 455.9+ KB


In [4]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_numeric.html
def numeric_or_nan(df, col, cast):
    df[col] = pd.to_numeric(df[col], errors='coerce', downcast=cast)

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html
def datetime_or_nan(df, col):
    df[col] = pd.to_datetime(df[col], errors='coerce', format="%m/%d/%Y %H:%M")

numeric_or_nan(df, 'Order ID', 'unsigned')
numeric_or_nan(df, 'Quantity Ordered', 'unsigned')
numeric_or_nan(df, 'Price Each', 'float')

datetime_or_nan(df, 'Order Date')
df = df.astype({'Product':'category', 'Purchase Address': 'category'},copy=False)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9723 entries, 0 to 9722
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Order ID          9681 non-null   float64       
 1   Product           9697 non-null   category      
 2   Quantity Ordered  9681 non-null   float64       
 3   Price Each        9681 non-null   float32       
 4   Order Date        0 non-null      datetime64[ns]
 5   Purchase Address  9697 non-null   category      
dtypes: category(2), datetime64[ns](1), float32(1), float64(2)
memory usage: 624.8 KB


In [5]:
pd.value_counts(df.Product).to_frame().reset_index()

Unnamed: 0,Product,count
0,USB-C Charging Cable,1171
1,AAA Batteries (4-pack),1082
2,Lightning Charging Cable,1068
3,AA Batteries (4-pack),1039
4,Wired Headphones,1005
5,Apple Airpods Headphones,809
6,Bose SoundSport Headphones,656
7,27in FHD Monitor,418
8,iPhone,379
9,Google Phone,317


In [6]:
pd.value_counts(df['Purchase Address']).head(40)

Purchase Address
Purchase Address                              16
788 Washington St, San Francisco, CA 94016     4
821 West St, Los Angeles, CA 90001             4
961 Jefferson St, New York City, NY 10001      4
381 Wilson St, San Francisco, CA 94016         4
543 14th St, Boston, MA 02215                  3
112 River St, Seattle, WA 98101                3
540 North St, Atlanta, GA 30301                3
712 Madison St, San Francisco, CA 94016        3
714 Washington St, San Francisco, CA 94016     3
303 10th St, Boston, MA 02215                  3
399 Church St, Boston, MA 02215                3
31 Jefferson St, Atlanta, GA 30301             3
791 Ridge St, San Francisco, CA 94016          3
787 Chestnut St, Dallas, TX 75001              3
785 Cedar St, Boston, MA 02215                 3
985 10th St, San Francisco, CA 94016           3
841 Hickory St, Los Angeles, CA 90001          3
752 2nd St, Austin, TX 73301                   3
236 Lincoln St, Los Angeles, CA 90001          3
641