# Dataset Cleaning

## Import data

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../original_data/dataset.csv', low_memory=False)

In [3]:
df.head()

Unnamed: 0,Order_ID,Product,Quantity_Ordered,Price_Each,Order_Date,Purchase_Address
0,,,,,,
1,,Product,,,,Purchase Address
2,0.0,,,,,
3,0.0,20in Monitor,1.0,1.0998999786376952e+16,2001-02-19 11:31:00.0000000,"80 2nd St, Los Angeles, CA 90001"
4,0.0,20in Monitor,1.0,1.0998999786376952e+16,2001-02-19 11:46:00.0000000,"420 Washington St, Boston, MA 02215"


## Rename columns to lower case

In [4]:
df.columns = [i.lower() for i in df.columns]

## Check `dtypes`

In [5]:
df.dtypes

order_id            float64
product              object
quantity_ordered    float64
price_each           object
order_date           object
purchase_address     object
dtype: object

## Check the different values of each column

In [6]:
for col in df.columns:
    print(f'Unique values for column {col}: {df[col].unique().shape[0]}\n')
    print(df[col].unique(), end='\n\n')

Unique values for column order_id: 168930

[    nan      0.  15502. ... 319668. 319669. 319670.]

Unique values for column product: 21

[nan 'Product' '20in Monitor' '27in 4K Gaming Monitor' '27in FHD Monitor'
 '34in Ultrawide Monitor' 'AA Batteries (4-pack)' 'AAA Batteries (4-pack)'
 'Apple Airpods Headphones' 'Bose SoundSport Headphones' 'Flatscreen TV'
 'Google Phone' 'iPhone' 'LG Dryer' 'LG Washing Machine'
 'Lightning Charging Cable' 'Macbook Pro Laptop' 'ThinkPad Laptop'
 'USB-C Charging Cable' 'Vareebadd Phone' 'Wired Headphones']

Unique values for column quantity_ordered: 10

[nan  1.  2.  3.  4.  5.  6.  7.  9.  8.]

Unique values for column price_each: 18

[nan '109,98999786376953' '389,989990234375' '149,99000549316406'
 '379,989990234375' '3,8399999141693115' '2,9900000095367432' '150'
 '99,989997863769531' '300' '600' '700' '14,949999809265137' '1700'
 '999,989990234375' '11,949999809265137' '400' '11,989999771118164']

Unique values for column order_date: 142396

[nan '2

## Cleaning and adjusting dtypes

Some records have `nan` values from importation. Check them and convert to `NaN`.

In [7]:
df.order_id = df.order_id.replace('nan', np.nan).astype(float)

In [8]:
df.quantity_ordered = df.quantity_ordered.replace('nan', np.nan).astype(float)

In [9]:
df.price_each = pd.Series( [i.replace(',', '.') if i == i else i for i in df.price_each] ).astype(float)

## Check `order_date` values

In [10]:
df.order_date

0                                 NaN
1                                 NaN
2                                 NaN
3         2001-02-19 11:31:00.0000000
4         2001-02-19 11:46:00.0000000
                     ...             
185685    2011-12-19 20:58:00.0000000
185686    2001-12-19 12:01:00.0000000
185687    2009-12-19 06:43:00.0000000
185688    2003-12-19 10:39:00.0000000
185689    2021-12-19 21:45:00.0000000
Name: order_date, Length: 185690, dtype: object

Dates are in the wrong format. I must remove the `20` in front of every value.

In [11]:
dates = [i[2:] if i == i else i for i in df.order_date ]

In [12]:
dates[:10]

[nan,
 nan,
 nan,
 '01-02-19 11:31:00.0000000',
 '01-02-19 11:46:00.0000000',
 '01-02-19 13:39:00.0000000',
 '01-02-19 14:35:00.0000000',
 '01-02-19 18:39:00.0000000',
 '01-02-19 20:08:00.0000000',
 '01-02-19 20:37:00.0000000']

I could have used this format as parameter to `pd.to_datetime()`: '%d-%m-%y %H:%M:%S.%f'. See [reference](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes).  
Using this format leads to getting dates only for April, so in the end, I will let Pandas to infer the dates.

In [13]:
df.sample(10)

Unnamed: 0,order_id,product,quantity_ordered,price_each,order_date,purchase_address
166578,301262.0,27in FHD Monitor,1.0,149.990005,2008-12-19 11:22:00.0000000,"829 Johnson St, Los Angeles, CA 90001"
143093,278703.0,27in 4K Gaming Monitor,1.0,389.98999,2015-10-19 08:19:00.0000000,"166 14th St, Dallas, TX 75001"
85640,223417.0,27in FHD Monitor,1.0,149.990005,2013-07-19 18:03:00.0000000,"238 River St, San Francisco, CA 94016"
118154,254727.0,Flatscreen TV,1.0,300.0,2012-09-19 12:44:00.0000000,"546 1st St, San Francisco, CA 94016"
176739,311059.0,USB-C Charging Cable,1.0,11.95,2002-12-19 17:41:00.0000000,"970 14th St, San Francisco, CA 94016"
74151,212384.0,USB-C Charging Cable,1.0,11.95,2014-06-19 12:49:00.0000000,"859 Madison St, Los Angeles, CA 90001"
30662,170681.0,Macbook Pro Laptop,1.0,1700.0,2020-03-19 11:57:00.0000000,"760 West St, New York City, NY 10001"
79667,217686.0,AAA Batteries (4-pack),1.0,2.99,2017-06-19 17:54:00.0000000,"134 2nd St, Portland, OR 97035"
67662,206162.0,27in 4K Gaming Monitor,1.0,389.98999,2002-05-19 14:49:00.0000000,"214 11th St, New York City, NY 10001"
24406,164656.0,Lightning Charging Cable,1.0,14.95,2010-03-19 19:45:00.0000000,"357 West St, Los Angeles, CA 90001"


In [14]:
pd.to_datetime(dates)

  pd.to_datetime(dates)


DatetimeIndex([                'NaT',                 'NaT',
                               'NaT', '2019-01-02 11:31:00',
               '2019-01-02 11:46:00', '2019-01-02 13:39:00',
               '2019-01-02 14:35:00', '2019-01-02 18:39:00',
               '2019-01-02 20:08:00', '2019-01-02 20:37:00',
               ...
               '2019-11-12 21:24:00', '2019-12-31 19:07:00',
               '2019-07-12 08:25:00', '2019-12-30 01:06:00',
               '2019-12-15 11:13:00', '2019-11-12 20:58:00',
               '2019-01-12 12:01:00', '2019-09-12 06:43:00',
               '2019-03-12 10:39:00', '2019-12-21 21:45:00'],
              dtype='datetime64[ns]', length=185690, freq=None)

In [15]:
df.order_date = pd.to_datetime(dates)

  df.order_date = pd.to_datetime(dates)


## Check dtypes again:

In [16]:
df.dtypes

order_id                   float64
product                     object
quantity_ordered           float64
price_each                 float64
order_date          datetime64[ns]
purchase_address            object
dtype: object

## Check missing values

In [17]:
df.isnull().sum()

order_id            2
product             2
quantity_ordered    4
price_each          4
order_date          4
purchase_address    2
dtype: int64

In [18]:
df[df.isnull().sum(axis=1) >= 2]

Unnamed: 0,order_id,product,quantity_ordered,price_each,order_date,purchase_address
0,,,,,NaT,
1,,Product,,,NaT,Purchase Address
2,0.0,,,,NaT,
7319,0.0,Product,,,NaT,Purchase Address


I will drop those records.

In [19]:
index_to_drop = df[df.isnull().sum(axis=1) >= 2].index
df = df.drop(index_to_drop)

## Change `dtypes` from `float` (presence of `nan` values) to `int`:

In [20]:
df.isnull().sum()

order_id            0
product             0
quantity_ordered    0
price_each          0
order_date          0
purchase_address    0
dtype: int64

In [21]:
df.quantity_ordered = df.quantity_ordered.astype(int)

## Add a column with the `month_name`:

In [22]:
df['month_name'] = df.order_date.dt.strftime('%B')

### Turn `month_name` to `categorical dtype`:  
I will turn the `month_name` column into an ordered categorical variable.

In [23]:
months_ordered = 'january, february, march, april, may, june, july, august, september, october, november, december'.title().split(', ')

In [24]:
df.month_name = df.month_name.astype(pd.CategoricalDtype(categories=months_ordered, ordered=True))

In [25]:
df.head()

Unnamed: 0,order_id,product,quantity_ordered,price_each,order_date,purchase_address,month_name
3,0.0,20in Monitor,1,109.989998,2019-01-02 11:31:00,"80 2nd St, Los Angeles, CA 90001",January
4,0.0,20in Monitor,1,109.989998,2019-01-02 11:46:00,"420 Washington St, Boston, MA 02215",January
5,0.0,20in Monitor,1,109.989998,2019-01-02 13:39:00,"649 2nd St, San Francisco, CA 94016",January
6,0.0,20in Monitor,1,109.989998,2019-01-02 14:35:00,"852 Jefferson St, San Francisco, CA 94016",January
7,0.0,20in Monitor,1,109.989998,2019-01-02 18:39:00,"467 Chestnut St, Portland, OR 97035",January


## City and state information

First, I need to extract information about the cities from where the orders were placed. 

In [26]:
addresses = df.purchase_address

In [27]:
addresses.sample(5)

15822    366 Sunset St, New York City, NY 10001
636       298 Maple St, New York City, NY 10001
41789          12 Highland St, Boston, MA 02215
57279            373 11th St, Atlanta, GA 30301
85366           825 North St, Seattle, WA 98101
Name: purchase_address, dtype: object

Technically, if I split the address using the comma as separator, I can easily get the city as the third element of the resulting list.

In [28]:
split_address_info = addresses.str.split(',')
split_address_info.sample(5)

141706     [281 Maple St,  Los Angeles,  CA 90001]
118731    [405 Wilson St,  Los Angeles,  CA 90001]
157919      [552 Dogwood St,  Portland,  OR 97035]
62338         [879 North St,  Portland,  OR 97035]
95382     [21 River St,  New York City,  NY 10001]
Name: purchase_address, dtype: object

In [29]:
cities = [i[1].strip() for i in split_address_info]
cities[:5]

['Los Angeles', 'Boston', 'San Francisco', 'San Francisco', 'Portland']

In [32]:
state_codes = [i[2].split()[0] for i in split_address_info]
state_codes[:5]

['CA', 'MA', 'CA', 'CA', 'OR']

Once I have checked the procedure works, let's apply it to the dataset.

In [34]:
df['city'] = cities
df['state_code'] = state_codes

Finally, I want to translate the state codes to their corresponding names. Thanks to ChatGPT for providing this information! 

In [35]:
usa_states = {
    'AL': 'Alabama',
    'AK': 'Alaska',
    'AZ': 'Arizona',
    'AR': 'Arkansas',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'IA': 'Iowa',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'MD': 'Maryland',
    'MA': 'Massachusetts',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MS': 'Mississippi',
    'MO': 'Missouri',
    'MT': 'Montana',
    'NE': 'Nebraska',
    'NV': 'Nevada',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NY': 'New York',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VT': 'Vermont',
    'VA': 'Virginia',
    'WA': 'Washington',
    'WV': 'West Virginia',
    'WI': 'Wisconsin',
    'WY': 'Wyoming'
}

In [36]:
df['state'] = [usa_states[i] for i in df.state_code]

In [37]:
df.sample(20)

Unnamed: 0,order_id,product,quantity_ordered,price_each,order_date,purchase_address,month_name,city,state_code,state
112695,249468.0,Wired Headphones,1,11.99,2019-09-25 11:12:00,"271 Adams St, Los Angeles, CA 90001",September,Los Angeles,CA,California
152456,287713.0,Wired Headphones,1,11.99,2019-11-11 16:59:00,"807 Walnut St, Los Angeles, CA 90001",November,Los Angeles,CA,California
12495,142336.0,20in Monitor,1,109.989998,2019-01-28 14:32:00,"571 Hill St, Portland, OR 97035",January,Portland,OR,Oregon
152784,288031.0,Bose SoundSport Headphones,1,99.989998,2019-11-16 08:16:00,"498 Forest St, Seattle, WA 98101",November,Seattle,WA,Washington
113834,250567.0,USB-C Charging Cable,1,11.95,2019-09-22 11:34:00,"940 Dogwood St, Atlanta, GA 30301",September,Atlanta,GA,Georgia
66154,204723.0,27in FHD Monitor,1,149.990005,2019-05-19 16:57:00,"938 Lincoln St, Boston, MA 02215",May,Boston,MA,Massachusetts
64080,202735.0,27in 4K Gaming Monitor,1,389.98999,2019-05-24 02:06:00,"30 Chestnut St, Portland, OR 97035",May,Portland,OR,Oregon
152558,287812.0,Wired Headphones,1,11.99,2019-11-22 19:43:00,"737 Lincoln St, Boston, MA 02215",November,Boston,MA,Massachusetts
170073,304626.0,Google Phone,1,600.0,2019-12-27 09:30:00,"68 7th St, New York City, NY 10001",December,New York City,NY,New York
42602,182163.0,Lightning Charging Cable,1,14.95,2019-07-04 23:25:00,"480 Meadow St, San Francisco, CA 94016",July,San Francisco,CA,California


## Are there issues with order_ID values?

In [38]:
df.order_id.sample(10)

163833    298635.0
99723     236997.0
109957    246841.0
118790    255335.0
120397    256898.0
153123    288355.0
130521    266650.0
60856     199654.0
96419     233806.0
26918     167072.0
Name: order_id, dtype: float64

Check length of the order_ID figures

In [39]:
df.order_id.apply(lambda x : len(str(x)) ).unique()

array([3, 7, 8], dtype=int64)

In [40]:
strange_order_id_values = [i for i in df.order_id if len(str(i)) == 3]
strange_order_id_values[:5]

[0.0, 0.0, 0.0, 0.0, 0.0]

I will set as `NaN` this order_id values

In [41]:
df.order_id = df.order_id.astype(float)

In [42]:
df.order_id = df.order_id.replace(0, np.nan)

How many missing order_id values?

In [43]:
df.order_id.isnull().sum()

9891

In [44]:
df[df.order_id.isnull()].sample(10)

Unnamed: 0,order_id,product,quantity_ordered,price_each,order_date,purchase_address,month_name,city,state_code,state
6479,,Lightning Charging Cable,1,14.95,2019-02-15 04:06:00,"260 6th St, San Francisco, CA 94016",February,San Francisco,CA,California
5568,,iPhone,1,700.0,2019-03-02 08:29:00,"251 Forest St, San Francisco, CA 94016",March,San Francisco,CA,California
2045,,AA Batteries (4-pack),1,3.84,2019-02-28 13:51:00,"739 4th St, New York City, NY 10001",February,New York City,NY,New York
9267,,Wired Headphones,1,11.99,2019-12-02 09:23:00,"677 River St, Seattle, WA 98101",December,Seattle,WA,Washington
9293,,Wired Headphones,1,11.99,2019-12-02 21:34:00,"403 Sunset St, Austin, TX 73301",December,Austin,TX,Texas
6311,,Lightning Charging Cable,1,14.95,2019-10-02 19:37:00,"574 Maple St, Atlanta, GA 30301",October,Atlanta,GA,Georgia
4961,,Flatscreen TV,1,300.0,2019-01-02 11:57:00,"693 11th St, San Francisco, CA 94016",January,San Francisco,CA,California
1601,,AA Batteries (4-pack),1,3.84,2019-12-02 09:05:00,"130 Hickory St, Portland, OR 97035",December,Portland,OR,Oregon
5196,,Flatscreen TV,1,300.0,2019-02-27 10:44:00,"651 Forest St, San Francisco, CA 94016",February,San Francisco,CA,California
8781,,USB-C Charging Cable,2,11.95,2019-02-25 20:11:00,"668 Maple St, Portland, OR 97035",February,Portland,OR,Oregon


In [45]:
df.isnull().sum()

order_id            9891
product                0
quantity_ordered       0
price_each             0
order_date             0
purchase_address       0
month_name             0
city                   0
state_code             0
state                  0
dtype: int64

These records seem to have only missing values in the order_id column. Since `order_id` is not a crucial information, I can leave them as it is.

# Last check before exportation

In [46]:
df.describe()

Unnamed: 0,order_id,quantity_ordered,price_each,order_date
count,175795.0,185686.0,185686.0,185686
mean,233429.545351,1.124544,184.519255,2019-07-13 04:25:30.597783552
min,15502.0,1.0,2.99,2019-01-01 03:07:00
25%,192955.5,1.0,11.95,2019-04-13 13:55:15
50%,235131.0,1.0,14.95,2019-07-12 08:03:30
75%,277405.5,1.0,150.0,2019-10-19 17:24:15
max,319670.0,9.0,1700.0,2020-01-01 05:13:00
std,53068.64761,0.443069,332.843838,


In [47]:
df.sample(15)

Unnamed: 0,order_id,product,quantity_ordered,price_each,order_date,purchase_address,month_name,city,state_code,state
183977,318018.0,AA Batteries (4-pack),2,3.84,2019-12-14 14:28:00,"157 Johnson St, Atlanta, GA 30301",December,Atlanta,GA,Georgia
57005,195962.0,20in Monitor,1,109.989998,2019-05-14 16:47:00,"884 Spruce St, Seattle, WA 98101",May,Seattle,WA,Washington
48124,187464.0,iPhone,1,700.0,2019-05-04 22:30:00,"460 Center St, Austin, TX 73301",May,Austin,TX,Texas
144677,280231.0,Bose SoundSport Headphones,1,99.989998,2019-07-11 20:04:00,"156 Lakeview St, San Francisco, CA 94016",July,San Francisco,CA,California
152639,287892.0,Flatscreen TV,1,300.0,2019-10-11 15:52:00,"517 Chestnut St, Seattle, WA 98101",October,Seattle,WA,Washington
171484,305982.0,Lightning Charging Cable,1,14.95,2019-12-12 13:43:00,"739 5th St, Austin, TX 73301",December,Austin,TX,Texas
91067,228647.0,Apple Airpods Headphones,1,150.0,2019-07-14 11:35:00,"572 13th St, Los Angeles, CA 90001",July,Los Angeles,CA,California
133209,269226.0,AA Batteries (4-pack),1,3.84,2019-12-10 02:01:00,"173 13th St, Portland, OR 97035",December,Portland,OR,Oregon
29338,169407.0,27in FHD Monitor,1,149.990005,2019-09-03 19:37:00,"210 Washington St, Boston, MA 02215",September,Boston,MA,Massachusetts
28847,168933.0,Apple Airpods Headphones,1,150.0,2019-03-24 12:11:00,"67 Church St, New York City, NY 10001",March,New York City,NY,New York


# Export the dataset to a csv file

In [48]:
df = df.sort_values('order_id')
df.to_csv('clean_dataset.csv', index=False)