In [1]:
import pandas as pd
import pytz
from datetime import datetime
from matplotlib import pyplot as plt

In [2]:
data = pd.read_csv('Sales-products-tz-mod.csv', index_col = 'SaleID')
sales_data = data.copy()
sales_data

Unnamed: 0_level_0,RetailerCountry,RetailerType,Product,Sales Revenue ($),DateOfSale,TimeOfSale,TimeZone
SaleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SaleID_1,United States,Outdoors Shop,TrailChef Deluxe Cook Set,200.0,06/01/2020,23:20:56,EST
SaleID_2,United States,Outdoors Shop,TrailChef Double Flame,7.0,05/02/2020,17:27:08,EST
SaleID_3,United States,Outdoors Shop,Star Dome,20.0,30/10/2020,09:04:43,EST
SaleID_4,United States,Outdoors Shop,Star Gazer 2,40.0,13/11/2020,04:25:06,EST
SaleID_5,Italy,Outdoors Shop,Canyon Mule Carryall,150.5,06/12/2020,11:15:47,CET
...,...,...,...,...,...,...,...
SaleID_96,Australia,Sports Store,Mountain Man Extreme,24.0,23/07/2019,12:30:03,Australia/West
SaleID_97,Australia,Department Store,Firefly Mapreader,1200.0,03/01/2019,01:39:14,Australia/West
SaleID_98,Australia,Discount Retailer,Polar Sun,32.0,14/02/2019,21:29:35,Australia/West
SaleID_99,Australia,Discount Retailer,Polar Ice,18.0,19/06/2020,11:16:19,Australia/West


In [3]:
sales_data[["DateOfSale","TimeOfSale","TimeZone"]].head()

Unnamed: 0_level_0,DateOfSale,TimeOfSale,TimeZone
SaleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SaleID_1,06/01/2020,23:20:56,EST
SaleID_2,05/02/2020,17:27:08,EST
SaleID_3,30/10/2020,09:04:43,EST
SaleID_4,13/11/2020,04:25:06,EST
SaleID_5,06/12/2020,11:15:47,CET


## Part 1 - data preparation

- **Subtask 1:** Store the date and time values in a single column called 'MOS', denoting 'Moment of Sale'. Verify that the time zone values stored in "sales_data" are valid and can be manipulated with the pytz module. *(Data Preparation)*

In [16]:
date_and_time = sales_data["DateOfSale"] + " " + sales_data["TimeOfSale"]
date_and_time.head(10)

SaleID
SaleID_1     06/01/2020 23:20:56
SaleID_2     05/02/2020 17:27:08
SaleID_3     30/10/2020 09:04:43
SaleID_4     13/11/2020 04:25:06
SaleID_5     06/12/2020 11:15:47
SaleID_6     07/06/2020 01:54:41
SaleID_7     27/12/2020 02:07:28
SaleID_8     15/09/2020 11:44:57
SaleID_9     23/09/2020 01:40:29
SaleID_10    24/06/2020 18:43:04
dtype: object

In [19]:
sales_data["MOS"] = pd.to_datetime(date_and_time,format="mixed")
sales_data.head(10)

Unnamed: 0_level_0,RetailerCountry,RetailerType,Product,Sales Revenue ($),DateOfSale,TimeOfSale,TimeZone,MOS
SaleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SaleID_1,United States,Outdoors Shop,TrailChef Deluxe Cook Set,200.0,06/01/2020,23:20:56,EST,2020-06-01 23:20:56
SaleID_2,United States,Outdoors Shop,TrailChef Double Flame,7.0,05/02/2020,17:27:08,EST,2020-05-02 17:27:08
SaleID_3,United States,Outdoors Shop,Star Dome,20.0,30/10/2020,09:04:43,EST,2020-10-30 09:04:43
SaleID_4,United States,Outdoors Shop,Star Gazer 2,40.0,13/11/2020,04:25:06,EST,2020-11-13 04:25:06
SaleID_5,Italy,Outdoors Shop,Canyon Mule Carryall,150.5,06/12/2020,11:15:47,CET,2020-06-12 11:15:47
SaleID_6,Italy,Outdoors Shop,Firefly 4,1300.0,07/06/2020,01:54:41,CET,2020-07-06 01:54:41
SaleID_7,United Kingdom,Outdoors Shop,Husky Rope 50,270.0,27/12/2020,02:07:28,GMT,2020-12-27 02:07:28
SaleID_8,United Kingdom,Outdoors Shop,Granite Signal Mirror,499.99,15/09/2020,11:44:57,GMT,2020-09-15 11:44:57
SaleID_9,United Kingdom,Outdoors Shop,Granite Carabiner,32.0,23/09/2020,01:40:29,GMT,2020-09-23 01:40:29
SaleID_10,Italy,Outdoors Shop,Granite Grip,220.0,24/06/2020,18:43:04,CET,2020-06-24 18:43:04


In [5]:
# to verify that the timezones are valid and can be manipulated by pytz
sales_data["TimeZone"].unique() # SGT is the one that could cause troubles

array(['EST', 'CET', 'GMT', 'EET', 'SGT', 'Australia/West'], dtype=object)

In [12]:
# pytz.timezone("SGT") #==> causes an error
troublesom_timezones = set() #using a set to store just a single value
for i in sales_data["TimeZone"]:
  if i not in pytz.all_timezones:
    troublesom_timezones.add(i)
print(troublesom_timezones)

{'SGT'}


SGT is not in the pytz module, hence, it should be modified. First, DateOfSale and TimeOfSale are dropped to clean the dataframe

In [21]:
sales_data.drop(columns=["TimeOfSale","DateOfSale"],axis=1, inplace=True)
sales_data.head()

Unnamed: 0_level_0,RetailerCountry,RetailerType,Product,Sales Revenue ($),TimeZone,MOS
SaleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SaleID_1,United States,Outdoors Shop,TrailChef Deluxe Cook Set,200.0,EST,2020-06-01 23:20:56
SaleID_2,United States,Outdoors Shop,TrailChef Double Flame,7.0,EST,2020-05-02 17:27:08
SaleID_3,United States,Outdoors Shop,Star Dome,20.0,EST,2020-10-30 09:04:43
SaleID_4,United States,Outdoors Shop,Star Gazer 2,40.0,EST,2020-11-13 04:25:06
SaleID_5,Italy,Outdoors Shop,Canyon Mule Carryall,150.5,CET,2020-06-12 11:15:47


In [22]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, SaleID_1 to SaleID_100
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   RetailerCountry    100 non-null    object        
 1   RetailerType       100 non-null    object        
 2   Product            100 non-null    object        
 3   Sales Revenue ($)  100 non-null    float64       
 4   TimeZone           100 non-null    object        
 5   MOS                100 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 9.5+ KB
