# ANALYZE SALES

In [1]:
# Load all the needed packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import re
import datetime

In [2]:
# Load the three raw CSV exports; the paths are relative to the working directory
transactions = pd.read_csv(filepath_or_buffer="datasets/transactions.csv")
customers = pd.read_csv(filepath_or_buffer="datasets/customers.csv")
products = pd.read_csv(filepath_or_buffer="datasets/products.csv")

In [3]:
# Preview the first rows of the raw transactions table
transactions.head()

Unnamed: 0,id_prod,date,session_id,client_id
0,0_1483,2021-04-10 18:37:28.723910,s_18746,c_4450
1,2_226,2022-02-03 01:55:53.276402,s_159142,c_277
2,1_374,2021-09-23 15:13:46.938559,s_94290,c_4270
3,0_2186,2021-10-17 03:27:18.783634,s_105936,c_4597
4,0_1351,2021-07-17 20:34:25.800563,s_63642,c_1242


In [4]:
# Column dtypes, non-null counts and memory usage of the raw transactions table
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337016 entries, 0 to 337015
Data columns (total 4 columns):
id_prod       337016 non-null object
date          337016 non-null object
session_id    337016 non-null object
client_id     337016 non-null object
dtypes: object(4)
memory usage: 10.3+ MB


In [5]:
# Column dtypes, non-null counts and memory usage of the raw customers table
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8623 entries, 0 to 8622
Data columns (total 3 columns):
client_id    8623 non-null object
sex          8623 non-null object
birth        8623 non-null int64
dtypes: int64(1), object(2)
memory usage: 202.2+ KB


In [6]:
# Preview the first rows of the raw products table
products.head()

Unnamed: 0,id_prod,price,categ
0,0_1421,19.99,0
1,0_1368,5.13,0
2,0_731,17.99,0
3,1_587,4.99,1
4,0_1507,3.99,0


In [7]:
# Column dtypes, non-null counts and memory usage of the raw products table
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3287 entries, 0 to 3286
Data columns (total 3 columns):
id_prod    3287 non-null object
price      3287 non-null float64
categ      3287 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 77.2+ KB


## Cleaning all datasets

### Cleaning transactions dataset  
#### Define

- **Some `date` values start with the prefix `test` and must be handled**
- **The `date` variable must be a *datetime*, not a string**  
- **We must split `id_prod` into a category and an `id_prod`**
- **We must split `session_id` into a session id and a `session_category`**
- **We must split `client_id` into a client id and a `client_category`**  

#### Code

In [8]:
# Write  a function to split a column
def split_columns(dataset, col, sep="_"):
    """Split a "<prefix><sep><suffix>" string column into its two parts.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that holds the column to split.
    col : str
        Name of the string column, e.g. ``"id_prod"`` with values like ``"0_1483"``.
    sep : str, default "_"
        Separator between the prefix and the suffix.

    Returns
    -------
    tuple of pandas.Series
        ``(suffix, prefix)``: the token after the first separator, and the
        token before it converted to upper case. Both keep the index of
        ``dataset``.

    Raises
    ------
    IndexError
        If a value does not contain ``sep``; a malformed identifier fails
        loudly instead of silently producing a missing value.
    """
    # Split every value once and reuse the token lists for both outputs.
    tokens = dataset[col].map(lambda value: value.split(sep))
    suffix = tokens.map(lambda parts: parts[1])
    prefix = tokens.map(lambda parts: parts[0].upper())
    return suffix, prefix

In [9]:
# Work on a deep copy so the raw transactions frame stays untouched
transactions_clean = transactions.copy(deep=True)

In [10]:
# Split id_prod into 2 columns: id_prod (product number) and category (prefix).
# The raw `transactions` frame is used as input so this cell can be re-run:
# once transactions_clean["id_prod"] is overwritten it no longer contains "_",
# and splitting it a second time would raise IndexError.
transactions_clean["id_prod"], transactions_clean["category"] = split_columns(transactions, "id_prod")

In [11]:
# Split client_id into 2 columns: client_id (client number) and client_category (prefix).
# The raw `transactions` frame is used as input so this cell can be re-run:
# once transactions_clean["client_id"] is overwritten it no longer contains "_",
# and splitting it a second time would raise IndexError.
transactions_clean["client_id"], transactions_clean["client_category"] = split_columns(transactions, "client_id")

In [12]:
# Split session_id into 2 columns: session_id (session number) and session_category (prefix).
# The raw `transactions` frame is used as input so this cell can be re-run:
# once transactions_clean["session_id"] is overwritten it no longer contains "_",
# and splitting it a second time would raise IndexError.
transactions_clean["session_id"], transactions_clean["session_category"] = split_columns(transactions, "session_id")

In [13]:
# Sanity check: preview the first rows to confirm that id_prod, client_id and
# session_id were each split into a number column and a category column
transactions_clean.head()

Unnamed: 0,id_prod,date,session_id,client_id,category,client_category,session_category
0,1483,2021-04-10 18:37:28.723910,18746,4450,0,C,S
1,226,2022-02-03 01:55:53.276402,159142,277,2,C,S
2,374,2021-09-23 15:13:46.938559,94290,4270,1,C,S
3,2186,2021-10-17 03:27:18.783634,105936,4597,0,C,S
4,1351,2021-07-17 20:34:25.800563,63642,1242,0,C,S


In [14]:
# Inspect the rows whose product category is 'T'
transactions_clean[transactions_clean["category"] == "T"]

Unnamed: 0,id_prod,date,session_id,client_id,category,client_category,session_category
1431,0,test_2021-03-01 02:30:02.237420,0,1,T,CT,S
2365,0,test_2021-03-01 02:30:02.237446,0,1,T,CT,S
2895,0,test_2021-03-01 02:30:02.237414,0,1,T,CT,S
5955,0,test_2021-03-01 02:30:02.237441,0,0,T,CT,S
7283,0,test_2021-03-01 02:30:02.237434,0,1,T,CT,S
13745,0,test_2021-03-01 02:30:02.237443,0,0,T,CT,S
20470,0,test_2021-03-01 02:30:02.237442,0,0,T,CT,S
22347,0,test_2021-03-01 02:30:02.237412,0,1,T,CT,S
26359,0,test_2021-03-01 02:30:02.237439,0,1,T,CT,S
26407,0,test_2021-03-01 02:30:02.237426,0,0,T,CT,S


There are 200 rows whose date starts with `test`. We can guess that they were created only to *test* whether the system was working. These rows are not useful for our analysis, so we will remove them.
We can also note that the test took place on 2021-03-01 at 02:30:02 am.

In [15]:
# Remove all the test rows (product category 'T').
# The explicit .copy() makes the filtered result an independent frame, so
# assigning columns to it later does not trigger SettingWithCopyWarning.
transactions_clean = transactions_clean.query("category != 'T'").copy()

In [16]:
# Check that no test rows remain; the cell is silent when the check passes.
# The comparison is done element-wise first and only then reduced with .all():
# reducing the string column first and comparing the single result to "T"
# would not inspect the individual values.
assert (transactions_clean["category"] != "T").all(), "test rows are still present"

In [20]:
# Parse the date strings into a datetime column.
# pd.to_datetime is used because the unit-less astype("datetime64") cast is
# rejected by pandas >= 2.0; .assign builds a new frame instead of setting an
# attribute on a filtered frame, which avoids SettingWithCopyWarning.
transactions_clean = transactions_clean.assign(date=pd.to_datetime(transactions_clean["date"]))

In [30]:
# Assert that the date column has a datetime dtype, then preview it
assert pd.api.types.is_datetime64_any_dtype(transactions_clean["date"]), "date column is not datetime"
transactions_clean["date"].head()

0   2021-04-10 18:37:28.723910
1   2022-02-03 01:55:53.276402
2   2021-09-23 15:13:46.938559
3   2021-10-17 03:27:18.783634
4   2021-07-17 20:34:25.800563
Name: date, dtype: datetime64[ns]