In [1]:
import pandas as pd
import numpy as np
import time

# `ECOMPCTSA.csv`

In [2]:
# Load the raw ECOMPCTSA series, then report its dimensions and preview
# the first rows.
data = pd.read_csv(filepath_or_buffer='data/raw/ECOMPCTSA.csv')

print(f"Shape is: {data.shape}")
data.head(n=5)

Shape is: (92, 2)


Unnamed: 0,DATE,ECOMPCTSA
0,1999-10-01,0.6
1,2000-01-01,0.8
2,2000-04-01,0.9
3,2000-07-01,1.0
4,2000-10-01,1.0


In [3]:
# Lowercase every column label; the trailing expression displays the
# resulting labels.
data.columns = list(map(str.lower, data.columns))
data.columns

Index(['date', 'ecompctsa'], dtype='object')

In [4]:
# Parse the `date` column into pandas datetimes, then show the dtypes to
# confirm the conversion.
parsed_dates = pd.to_datetime(data["date"])
data["date"] = parsed_dates
data.dtypes

date         datetime64[ns]
ecompctsa           float64
dtype: object

In [5]:
# Write the cleaned frame out; index=False keeps the row index out of the file.
data.to_csv(path_or_buf='data/cleaned/ECOMPCTSA.csv', index=False)

# `fredgraph.xls`

In [6]:
# sheet_name=None loads every sheet of the workbook; the result is iterated
# below as (sheet name, DataFrame) pairs.
excel_file = pd.read_excel('data/raw/fredgraph.xls', sheet_name=None)

# Collect every sheet except the one named 'FRED Graph', in workbook order.
data = [
    sheet_frame
    for sheet_title, sheet_frame in excel_file.items()
    if sheet_title != 'FRED Graph'
]

In [7]:
# New column labels for each kept sheet, keyed by its position in `data`.
# NOTE(review): this relies on the sheet order inside the workbook — confirm
# the first sheet really is the percentage series and the second the sales one.
new_headers = {
    0: ['date', 'percentage_total_sales'],
    1: ['date', 'sales_million_dollars'],
}
for position, headers in new_headers.items():
    data[position].columns = headers

In [8]:
# Destination file for each sheet, paired with the sheet's position in `data`.
outputs = (
    (0, 'data/cleaned/ecommerce_sales_vs_total_sales.csv'),
    (1, 'data/cleaned/retail_sales.csv'),
)
for position, destination in outputs:
    # index=False keeps the row index out of the written file.
    data[position].to_csv(destination, index=False)

# `isoc_ec_esels_page_tabular.tsv`

In [9]:
# Load the tab-separated table, then report its dimensions and preview the
# first rows.
data = pd.read_csv(
    filepath_or_buffer='data/raw/isoc_ec_esels_page_tabular.tsv',
    sep='\t',
)

print(f"Shape is: {data.shape}")
data.head(n=5)

Shape is: (40, 11)


Unnamed: 0,"freq,size_emp,nace_r2,indic_is,unit,geo\TIME_PERIOD",2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,"A,GE10,C10-S951_X_K,E_ESELL,PC_ENT,AT",12.6,13.3,14.6,15.3,17.2,14.4,19.6,22.3,23.0,21.8
1,"A,GE10,C10-S951_X_K,E_ESELL,PC_ENT,BA",:,:,:,:,:,21.5,20.7,19.2,20.1,23.5
2,"A,GE10,C10-S951_X_K,E_ESELL,PC_ENT,BE",20.7,22.9,24.6,23.9,24.3,28.8,29.6,25.5,30.6,28.3
3,"A,GE10,C10-S951_X_K,E_ESELL,PC_ENT,BG",4.8,5.6,5.8,5.4,7.3,5.7,7.4,8.3,10.5,11.5
4,"A,GE10,C10-S951_X_K,E_ESELL,PC_ENT,CY",7.3,10.5,10.6,12.7,11.7,12.2,12.7,14.8,17.1,20.1


In [10]:
# Give the composite key column a short name.
# The original header contains a literal backslash ("geo\TIME_PERIOD"), so the
# key is written as a raw string: in a normal literal "\T" is an invalid
# escape sequence, which newer Python versions warn about.
# The result is reassigned instead of using inplace=True so the cell reads as
# an explicit "old frame -> new frame" step.
data = data.rename(
    columns={r'freq,size_emp,nace_r2,indic_is,unit,geo\TIME_PERIOD': 'country_code'}
)
data.head()

Unnamed: 0,country_code,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,"A,GE10,C10-S951_X_K,E_ESELL,PC_ENT,AT",12.6,13.3,14.6,15.3,17.2,14.4,19.6,22.3,23.0,21.8
1,"A,GE10,C10-S951_X_K,E_ESELL,PC_ENT,BA",:,:,:,:,:,21.5,20.7,19.2,20.1,23.5
2,"A,GE10,C10-S951_X_K,E_ESELL,PC_ENT,BE",20.7,22.9,24.6,23.9,24.3,28.8,29.6,25.5,30.6,28.3
3,"A,GE10,C10-S951_X_K,E_ESELL,PC_ENT,BG",4.8,5.6,5.8,5.4,7.3,5.7,7.4,8.3,10.5,11.5
4,"A,GE10,C10-S951_X_K,E_ESELL,PC_ENT,CY",7.3,10.5,10.6,12.7,11.7,12.2,12.7,14.8,17.1,20.1


In [11]:
def extract_country_code(cell):
    """Return the text after the last comma in ``cell``.

    If ``cell`` contains no comma, it is returned unchanged.
    """
    # rpartition yields (head, separator, tail); the tail is the last field.
    _, _, last_field = cell.rpartition(',')
    return last_field

# Reduce each composite key to its final comma-separated field, then preview.
country_codes = data['country_code'].apply(extract_country_code)
data['country_code'] = country_codes
data.head()

Unnamed: 0,country_code,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AT,12.6,13.3,14.6,15.3,17.2,14.4,19.6,22.3,23.0,21.8
1,BA,:,:,:,:,:,21.5,20.7,19.2,20.1,23.5
2,BE,20.7,22.9,24.6,23.9,24.3,28.8,29.6,25.5,30.6,28.3
3,BG,4.8,5.6,5.8,5.4,7.3,5.7,7.4,8.3,10.5,11.5
4,CY,7.3,10.5,10.6,12.7,11.7,12.2,12.7,14.8,17.1,20.1


In [12]:
def transform_col(cell):
    """Map missing-value placeholders to ``None``.

    A string cell containing ':' (the placeholder this table uses for a
    missing value) becomes ``None``. Every other cell is returned unchanged.

    Non-string cells (floats, ints, NaN, None) are passed through as-is.
    Without the type guard, ``':' in cell`` raises ``TypeError`` for them,
    which happens whenever a column has no placeholder and is therefore
    parsed as numeric.
    """
    if isinstance(cell, str) and ':' in cell:
        return None
    return cell


# Run transform_col over every cell, one column at a time, and write each
# cleaned column back under the same label.
for column_name, column_values in data.items():
    data[column_name] = column_values.apply(transform_col)

data.head()

Unnamed: 0,country_code,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AT,12.6,13.3,14.6,15.3,17.2,14.4,19.6,22.3,23.0,21.8
1,BA,,,,,,21.5,20.7,19.2,20.1,23.5
2,BE,20.7,22.9,24.6,23.9,24.3,28.8,29.6,25.5,30.6,28.3
3,BG,4.8,5.6,5.8,5.4,7.3,5.7,7.4,8.3,10.5,11.5
4,CY,7.3,10.5,10.6,12.7,11.7,12.2,12.7,14.8,17.1,20.1


In [15]:
# Write the cleaned table out; index=False keeps the row index out of the file.
data.to_csv(path_or_buf='data/cleaned/isoc_ec_esels_page_tabular.csv', index=False)