In [110]:
import pandas as pd
import functions

df = pd.read_csv("../data/raw/ecommerce_sales.csv")

In [111]:
# Remove all empty columns and rows (all NaN values)
df_cleaned = df.dropna(axis=1, how='all')
df_cleaned = df.dropna(how='all')

df_cleaned

Unnamed: 0,DATAFLOW,LAST UPDATE,freq,size_emp,nace_r2,indic_is,unit,geo,TIME_PERIOD,OBS_VALUE,OBS_FLAG,CONF_STATUS
0,ESTAT:ISOC_EC_ESELS(1.0),12/12/24 11:00:00,Annual,From 1 to 9 persons employed,"All activities (except agriculture, forestry a...",Enterprises with e-commerce sales of at least ...,Percentage of enterprises,Germany,2010,21.98,,
1,ESTAT:ISOC_EC_ESELS(1.0),12/12/24 11:00:00,Annual,From 1 to 9 persons employed,"All activities (except agriculture, forestry a...",Enterprises with e-commerce sales of at least ...,Percentage of enterprises,Germany,2011,,u,
2,ESTAT:ISOC_EC_ESELS(1.0),12/12/24 11:00:00,Annual,From 1 to 9 persons employed,"All activities (except agriculture, forestry a...",Enterprises with e-commerce sales of at least ...,Percentage of enterprises,Germany,2012,15.27,,
3,ESTAT:ISOC_EC_ESELS(1.0),12/12/24 11:00:00,Annual,From 1 to 9 persons employed,"All activities (except agriculture, forestry a...",Enterprises with e-commerce sales of at least ...,Percentage of enterprises,Germany,2013,16.89,,
4,ESTAT:ISOC_EC_ESELS(1.0),12/12/24 11:00:00,Annual,From 1 to 9 persons employed,"All activities (except agriculture, forestry a...",Enterprises with e-commerce sales of at least ...,Percentage of enterprises,Germany,2014,15.12,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2070,ESTAT:ISOC_EC_ESELS(1.0),12/12/24 11:00:00,Annual,250 persons employed or more,"All activities (except agriculture, forestry a...",Enterprises with e-commerce sales of at least ...,Percentage of enterprises,United Kingdom,2016,41.67,,
2071,ESTAT:ISOC_EC_ESELS(1.0),12/12/24 11:00:00,Annual,250 persons employed or more,"All activities (except agriculture, forestry a...",Enterprises with e-commerce sales of at least ...,Percentage of enterprises,United Kingdom,2017,42.64,,
2072,ESTAT:ISOC_EC_ESELS(1.0),12/12/24 11:00:00,Annual,250 persons employed or more,"All activities (except agriculture, forestry a...",Enterprises with e-commerce sales of at least ...,Percentage of enterprises,United Kingdom,2018,41.17,,
2073,ESTAT:ISOC_EC_ESELS(1.0),12/12/24 11:00:00,Annual,250 persons employed or more,"All activities (except agriculture, forestry a...",Enterprises with e-commerce sales of at least ...,Percentage of enterprises,United Kingdom,2019,45.04,,


In [112]:
# Clean Columns
# print(df_cleaned.columns)

df_cleaned = df_cleaned.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'nace_r2', 'indic_is',
       'unit', 'OBS_FLAG', 'CONF_STATUS'])

df_cleaned.rename(columns={"geo": "country", "TIME_PERIOD": "year", "OBS_VALUE": "sales"}, inplace=True)

df_cleaned


Unnamed: 0,size_emp,country,year,sales
0,From 1 to 9 persons employed,Germany,2010,21.98
1,From 1 to 9 persons employed,Germany,2011,
2,From 1 to 9 persons employed,Germany,2012,15.27
3,From 1 to 9 persons employed,Germany,2013,16.89
4,From 1 to 9 persons employed,Germany,2014,15.12
...,...,...,...,...
2070,250 persons employed or more,United Kingdom,2016,41.67
2071,250 persons employed or more,United Kingdom,2017,42.64
2072,250 persons employed or more,United Kingdom,2018,41.17
2073,250 persons employed or more,United Kingdom,2019,45.04


In [113]:
# Shortens the size_emp values
size_map = {
    "From 1 to 9 persons employed": "A(1-9)",
    "From 10 to 49 persons employed": "B(10-49)",
    "From 50 to 249 persons employed": "C(50-249)",
    "250 persons employed or more": "D(>=250)",
    "10 persons employed or more": "E(>=10)"
}
df_cleaned["size_emp"] = df["size_emp"].apply(lambda x: size_map[x])

In [114]:
# Filter only European Union
df_euro_all = df_cleaned[(df_cleaned['country'].str.contains("Euro area")) & (df_cleaned['size_emp'] == "E(>=10)")]
df_euro_all = df_euro_all.replace(to_replace='Euro area (EA11-1999, EA12-2001, EA13-2007, EA15-2008, EA16-2009, EA17-2011, EA18-2014, EA19-2015, EA20-2023)', value="Euro Area")
df_euro_all

Unnamed: 0,size_emp,country,year,sales
1165,E(>=10),Euro Area,2010,13.57
1166,E(>=10),Euro Area,2011,12.75
1167,E(>=10),Euro Area,2012,13.65
1168,E(>=10),Euro Area,2013,13.78
1169,E(>=10),Euro Area,2014,14.59
1170,E(>=10),Euro Area,2015,16.87
1171,E(>=10),Euro Area,2016,18.4
1172,E(>=10),Euro Area,2017,18.18
1173,E(>=10),Euro Area,2018,16.98
1174,E(>=10),Euro Area,2019,16.96


In [115]:
# Filter out "Euro area" and "European Union"
df_cleaned = df_cleaned[~df_cleaned['country'].str.contains("Euro")]

# df_cleaned["country"].unique() # Checks 

In [116]:
df_cleaned = functions.region_mapping(df_cleaned)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['region'] = df['country'].map(country_to_region)


In [117]:
# Creates two different dataframes
df_sales_cat = df_cleaned[(df_cleaned["size_emp"] != "E(>=10)") & (df_cleaned["size_emp"] != "A(1-9)")] # Dataframe with enterprises divided in four sizes categories
df_sales_all = df_cleaned[df_cleaned["size_emp"] == "E(>=10)"] # Dataframe with enterprises in one categoriy encompasing all 4 previous categories

In [118]:
# Reset indexes
df_sales_cat.reset_index(drop=True, inplace=True)

df_sales_all.reset_index(drop=True, inplace=True)
df_sales_all.drop(columns=["size_emp"], inplace=True) # Delete size column as there is only one size in DF

df_sales_all.head(5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sales_all.drop(columns=["size_emp"], inplace=True) # Delete size column as there is only one size in DF


Unnamed: 0,country,year,sales,region
0,Austria,2010,13.86,Western Europe
1,Austria,2011,11.1,Western Europe
2,Austria,2012,10.98,Western Europe
3,Austria,2013,12.58,Western Europe
4,Austria,2014,13.27,Western Europe


In [119]:
# create a dummy function to send to the exploration notebook
def import_df_ecommerce():
    return (
        df_sales_cat,
        df_sales_all,
        df_euro_all
        )