# Importar las clases

In [1]:
from classes import Extract, Transform, Load, DatabaseConnection

In [2]:
import pandas as pd

# Importar la conexión a la base de datos y el loader para subir datos a nuestra DatawareHouse

In [3]:
db = DatabaseConnection()

In [4]:
db_url = "postgresql+psycopg2://root:root@localhost:5433/bodegaDeDatos"

In [5]:
Load.to_database.__doc__

'\n        Load the dataframe to the databse.\n\n        :param df: DataFrame to load the data.\n        :param table_name: Name of the table to load.\n        '

In [6]:
loader = Load(db_url)

In [7]:
conn = db.get_connection()

# Creamos un extractor a partir de nuestras clases; como es genérico, podemos usarlo varias veces

In [8]:
Extract.extract.__doc__

'Extracts specified columns from a table.\n\n        Args:\n            table_name (str): The name of the table.\n            columns (List[str]): A list of column names to extract.\n\n        Returns:\n            List[tuple]: A list of tuples containing the extracted data.\n        '

In [9]:
extractor = Extract(conn = conn)

# Extraer el género del cliente y traducirlo al español

In [109]:
h, data = extractor.extract('client', ['gender'])

In [110]:
df_gender =  pd.DataFrame(data, columns = h)

In [111]:
df_gender

Unnamed: 0,gender
0,Female
1,Male


In [112]:
transformer = Transform(df_gender)

In [113]:
df_gender = transformer.gender()

In [114]:
df_gender

Unnamed: 0,genero,description
0,Mujer,Persona del sexo femenino
1,Hombre,Persona del sexo masculino


In [17]:
loader.to_database(df_gender, "genero_dimension", index_name='id_genero')

Datos cargados exitosamente en la tabla genero_dimension


# Extraer la tarjeta desde la tabla de tarjeta

In [115]:
h, data = extractor.extract('card', ['card'])

In [116]:
df_card = pd.DataFrame(data, columns=h)

In [117]:
df_card

Unnamed: 0,card
0,diners-club-us-ca
1,instapayment
2,china-unionpay
3,bankcard
4,diners-club-carte-blanche
5,visa
6,diners-club-international
7,americanexpress
8,laser
9,maestro


In [22]:
loader.to_database(df_card, "card_dimension", index_name = "id_card")

Datos cargados exitosamente en la tabla card_dimension


# Extraer el país del cliente y traducirlo al español

In [118]:
h, data = extractor.extract('client', ['country'])

In [119]:
df_country = pd.DataFrame(data, columns=h)

In [120]:
df_country

Unnamed: 0,country
0,Afghanistan
1,Aland Islands
2,Albania
3,Algeria
4,American Samoa
...,...
209,Vietnam
210,Western Sahara
211,Yemen
212,Zambia


In [121]:
transformer_country = Transform(df_country)

In [122]:
df_country_translated = transformer_country.transform_country()

DataFrame original:
            country
0       Afghanistan
1     Aland Islands
2           Albania
3           Algeria
4    American Samoa
..              ...
209         Vietnam
210  Western Sahara
211           Yemen
212          Zambia
213        Zimbabwe

[214 rows x 1 columns]
Procesando país: Afghanistan
Datos obtenidos para Afghanistan: Afganistán
Procesando país: Aland Islands
No se encontraron datos para Aland Islands
Procesando país: Albania
Datos obtenidos para Albania: Albania
Procesando país: Algeria
Datos obtenidos para Algeria: Argelia
Procesando país: American Samoa
Datos obtenidos para American Samoa: Samoa Americana
Procesando país: Andorra
Datos obtenidos para Andorra: Andorra
Procesando país: Angola
Datos obtenidos para Angola: Angola
Procesando país: Anguilla
Datos obtenidos para Anguilla: Anguilla
Procesando país: Antigua and Barbuda
Datos obtenidos para Antigua and Barbuda: Antigua y Barbuda
Procesando país: Argentina
Datos obtenidos para Argentina: Argentina
Pr

In [29]:
loader.to_database(df_country, "country_dimension", "id_country")

Datos cargados exitosamente en la tabla country_dimension


# Extraer el producto desde la entidad de producto

In [123]:
h, data = extractor.extract('product', ['product'])

In [124]:
df_product = pd.DataFrame(data, columns=h)

In [125]:
df_product

Unnamed: 0,product
0,Marzipan 50/50
1,"Bread - Roll, Whole Wheat"
2,Cilantro / Coriander - Fresh
3,Sauce Tomato Pouch
4,Mahi Mahi
...,...
2016,"Crush - Orange, 355ml"
2017,Bread - Hot Dog Buns
2018,Cheese - Bakers Cream Cheese
2019,Wine - Riesling Dr. Pauly


In [33]:
loader.to_database(df_product, "product_dimension", "id_product")

Datos cargados exitosamente en la tabla product_dimension


# Extraer year, day, month

In [126]:
years_df, months_df, weekdays_df = extractor.extract_date_components("sale", "date_sale")

In [127]:
transformer_years = Transform(years_df)
transformer_months = Transform(months_df)
transformer_weekdays = Transform(weekdays_df)

transformed_years = transformer_years.transform_years()
transformed_months = transformer_months.transform_months()
transformed_weekdays = transformer_weekdays.transform_weekdays()

Transforming years DataFrame:
    year
0   2000
1   2001
2   2002
3   2003
4   2004
5   2005
6   2006
7   2007
8   2008
9   2009
10  2010
11  2011
12  2012
13  2013
14  2014
15  2015
16  2016
17  2017
18  2018
19  2019
20  2020
Transforming months DataFrame:
    month
0       1
1       2
2       3
3       4
4       5
5       6
6       7
7       8
8       9
9      10
10     11
11     12
Transformed months DataFrame:
    month_name
0        Enero
1      Febrero
2        Marzo
3        Abril
4         Mayo
5        Junio
6        Julio
7       Agosto
8   Septiembre
9      Octubre
10   Noviembre
11   Diciembre
Transforming weekdays DataFrame:
            date
0     2016-09-27
1     2013-02-11
2     2000-09-30
3     2010-10-11
4     2007-01-18
...          ...
4915  2020-07-08
4916  2012-08-22
4917  2019-10-15
4918  2007-10-21
4919  2015-06-13

[4920 rows x 1 columns]
Transformed weekdays DataFrame:
   weekday_name
8       Domingo
4        Jueves
1         Lunes
0        Martes
9     Miérco

In [36]:
loader.to_database(transformed_years, 'year_dimension', 'id_year')

Datos cargados exitosamente en la tabla year_dimension


In [37]:
loader.to_database(transformed_months, 'meses_dimension', 'id_meses')

Datos cargados exitosamente en la tabla meses_dimension


In [38]:
loader.to_database(transformed_weekdays, 'dias_dimension', 'id_dias')

Datos cargados exitosamente en la tabla dias_dimension


# Ej. 1

In [187]:
stats_df = extractor.extract_statistics("sale")

In [188]:
stats_df.head()

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid
0,Afghanistan,Female,Alize Gold Passion,31,2000,7,1,61454.52,61454.52,61454.52,,61454.52
1,Afghanistan,Female,Almonds Ground Blanched,7,2005,12,1,36863.95,36863.95,36863.95,,36863.95
2,Afghanistan,Female,Almonds Ground Blanched,28,2007,1,1,60404.92,60404.92,60404.92,,60404.92
3,Afghanistan,Female,Appetizer - Chicken Satay,2,2016,8,1,86835.0,86835.0,86835.0,,86835.0
4,Afghanistan,Female,Apple - Macintosh,2,2016,8,1,86835.0,86835.0,86835.0,,86835.0


In [189]:
facts_df = stats_df.copy()

In [190]:
facts_df

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid
0,Afghanistan,Female,Alize Gold Passion,31,2000,7,1,61454.52,61454.52,61454.52,,61454.520000000000
1,Afghanistan,Female,Almonds Ground Blanched,7,2005,12,1,36863.95,36863.95,36863.95,,36863.950000000000
2,Afghanistan,Female,Almonds Ground Blanched,28,2007,1,1,60404.92,60404.92,60404.92,,60404.920000000000
3,Afghanistan,Female,Appetizer - Chicken Satay,2,2016,8,1,86835.00,86835.00,86835.00,,86835.000000000000
4,Afghanistan,Female,Apple - Macintosh,2,2016,8,1,86835.00,86835.00,86835.00,,86835.000000000000
...,...,...,...,...,...,...,...,...,...,...,...,...
63591,Zimbabwe,Female,Sesame Seed Black,28,2011,7,2,154296.76,77148.38,77148.38,0,77148.380000000000
63592,Zimbabwe,Female,Sobe - Liz Blizz,22,2011,8,1,64537.12,64537.12,64537.12,,64537.120000000000
63593,Zimbabwe,Female,Spinach - Spinach Leaf,28,2011,7,1,77148.38,77148.38,77148.38,,77148.380000000000
63594,Zimbabwe,Female,Uniform Linen Charge,28,2011,7,1,77148.38,77148.38,77148.38,,77148.380000000000


In [191]:
facts_df['gender'] = facts_df['gender'].map({'Male':'Hombre', 'Female':'Mujer'})

In [192]:
facts_df

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid
0,Afghanistan,Mujer,Alize Gold Passion,31,2000,7,1,61454.52,61454.52,61454.52,,61454.520000000000
1,Afghanistan,Mujer,Almonds Ground Blanched,7,2005,12,1,36863.95,36863.95,36863.95,,36863.950000000000
2,Afghanistan,Mujer,Almonds Ground Blanched,28,2007,1,1,60404.92,60404.92,60404.92,,60404.920000000000
3,Afghanistan,Mujer,Appetizer - Chicken Satay,2,2016,8,1,86835.00,86835.00,86835.00,,86835.000000000000
4,Afghanistan,Mujer,Apple - Macintosh,2,2016,8,1,86835.00,86835.00,86835.00,,86835.000000000000
...,...,...,...,...,...,...,...,...,...,...,...,...
63591,Zimbabwe,Mujer,Sesame Seed Black,28,2011,7,2,154296.76,77148.38,77148.38,0,77148.380000000000
63592,Zimbabwe,Mujer,Sobe - Liz Blizz,22,2011,8,1,64537.12,64537.12,64537.12,,64537.120000000000
63593,Zimbabwe,Mujer,Spinach - Spinach Leaf,28,2011,7,1,77148.38,77148.38,77148.38,,77148.380000000000
63594,Zimbabwe,Mujer,Uniform Linen Charge,28,2011,7,1,77148.38,77148.38,77148.38,,77148.380000000000


In [193]:
dias_names = {
    0: "Lunes", 1: "Martes", 2: "Miércoles", 3: "Jueves", 4: "Viernes",
    5: "Sábado", 6: "Domingo"
}

In [194]:
facts_df["fecha_original"] = pd.to_datetime(facts_df[["year", "month", "day"]])

In [195]:
facts_df["fecha_original"] = pd.to_datetime(facts_df["fecha_original"], errors="coerce")

In [196]:
facts_df

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid,fecha_original
0,Afghanistan,Mujer,Alize Gold Passion,31,2000,7,1,61454.52,61454.52,61454.52,,61454.520000000000,2000-07-31
1,Afghanistan,Mujer,Almonds Ground Blanched,7,2005,12,1,36863.95,36863.95,36863.95,,36863.950000000000,2005-12-07
2,Afghanistan,Mujer,Almonds Ground Blanched,28,2007,1,1,60404.92,60404.92,60404.92,,60404.920000000000,2007-01-28
3,Afghanistan,Mujer,Appetizer - Chicken Satay,2,2016,8,1,86835.00,86835.00,86835.00,,86835.000000000000,2016-08-02
4,Afghanistan,Mujer,Apple - Macintosh,2,2016,8,1,86835.00,86835.00,86835.00,,86835.000000000000,2016-08-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63591,Zimbabwe,Mujer,Sesame Seed Black,28,2011,7,2,154296.76,77148.38,77148.38,0,77148.380000000000,2011-07-28
63592,Zimbabwe,Mujer,Sobe - Liz Blizz,22,2011,8,1,64537.12,64537.12,64537.12,,64537.120000000000,2011-08-22
63593,Zimbabwe,Mujer,Spinach - Spinach Leaf,28,2011,7,1,77148.38,77148.38,77148.38,,77148.380000000000,2011-07-28
63594,Zimbabwe,Mujer,Uniform Linen Charge,28,2011,7,1,77148.38,77148.38,77148.38,,77148.380000000000,2011-07-28


In [197]:
facts_df["day"] = facts_df["fecha_original"].apply(lambda x: x.weekday())

In [198]:
facts_df["day"] = facts_df["day"].map(dias_names)

In [199]:
facts_df

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid,fecha_original
0,Afghanistan,Mujer,Alize Gold Passion,Lunes,2000,7,1,61454.52,61454.52,61454.52,,61454.520000000000,2000-07-31
1,Afghanistan,Mujer,Almonds Ground Blanched,Miércoles,2005,12,1,36863.95,36863.95,36863.95,,36863.950000000000,2005-12-07
2,Afghanistan,Mujer,Almonds Ground Blanched,Domingo,2007,1,1,60404.92,60404.92,60404.92,,60404.920000000000,2007-01-28
3,Afghanistan,Mujer,Appetizer - Chicken Satay,Martes,2016,8,1,86835.00,86835.00,86835.00,,86835.000000000000,2016-08-02
4,Afghanistan,Mujer,Apple - Macintosh,Martes,2016,8,1,86835.00,86835.00,86835.00,,86835.000000000000,2016-08-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63591,Zimbabwe,Mujer,Sesame Seed Black,Jueves,2011,7,2,154296.76,77148.38,77148.38,0,77148.380000000000,2011-07-28
63592,Zimbabwe,Mujer,Sobe - Liz Blizz,Lunes,2011,8,1,64537.12,64537.12,64537.12,,64537.120000000000,2011-08-22
63593,Zimbabwe,Mujer,Spinach - Spinach Leaf,Jueves,2011,7,1,77148.38,77148.38,77148.38,,77148.380000000000,2011-07-28
63594,Zimbabwe,Mujer,Uniform Linen Charge,Jueves,2011,7,1,77148.38,77148.38,77148.38,,77148.380000000000,2011-07-28


In [200]:
month_names = {
            1: "Enero", 2: "Febrero", 3: "Marzo", 4: "Abril", 5: "Mayo", 6: "Junio",
            7: "Julio", 8: "Agosto", 9: "Septiembre", 10: "Octubre", 11: "Noviembre", 12: "Diciembre"
        }

In [201]:
facts_df['month'] = facts_df['month'].map(month_names)

In [202]:
facts_df

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid,fecha_original
0,Afghanistan,Mujer,Alize Gold Passion,Lunes,2000,Julio,1,61454.52,61454.52,61454.52,,61454.520000000000,2000-07-31
1,Afghanistan,Mujer,Almonds Ground Blanched,Miércoles,2005,Diciembre,1,36863.95,36863.95,36863.95,,36863.950000000000,2005-12-07
2,Afghanistan,Mujer,Almonds Ground Blanched,Domingo,2007,Enero,1,60404.92,60404.92,60404.92,,60404.920000000000,2007-01-28
3,Afghanistan,Mujer,Appetizer - Chicken Satay,Martes,2016,Agosto,1,86835.00,86835.00,86835.00,,86835.000000000000,2016-08-02
4,Afghanistan,Mujer,Apple - Macintosh,Martes,2016,Agosto,1,86835.00,86835.00,86835.00,,86835.000000000000,2016-08-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63591,Zimbabwe,Mujer,Sesame Seed Black,Jueves,2011,Julio,2,154296.76,77148.38,77148.38,0,77148.380000000000,2011-07-28
63592,Zimbabwe,Mujer,Sobe - Liz Blizz,Lunes,2011,Agosto,1,64537.12,64537.12,64537.12,,64537.120000000000,2011-08-22
63593,Zimbabwe,Mujer,Spinach - Spinach Leaf,Jueves,2011,Julio,1,77148.38,77148.38,77148.38,,77148.380000000000,2011-07-28
63594,Zimbabwe,Mujer,Uniform Linen Charge,Jueves,2011,Julio,1,77148.38,77148.38,77148.38,,77148.380000000000,2011-07-28


In [96]:
unique_paises = df_country['country'].unique()

In [98]:
api_helper = Transform(df_country)

In [99]:
reverse_country = {}

In [102]:
for pais in unique_paises:
    data = api_helper.get_country_info(pais)
    if data:
        english_name = data.get('name', {}).get('common', pais)
        reverse_country[pais] = english_name
    else:
        reverse_country[pais] = pais

Datos obtenidos para Afghanistan: Afganistán
No se encontraron datos para Aland Islands
Datos obtenidos para Albania: Albania
Datos obtenidos para Algeria: Argelia
Datos obtenidos para American Samoa: Samoa Americana
Datos obtenidos para Andorra: Andorra
Datos obtenidos para Angola: Angola
Datos obtenidos para Anguilla: Anguilla
Datos obtenidos para Antigua and Barbuda: Antigua y Barbuda
Datos obtenidos para Argentina: Argentina
Datos obtenidos para Armenia: Armenia
Datos obtenidos para Aruba: Aruba
Datos obtenidos para Australia: Australia
Datos obtenidos para Austria: Austria
Datos obtenidos para Azerbaijan: Azerbaiyán
Datos obtenidos para Bahamas: Bahamas
Datos obtenidos para Bahrain: Bahrein
Datos obtenidos para Bangladesh: Bangladesh
Datos obtenidos para Barbados: Barbados
Datos obtenidos para Belarus: Bielorrusia
Datos obtenidos para Belgium: Bélgica
Datos obtenidos para Belize: Belice
Datos obtenidos para Benin: Benín
Datos obtenidos para Bermuda: Bermudas
Datos obtenidos para B

In [203]:
facts_df['country'] = facts_df['country'].map(reverse_country)

In [204]:
facts_df

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid,fecha_original
0,Afghanistan,Mujer,Alize Gold Passion,Lunes,2000,Julio,1,61454.52,61454.52,61454.52,,61454.520000000000,2000-07-31
1,Afghanistan,Mujer,Almonds Ground Blanched,Miércoles,2005,Diciembre,1,36863.95,36863.95,36863.95,,36863.950000000000,2005-12-07
2,Afghanistan,Mujer,Almonds Ground Blanched,Domingo,2007,Enero,1,60404.92,60404.92,60404.92,,60404.920000000000,2007-01-28
3,Afghanistan,Mujer,Appetizer - Chicken Satay,Martes,2016,Agosto,1,86835.00,86835.00,86835.00,,86835.000000000000,2016-08-02
4,Afghanistan,Mujer,Apple - Macintosh,Martes,2016,Agosto,1,86835.00,86835.00,86835.00,,86835.000000000000,2016-08-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63591,Zimbabwe,Mujer,Sesame Seed Black,Jueves,2011,Julio,2,154296.76,77148.38,77148.38,0,77148.380000000000,2011-07-28
63592,Zimbabwe,Mujer,Sobe - Liz Blizz,Lunes,2011,Agosto,1,64537.12,64537.12,64537.12,,64537.120000000000,2011-08-22
63593,Zimbabwe,Mujer,Spinach - Spinach Leaf,Jueves,2011,Julio,1,77148.38,77148.38,77148.38,,77148.380000000000,2011-07-28
63594,Zimbabwe,Mujer,Uniform Linen Charge,Jueves,2011,Julio,1,77148.38,77148.38,77148.38,,77148.380000000000,2011-07-28


In [205]:
facts_df.drop(columns=["fecha_original"], inplace=True)

In [206]:
facts_df

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid
0,Afghanistan,Mujer,Alize Gold Passion,Lunes,2000,Julio,1,61454.52,61454.52,61454.52,,61454.520000000000
1,Afghanistan,Mujer,Almonds Ground Blanched,Miércoles,2005,Diciembre,1,36863.95,36863.95,36863.95,,36863.950000000000
2,Afghanistan,Mujer,Almonds Ground Blanched,Domingo,2007,Enero,1,60404.92,60404.92,60404.92,,60404.920000000000
3,Afghanistan,Mujer,Appetizer - Chicken Satay,Martes,2016,Agosto,1,86835.00,86835.00,86835.00,,86835.000000000000
4,Afghanistan,Mujer,Apple - Macintosh,Martes,2016,Agosto,1,86835.00,86835.00,86835.00,,86835.000000000000
...,...,...,...,...,...,...,...,...,...,...,...,...
63591,Zimbabwe,Mujer,Sesame Seed Black,Jueves,2011,Julio,2,154296.76,77148.38,77148.38,0,77148.380000000000
63592,Zimbabwe,Mujer,Sobe - Liz Blizz,Lunes,2011,Agosto,1,64537.12,64537.12,64537.12,,64537.120000000000
63593,Zimbabwe,Mujer,Spinach - Spinach Leaf,Jueves,2011,Julio,1,77148.38,77148.38,77148.38,,77148.380000000000
63594,Zimbabwe,Mujer,Uniform Linen Charge,Jueves,2011,Julio,1,77148.38,77148.38,77148.38,,77148.380000000000


In [207]:
facts_df['country'] = facts_df['country'].map(dict(zip(df_country['country'], df_country.index)))

In [208]:
facts_df['country'] = facts_df['country'].fillna(-1).astype(int)

In [209]:
facts_df

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid
0,0,Mujer,Alize Gold Passion,Lunes,2000,Julio,1,61454.52,61454.52,61454.52,,61454.520000000000
1,0,Mujer,Almonds Ground Blanched,Miércoles,2005,Diciembre,1,36863.95,36863.95,36863.95,,36863.950000000000
2,0,Mujer,Almonds Ground Blanched,Domingo,2007,Enero,1,60404.92,60404.92,60404.92,,60404.920000000000
3,0,Mujer,Appetizer - Chicken Satay,Martes,2016,Agosto,1,86835.00,86835.00,86835.00,,86835.000000000000
4,0,Mujer,Apple - Macintosh,Martes,2016,Agosto,1,86835.00,86835.00,86835.00,,86835.000000000000
...,...,...,...,...,...,...,...,...,...,...,...,...
63591,213,Mujer,Sesame Seed Black,Jueves,2011,Julio,2,154296.76,77148.38,77148.38,0,77148.380000000000
63592,213,Mujer,Sobe - Liz Blizz,Lunes,2011,Agosto,1,64537.12,64537.12,64537.12,,64537.120000000000
63593,213,Mujer,Spinach - Spinach Leaf,Jueves,2011,Julio,1,77148.38,77148.38,77148.38,,77148.380000000000
63594,213,Mujer,Uniform Linen Charge,Jueves,2011,Julio,1,77148.38,77148.38,77148.38,,77148.380000000000


In [210]:
facts_df['gender'] = facts_df['gender'].map(dict(zip(df_gender['genero'], df_gender.index)))
facts_df['gender'] = facts_df['gender'].fillna(-1).astype(int)

In [211]:
facts_df

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid
0,0,0,Alize Gold Passion,Lunes,2000,Julio,1,61454.52,61454.52,61454.52,,61454.520000000000
1,0,0,Almonds Ground Blanched,Miércoles,2005,Diciembre,1,36863.95,36863.95,36863.95,,36863.950000000000
2,0,0,Almonds Ground Blanched,Domingo,2007,Enero,1,60404.92,60404.92,60404.92,,60404.920000000000
3,0,0,Appetizer - Chicken Satay,Martes,2016,Agosto,1,86835.00,86835.00,86835.00,,86835.000000000000
4,0,0,Apple - Macintosh,Martes,2016,Agosto,1,86835.00,86835.00,86835.00,,86835.000000000000
...,...,...,...,...,...,...,...,...,...,...,...,...
63591,213,0,Sesame Seed Black,Jueves,2011,Julio,2,154296.76,77148.38,77148.38,0,77148.380000000000
63592,213,0,Sobe - Liz Blizz,Lunes,2011,Agosto,1,64537.12,64537.12,64537.12,,64537.120000000000
63593,213,0,Spinach - Spinach Leaf,Jueves,2011,Julio,1,77148.38,77148.38,77148.38,,77148.380000000000
63594,213,0,Uniform Linen Charge,Jueves,2011,Julio,1,77148.38,77148.38,77148.38,,77148.380000000000


In [212]:
facts_df['day'] = facts_df['day'].map(dict(zip(transformed_weekdays['weekday_name'], transformed_weekdays.index)))
facts_df['day'] = facts_df['day'].fillna(-1).astype(int)

In [213]:
facts_df

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid
0,0,0,Alize Gold Passion,1,2000,Julio,1,61454.52,61454.52,61454.52,,61454.520000000000
1,0,0,Almonds Ground Blanched,9,2005,Diciembre,1,36863.95,36863.95,36863.95,,36863.950000000000
2,0,0,Almonds Ground Blanched,8,2007,Enero,1,60404.92,60404.92,60404.92,,60404.920000000000
3,0,0,Appetizer - Chicken Satay,0,2016,Agosto,1,86835.00,86835.00,86835.00,,86835.000000000000
4,0,0,Apple - Macintosh,0,2016,Agosto,1,86835.00,86835.00,86835.00,,86835.000000000000
...,...,...,...,...,...,...,...,...,...,...,...,...
63591,213,0,Sesame Seed Black,4,2011,Julio,2,154296.76,77148.38,77148.38,0,77148.380000000000
63592,213,0,Sobe - Liz Blizz,1,2011,Agosto,1,64537.12,64537.12,64537.12,,64537.120000000000
63593,213,0,Spinach - Spinach Leaf,4,2011,Julio,1,77148.38,77148.38,77148.38,,77148.380000000000
63594,213,0,Uniform Linen Charge,4,2011,Julio,1,77148.38,77148.38,77148.38,,77148.380000000000


In [214]:
facts_df['year'] = facts_df['year'].map(dict(zip(transformed_years['year'], transformed_years.index)))
facts_df['year'] = facts_df['year'].fillna(-1).astype(int)

In [215]:
facts_df

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid
0,0,0,Alize Gold Passion,1,0,Julio,1,61454.52,61454.52,61454.52,,61454.520000000000
1,0,0,Almonds Ground Blanched,9,5,Diciembre,1,36863.95,36863.95,36863.95,,36863.950000000000
2,0,0,Almonds Ground Blanched,8,7,Enero,1,60404.92,60404.92,60404.92,,60404.920000000000
3,0,0,Appetizer - Chicken Satay,0,16,Agosto,1,86835.00,86835.00,86835.00,,86835.000000000000
4,0,0,Apple - Macintosh,0,16,Agosto,1,86835.00,86835.00,86835.00,,86835.000000000000
...,...,...,...,...,...,...,...,...,...,...,...,...
63591,213,0,Sesame Seed Black,4,11,Julio,2,154296.76,77148.38,77148.38,0,77148.380000000000
63592,213,0,Sobe - Liz Blizz,1,11,Agosto,1,64537.12,64537.12,64537.12,,64537.120000000000
63593,213,0,Spinach - Spinach Leaf,4,11,Julio,1,77148.38,77148.38,77148.38,,77148.380000000000
63594,213,0,Uniform Linen Charge,4,11,Julio,1,77148.38,77148.38,77148.38,,77148.380000000000


In [216]:
facts_df['month'] = facts_df['month'].map(dict(zip(transformed_months['month_name'], transformed_months.index)))
facts_df['month'] = facts_df['month'].fillna(-1).astype(int)

In [217]:
facts_df

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid
0,0,0,Alize Gold Passion,1,0,6,1,61454.52,61454.52,61454.52,,61454.520000000000
1,0,0,Almonds Ground Blanched,9,5,11,1,36863.95,36863.95,36863.95,,36863.950000000000
2,0,0,Almonds Ground Blanched,8,7,0,1,60404.92,60404.92,60404.92,,60404.920000000000
3,0,0,Appetizer - Chicken Satay,0,16,7,1,86835.00,86835.00,86835.00,,86835.000000000000
4,0,0,Apple - Macintosh,0,16,7,1,86835.00,86835.00,86835.00,,86835.000000000000
...,...,...,...,...,...,...,...,...,...,...,...,...
63591,213,0,Sesame Seed Black,4,11,6,2,154296.76,77148.38,77148.38,0,77148.380000000000
63592,213,0,Sobe - Liz Blizz,1,11,7,1,64537.12,64537.12,64537.12,,64537.120000000000
63593,213,0,Spinach - Spinach Leaf,4,11,6,1,77148.38,77148.38,77148.38,,77148.380000000000
63594,213,0,Uniform Linen Charge,4,11,6,1,77148.38,77148.38,77148.38,,77148.380000000000


In [218]:
facts_df['product'] = facts_df['product'].map(dict(zip(df_product['product'], df_product.index)))
facts_df['product'] = facts_df['product'].fillna(-1).astype(int)

In [219]:
facts_df

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid
0,0,0,102,1,0,6,1,61454.52,61454.52,61454.52,,61454.520000000000
1,0,0,1355,9,5,11,1,36863.95,36863.95,36863.95,,36863.950000000000
2,0,0,1355,8,7,0,1,60404.92,60404.92,60404.92,,60404.920000000000
3,0,0,446,0,16,7,1,86835.00,86835.00,86835.00,,86835.000000000000
4,0,0,1235,0,16,7,1,86835.00,86835.00,86835.00,,86835.000000000000
...,...,...,...,...,...,...,...,...,...,...,...,...
63591,213,0,338,4,11,6,2,154296.76,77148.38,77148.38,0,77148.380000000000
63592,213,0,948,1,11,7,1,64537.12,64537.12,64537.12,,64537.120000000000
63593,213,0,1814,4,11,6,1,77148.38,77148.38,77148.38,,77148.380000000000
63594,213,0,1722,4,11,6,1,77148.38,77148.38,77148.38,,77148.380000000000


In [221]:
facts_df.head()

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid
0,0,0,102,1,0,6,1,61454.52,61454.52,61454.52,,61454.52
1,0,0,1355,9,5,11,1,36863.95,36863.95,36863.95,,36863.95
2,0,0,1355,8,7,0,1,60404.92,60404.92,60404.92,,60404.92
3,0,0,446,0,16,7,1,86835.0,86835.0,86835.0,,86835.0
4,0,0,1235,0,16,7,1,86835.0,86835.0,86835.0,,86835.0


In [222]:
facts_df.tail()

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid
63591,213,0,338,4,11,6,2,154296.76,77148.38,77148.38,0.0,77148.38
63592,213,0,948,1,11,7,1,64537.12,64537.12,64537.12,,64537.12
63593,213,0,1814,4,11,6,1,77148.38,77148.38,77148.38,,77148.38
63594,213,0,1722,4,11,6,1,77148.38,77148.38,77148.38,,77148.38
63595,213,0,1023,1,1,10,1,11279.5,11279.5,11279.5,,11279.5


In [223]:
facts_df.sample(10)

Unnamed: 0,country,gender,product,day,year,month,count_sale_paid,sum_sale_paid,min_sale_paid,max_sale_paid,std_sale_paid,mean_sale_paid
49721,158,1,675,1,17,5,1,34265.7,34265.7,34265.7,,34265.7
18621,44,0,853,0,0,3,1,59647.07,59647.07,59647.07,,59647.07
20371,-1,0,767,8,4,11,1,61137.54,61137.54,61137.54,,61137.54
37959,130,0,1922,9,17,0,1,73491.83,73491.83,73491.83,,73491.83
437,2,1,1029,0,17,9,1,31133.22,31133.22,31133.22,,31133.22
35377,98,0,207,8,17,10,1,69892.44,69892.44,69892.44,,69892.44
41647,154,0,1380,8,9,10,1,74596.96,74596.96,74596.96,,74596.96
42757,155,0,151,8,0,5,1,26035.48,26035.48,26035.48,,26035.48
52197,162,1,485,0,19,2,1,39895.16,39895.16,39895.16,,39895.16
34389,96,0,30,2,18,1,1,55444.77,55444.77,55444.77,,55444.77
