In [1]:
# Importing the necessary libraries
from pandas import DataFrame
from sqlalchemy import create_engine
from typing import Dict
from pathlib import Path
import pandas as pd

from sqlalchemy.engine.base import Engine
from src.transform import QueryEnum
from src import config
from src.transform import run_queries
from src.extract import extract
from src.load import load
from src.plots import (
    plot_freight_value_weight_relationship,
    plot_global_amount_order_status,
    plot_real_vs_predicted_delivered_time,
    plot_revenue_by_month_year,
    plot_revenue_per_state,
    plot_top_10_least_revenue_categories,
    plot_top_10_revenue_categories,
    plot_top_10_revenue_categories_ammount,
    plot_delivery_date_difference,
    plot_order_amount_per_day_with_holidays,
)

# Create the SQLite database file at the configured path if it is not there yet
# (Path.touch leaves the contents of an existing file alone).
Path(config.SQLITE_BD_ABSOLUTE_PATH).touch()

# Create the database connection (echo=False keeps SQLAlchemy from logging SQL).
ENGINE = create_engine(rf"sqlite:///{config.SQLITE_BD_ABSOLUTE_PATH}", echo=False)

# Inputs for the extract step, both read from the project config.
csv_folder = config.DATASET_ROOT_PATH
public_holidays_url = config.PUBLIC_HOLIDAYS_URL

# 1. Get the mapping of the csv files to the table names.
csv_table_mapping = config.get_csv_to_table_mapping()

# 2. Extract the data from the csv files, holidays and load them into the dataframes.
csv_dataframes = extract(csv_folder, csv_table_mapping, public_holidays_url)

# 3. Load the extracted dataframes into the database behind ENGINE.
load(data_frames=csv_dataframes, database=ENGINE)

# 4. Run the project's queries against that database and keep the results.
#    Annotated as str -> DataFrame; presumably keyed by query name
#    (see QueryEnum) -- TODO confirm against src.transform.
query_results: Dict[str, DataFrame] = run_queries(database=ENGINE)

All dataframes have been loaded into the database.


In [2]:
# Open a connection to the SQLite database file named in the project config.
engine = create_engine(rf"sqlite:///{config.SQLITE_BD_ABSOLUTE_PATH}", echo=False)


def _read_table(table_name):
    """Read the whole table `table_name` through `engine` into a DataFrame."""
    return pd.read_sql_table(table_name, con=engine)


# Load each database table into its own pandas DataFrame.
customers_df = _read_table('olist_customers')
geolocation_df = _read_table('olist_geolocation')
order_items_df = _read_table('olist_order_items')
order_payments_df = _read_table('olist_order_payments')
order_reviews_df = _read_table('olist_order_reviews')
orders_df = _read_table('olist_orders')
products_df = _read_table('olist_products')
sellers_df = _read_table('olist_sellers')
product_category_name_translation_df = _read_table('product_category_name_translation')
public_holidays_df = _read_table('public_holidays')

In [10]:
import pandas as pd
from sqlalchemy import create_engine

# Open a connection to the SQLite database file named in the project config.
engine = create_engine(f"sqlite:///{config.SQLITE_BD_ABSOLUTE_PATH}", echo=False)

# Short name -> database table name, listed in the order the tables are read
# and later written to the text file.
table_name_by_key = {
    'customers': 'olist_customers',
    'geolocation': 'olist_geolocation',
    'order_items': 'olist_order_items',
    'order_payments': 'olist_order_payments',
    'order_reviews': 'olist_order_reviews',
    'orders': 'olist_orders',
    'products': 'olist_products',
    'sellers': 'olist_sellers',
    'product_category_name_translation': 'product_category_name_translation',
    'public_holidays': 'public_holidays',
}

# Read every table in full into a pandas DataFrame, keyed by its short name.
dataframes = {
    key: pd.read_sql_table(table_name, con=engine)
    for key, table_name in table_name_by_key.items()
}

# Path of the TXT file that receives the preview.
output_file_path = 'top_10_rows_combined.txt'

# Write the first 10 rows of each DataFrame into that single TXT file:
# the short name on its own line, the rows rendered without the index,
# then a blank line before the next section.
with open(output_file_path, mode='w', encoding='utf-8') as file:
    for df_name, df in dataframes.items():
        preview = df.head(10).to_string(index=False)
        file.write(f"{df_name}\n{preview}\n\n")

print(f"Las primeras 10 filas de cada DataFrame se han guardado en {output_file_path}")

Las primeras 10 filas de cada DataFrame se han guardado en top_10_rows_combined.txt


In [3]:
# Print a heading, then leave orders_df as the cell's last expression so the
# notebook renders it as the cell output.
print("\nOrders dataset:")
orders_df


Orders dataset:


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00
...,...,...,...,...,...,...,...,...
99436,9c5dedf39a927c1b2549525ed64a053c,39bd1228ee8140590ac3aca26f2dfe00,delivered,2017-03-09 09:54:05,2017-03-09 09:54:05,2017-03-10 11:18:03,2017-03-17 15:08:01,2017-03-28 00:00:00
99437,63943bddc261676b46f01ca7ac2f7bd8,1fca14ff2861355f6e5f14306ff977a7,delivered,2018-02-06 12:58:58,2018-02-06 13:10:37,2018-02-07 23:22:42,2018-02-28 17:37:56,2018-03-02 00:00:00
99438,83c1379a015df1e13d02aae0204711ab,1aa71eb042121263aafbe80c1b562c9c,delivered,2017-08-27 14:46:43,2017-08-27 15:04:16,2017-08-28 20:52:26,2017-09-21 11:24:17,2017-09-27 00:00:00
99439,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,delivered,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-25 23:32:54,2018-02-15 00:00:00


In [4]:
# Print a heading, then leave order_items_df as the cell's last expression so
# the notebook renders it as the cell output.
print("\nOrder Items dataset:")
order_items_df


Order Items dataset:


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.90,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.90,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.00,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.90,18.14
...,...,...,...,...,...,...,...
112645,fffc94f6ce00a00581880bf54a75a037,1,4aa6014eceb682077f9dc4bffebc05b0,b8bc237ba3788b23da09c0f1f3a3288c,2018-05-02 04:11:01,299.99,43.41
112646,fffcd46ef2263f404302a634eb57f7eb,1,32e07fd915822b0765e448c4dd74c828,f3c38ab652836d21de61fb8314b69182,2018-07-20 04:31:48,350.00,36.53
112647,fffce4705a9662cd70adb13d4a31832d,1,72a30483855e2eafc67aee5dc2560482,c3cfdc648177fdbbbb35635a37472c53,2017-10-30 17:14:25,99.90,16.95
112648,fffe18544ffabc95dfada21779c9644f,1,9c422a519119dcad7575db5af1ba540e,2b3e4a2a3ea8e01938cabda2a3e5cc79,2017-08-21 00:04:32,55.99,8.72


In [5]:
# Get the unique values of the order_status column.
unique_order_status = orders_df['order_status'].unique()

# Print a heading, then show the unique values as the cell's last expression.
print("Valores únicos en la columna order_status:")
unique_order_status

Valores únicos en la columna order_status:


array(['delivered', 'invoiced', 'shipped', 'processing', 'unavailable',
       'canceled', 'created', 'approved'], dtype=object)