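# **Customer Data Transformation: Local CSV to Processed Data**

Loads the raw Olist customers CSV, removes rows with a duplicated `customer_id`, and saves the result as a Parquet file.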

#### **Import necessary libraries**

In [16]:
import pandas as pd
import os

from IPython.display import display


#### **Paths to files**

In [17]:
# Project root: the parent of the current working directory.
working_dir = os.getcwd()
base_path = os.path.dirname(working_dir)

# Input CSV (raw) and output Parquet (processed) locations under the project root.
raw_data_path = os.path.join(
    base_path,
    'raw-data',
    'olist_customers_dataset.csv',
)
processed_data_path = os.path.join(
    base_path,
    'processed-data',
    'olist_customers_dataset_transformed.parquet',
)

#### **Load the raw dataset**

In [20]:
# Read the raw customers CSV into memory, reporting progress before and after.
print("Loading the dataset...")
df_raw = pd.read_csv(filepath_or_buffer=raw_data_path)
print("Dataset loaded successfully!")

# NOTE(review): customer_zip_code_prefix is inferred as int64 here, so any
# leading zeros present in the CSV would be lost — confirm that is acceptable.

# Preview the first ten rows using the notebook's rich table rendering.
print("Sample of the raw dataset:")
raw_preview = df_raw.head(n=10)
display(raw_preview)


Loading the dataset...
Dataset loaded successfully!
Sample of the raw dataset:


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP
5,879864dab9bc3047522c92c82e1212b8,4c93744516667ad3b8f1fb645a3116a4,89254,jaragua do sul,SC
6,fd826e7cf63160e536e0908c76c3f441,addec96d2e059c80c30fe6871d30d177,4534,sao paulo,SP
7,5e274e7a0c3809e14aba7ad5aae0d407,57b2a98a409812fe9618067b6b8ebe4f,35182,timoteo,MG
8,5adf08e34b2e993982a47070956c5c65,1175e95fb47ddff9de6b2b06188f7e0d,81560,curitiba,PR
9,4b7139f34592b3a31687243a302fa75b,9afe194fb833f79e300e37e580171f22,30575,belo horizonte,MG


#### **Check for null values and dataset information**

In [23]:
# Structural overview: row count, column dtypes and non-null counts.
print("\nRaw dataset information:")
df_raw.info()

# Tally missing entries column by column (isna is the canonical name for isnull).
null_counts = df_raw.isna().sum()
print("\nCount of null values per column:")
print(null_counts)


Raw dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB

Count of null values per column:
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64


#### **Apply transformations to clean the dataset**

In [24]:
print("\nTransforming the data...")
# Keep only the first row seen for each customer_id; later repeats are discarded.
df_cleaned = df_raw.drop_duplicates(subset="customer_id", keep="first")

# Preview the first five rows of the de-duplicated frame.
print("Sample of the cleaned dataset:")
cleaned_preview = df_cleaned.head(5)
display(cleaned_preview)



Transforming the data...
Sample of the cleaned dataset:


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


#### **Save the cleaned dataset**

In [27]:
print("\nSaving the cleaned dataset...")

# Create the output folder first: writing the Parquet file fails when its
# parent directory is missing (e.g. a fresh checkout without 'processed-data').
output_dir = os.path.dirname(processed_data_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# index=False keeps the DataFrame index out of the Parquet file.
df_cleaned.to_parquet(processed_data_path, index=False, engine='pyarrow')

# Report the path exactly as written; it already ends in '.parquet', so no
# suffix rewriting is needed (and rewriting could misreport the location).
print(f"Cleaned dataset saved successfully at: {processed_data_path}")



Saving the cleaned dataset...
Cleaned dataset saved successfully at: c:\Users\Fernando Correia\Desktop\Olist Ecommerce Tese\processed-data\olist_customers_dataset_transformed.parquet
