# **Sellers Data Transformation: Local CSV to Processed Data**

#### **Import necessary libraries**

In [1]:
import pandas as pd
import os

from IPython.display import display

#### **Paths to files**

In [2]:
# Parent of the current working directory (os.path.split(p)[0] is dirname(p)).
# Presumably the project root when the notebook runs from its own sub-folder — confirm.
base_path = os.path.split(os.getcwd())[0]

# Input: raw sellers CSV under <base_path>/raw-data.
raw_data_path = os.path.join(
    os.path.join(base_path, 'raw-data'),
    'olist_sellers_dataset.csv',
)

# Output: processed sellers Parquet under <base_path>/processed-data.
processed_data_path = os.path.join(
    os.path.join(base_path, 'processed-data'),
    'olist_sellers_dataset.parquet',
)

#### **Load the raw dataset**

In [3]:
# Read the raw sellers CSV into memory, bracketing the read with progress messages.
# NOTE(review): pandas infers seller_zip_code_prefix as an integer here, so any
# leading zeros present in the CSV would be dropped — confirm whether downstream
# joins expect integers or zero-padded strings before changing the dtype.
print("Loading the dataset...")
df_raw = pd.read_csv(filepath_or_buffer=raw_data_path)
print("Dataset loaded successfully!")

# Preview the first ten rows using the notebook's rich table rendering.
print("Sample of the raw dataset:")
raw_preview = df_raw.head(n=10)
display(raw_preview)

Loading the dataset...
Dataset loaded successfully!
Sample of the raw dataset:


Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP
5,c240c4061717ac1806ae6ee72be3533b,20920,rio de janeiro,RJ
6,e49c26c3edfa46d227d5121a6b6e4d37,55325,brejao,PE
7,1b938a7ec6ac5061a66a3766e0e75f90,16304,penapolis,SP
8,768a86e36ad6aae3d03ee3c6433d61df,1529,sao paulo,SP
9,ccc4bbb5f32a6ab2b7066a4130f114e3,80310,curitiba,PR


#### **Check for null values and dataset information**

In [4]:
# Structural overview: column names, non-null counts, dtypes and memory usage.
print("\nRaw dataset information:")
df_raw.info()

# Missing values per column (isna is the pandas alias of isnull).
print("\nCount of null values per column:")
null_counts = df_raw.isna().sum()
print(null_counts)


Raw dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64(1), object(3)
memory usage: 96.8+ KB

Count of null values per column:
seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64


#### **Apply transformations to clean the dataset**

In [5]:
print("\nTransforming the data...")
# Keep a single row per seller_id; the first occurrence wins (pandas default, spelled out).
df_cleaned = df_raw.drop_duplicates(subset=["seller_id"], keep="first")

# Preview the de-duplicated frame (first five rows).
print("Sample of the transformed dataset:")
display(df_cleaned.head(5))

# Column/dtype summary of the cleaned frame.
df_cleaned.info()

# Report how many rows survived de-duplication.
cleaned_row_count = len(df_cleaned)
print(f"Total rows in the cleaned dataset: {cleaned_row_count}")


Transforming the data...
Sample of the transformed dataset:


Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64(1), object(3)
memory usage: 96.8+ KB
Total rows in the cleaned dataset: 3095


#### **Save the cleaned dataset**

In [6]:
# Persist the cleaned sellers frame as Parquet (pyarrow engine, index not written).
print("\nSaving the cleaned dataset...")

# Create the output folder if it is missing (e.g. on a fresh checkout); writing
# the Parquet file does not create parent directories and would otherwise fail.
os.makedirs(os.path.dirname(processed_data_path), exist_ok=True)

df_cleaned.to_parquet(processed_data_path, index=False, engine='pyarrow')

# Report the exact path that was written. The previous message applied
# .replace('.csv', '.parquet') to a path that already ends in .parquet, which
# could only ever misreport the location (if a folder name contained '.csv').
print(f"Cleaned dataset saved successfully at: {processed_data_path}")


Saving the cleaned dataset...
Cleaned dataset saved successfully at: c:\Users\Fernando Correia\Desktop\Olist Ecommerce Tese\processed-data\olist_sellers_dataset.parquet
