# **Geolocation Data Transformation: Local CSV to Processed Data**

#### **Import necessary libraries**

In [1]:
import pandas as pd
import os

from IPython.display import display

#### **Paths to files**

In [2]:
# Parent of the current working directory; both data folders below are
# resolved against it (assumes the notebook runs one level below the project
# root -- TODO confirm if the notebook is moved).
base_path = os.path.split(os.getcwd())[0]

# Input CSV and output Parquet locations
raw_data_path = os.path.join(base_path, 'raw-data', 'olist_geolocation_dataset.csv')
processed_data_path = os.path.join(base_path, 'processed-data', 'geolocation.parquet')

#### **Load the raw dataset**

In [3]:
# Read the raw geolocation CSV into a DataFrame
print("Loading the dataset...")
df_raw = pd.read_csv(raw_data_path)
print("Dataset loaded successfully!")

# Preview the first ten rows using the notebook's rich renderer
print("Sample of the raw dataset:")
display(df_raw.iloc[:10])

Loading the dataset...
Dataset loaded successfully!
Sample of the raw dataset:


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP
5,1012,-23.547762,-46.635361,são paulo,SP
6,1047,-23.546273,-46.641225,sao paulo,SP
7,1013,-23.546923,-46.634264,sao paulo,SP
8,1029,-23.543769,-46.634278,sao paulo,SP
9,1011,-23.54764,-46.636032,sao paulo,SP


#### **Check for null values and dataset information**

In [4]:
# Schema overview: column dtypes, non-null counts and memory usage
print("\nRaw dataset information:")
df_raw.info()

# Number of missing values in each column
print("\nCount of null values per column:")
null_counts = df_raw.isna().sum()
print(null_counts)


Raw dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  int64  
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-null  float64
 3   geolocation_city             1000163 non-null  object 
 4   geolocation_state            1000163 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 38.2+ MB

Count of null values per column:
geolocation_zip_code_prefix    0
geolocation_lat                0
geolocation_lng                0
geolocation_city               0
geolocation_state              0
dtype: int64


#### **Apply transformations to clean the dataset**

In [5]:
print("\nTransforming the data...")
# Keep only the first row seen for each (zip prefix, latitude, longitude)
# combination; city and state are not part of the duplicate key.
df_cleaned = df_raw.drop_duplicates(
    subset=["geolocation_zip_code_prefix", "geolocation_lat", "geolocation_lng"],
    keep="first",
)

# Preview the first five rows of the de-duplicated frame
print("Sample of the cleaned dataset:")
display(df_cleaned.head(5))



Transforming the data...
Sample of the cleaned dataset:


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


#### **Save the cleaned dataset**

In [6]:
print("\nSaving the cleaned dataset...")
# Create the output folder if it is missing (e.g. on a fresh checkout);
# to_parquet does not create parent directories and would fail otherwise.
output_dir = os.path.dirname(processed_data_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write without the DataFrame index: after de-duplication it has gaps and
# carries no information.
df_cleaned.to_parquet(processed_data_path, index=False, engine='pyarrow')

# Report exactly the path that was written. The former '.csv' -> '.parquet'
# substitution was a leftover that could misreport the location if any
# folder name contained '.csv'.
print(f"Cleaned dataset saved successfully at: {processed_data_path}")


Saving the cleaned dataset...
Cleaned dataset saved successfully at: c:\Users\Fernando Correia\Desktop\Olist Ecommerce Tese\processed-data\geolocation.parquet
