# Data Preprocessing Notebook
This notebook focuses on cleaning, preprocessing, and preparing the dataset for modeling.   
Additionally, exploratory data analysis (EDA) is performed using Pandas.

### Initialize Spark

In [None]:
# Bare expression: evaluates `spark` so its repr is shown as the cell output.
# `spark` is not defined in any visible cell of this notebook -- presumably a
# SparkSession pre-created by the kernel/environment; TODO confirm, otherwise
# this cell raises NameError on a fresh kernel.
spark

### Import libraries

In [None]:
import os

import pandas as pd

from src.data_preprocessing import DataPreprocessor
from src.historical_data_viz import HistoricalDataVisualizer

### Load data

In [None]:
# Spark passes path strings to the Hadoop filesystem layer, which does not
# perform shell-style "~" expansion: a literal "~/..." would be resolved as a
# directory named "~" relative to the working directory. Resolve the home
# directory in Python first so the raw JSON files are actually found.
raw_data_dir = os.path.expanduser('~/ifood_case/data')

# Raw inputs, one Spark DataFrame per JSON file.
df_offers = spark.read.json(os.path.join(raw_data_dir, 'offers.json'))
df_transactions = spark.read.json(os.path.join(raw_data_dir, 'transactions.json'))
df_profile = spark.read.json(os.path.join(raw_data_dir, 'profile.json'))

### Use the **DataPreprocessor** class to preprocess the data

In [None]:
# Build the preprocessor from the profile and transactions DataFrames.
# NOTE(review): df_offers is loaded earlier but is not passed here -- confirm
# that this is intentional.
df_processing = DataPreprocessor(df_profile, df_transactions)
# transform() returns two objects, unpacked as the full processed dataset and
# the modeling dataset; the save cell below calls .repartition()/.write on
# both, so they are presumably Spark DataFrames (verify against DataPreprocessor).
df_procesed, df_model = df_processing.transform()

### Save processed data

In [None]:
# Write the outputs to the same `processed/` folder that the pandas cell reads
# from; previously they were written one level up (data/*.parquet), so a fresh
# Restart & Run All could not find (or would read stale copies of) the files.
# "~" is expanded here because Spark/Hadoop does not expand it itself.
processed_dir = os.path.expanduser('~/ifood_case/data/processed')

# repartition(1) collapses each dataset to a single partition before writing;
# mode("overwrite") replaces any output left by a previous run.
df_procesed.repartition(1).write.mode("overwrite").parquet(
    os.path.join(processed_dir, 'df_all.parquet')
)
df_model.repartition(1).write.mode("overwrite").parquet(
    os.path.join(processed_dir, 'df_model.parquet')
)

### Read the processed data as Pandas DataFrames
This step is required for plotting.

In [None]:
# Expand "~" explicitly rather than relying on the parquet engine's own tilde
# handling: Spark writes each dataset as a directory of part files, and an
# absolute path makes sure that directory is located unambiguously.
processed_dir = os.path.expanduser('~/ifood_case/data/processed')

# NOTE: from here on df_procesed / df_model are rebound to pandas DataFrames
# (they held the DataPreprocessor outputs before); the plotting cell below
# consumes these pandas versions.
df_procesed = pd.read_parquet(os.path.join(processed_dir, 'df_all.parquet'))
df_model = pd.read_parquet(os.path.join(processed_dir, 'df_model.parquet'))

### Use the **HistoricalDataVisualizer** class to generate data exploration visualizations

In [None]:
# Hand both pandas DataFrames (full processed set first, modeling set second)
# to the visualizer, then render its historical-data plots.
viz = HistoricalDataVisualizer(df_procesed, df_model)
viz.plot_historical_data()