In [1]:
import pandas as pd
import polars as pl
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

In [2]:
# Stage the dataset on local disk: copy the parquet file from the bucket path
# into /home/eanegrin/datasets/, which is where the load cell reads it from.
# NOTE(review): the recorded output shows ~862 MiB transferred and the command
# has no skip-if-present option, so every run of this cell copies the file
# again — consider `gsutil cp -n` if the source file does not change.
!gsutil cp /home/eanegrin/buckets/b1/datasets/competencia_03_inflacion_adj_ipc.parquet /home/eanegrin/datasets/

Copying file:///home/eanegrin/buckets/b1/datasets/competencia_03_inflacion_adj_ipc.parquet...
- [1 files][861.8 MiB/861.8 MiB]                                                
Operation completed over 1 objects/861.8 MiB.                                    


In [3]:
# Locations of the project files. The active `base_path` is the bucket path;
# the commented value is an alternative (Windows-style) location.
# base_path = 'C:/Eugenio/Maestria/DMEyF/'
base_path = '/home/eanegrin/buckets/b1/'
dataset_file = 'competencia_03_inflacion_adj_ipc.parquet'
dataset_path = f'{base_path}datasets/'

# The frame is read from the copy under /home/eanegrin/datasets/ rather than
# from `dataset_path`. To read from `dataset_path` instead, use:
#   data = pd.read_parquet(dataset_path + dataset_file)
data = pd.read_parquet(f'/home/eanegrin/datasets/{dataset_file}')

In [4]:
# Quick size check of the loaded frame, displayed as (rows, columns).
data.shape

(4901237, 155)

In [5]:
# Type clean-up before the per-month aggregations. Every statement in this
# cell is safe to re-run on an already-prepared frame.

# NOTE(review): foto_mes is cast to string presumably so the months are drawn
# as evenly spaced categories on the x axis instead of as numbers — confirm.
data['foto_mes'] = data['foto_mes'].astype('string')

# NOTE(review): presumably these two columns are not loaded as floats; casting
# them keeps the zero/mean aggregations numeric — confirm against the parquet
# schema.
mobile_cols = ['tmobile_app', 'cmobile_app_trx']
data[mobile_cols] = data[mobile_cols].astype('float')

# Drop the column by reassignment with errors='ignore': the previous
# inplace drop raised KeyError when the cell was executed a second time,
# because the column was already gone.
data = data.drop(columns=['clase_ternaria'], errors='ignore')

### Nulos, Ceros y medias

In [10]:
# Per-month summary of every column of `data`:
#   nulos  -> percentage of null values
#   zeros  -> percentage of values equal to 0
#   medias -> mean
# Each frame has one row per foto_mes; reset_index() turns the group key back
# into a regular 'foto_mes' column placed first.
nulos = data.groupby('foto_mes').agg(lambda x: x.isna().mean() * 100).reset_index()
zeros = data.groupby('foto_mes').agg(lambda x: (x == 0).mean() * 100).reset_index()
medias = data.groupby('foto_mes').mean().reset_index()

foto_mes = nulos['foto_mes']

# Select the variables by name. reset_index() only prepends 'foto_mes' (there
# is no separate "index" column), so a positional `columns[2:]` slice silently
# drops the first real variable. Add identifier columns to `skip_columns` if
# they should not be charted.
skip_columns = {'foto_mes'}
variables = [col for col in nulos.columns if col not in skip_columns]

# One PDF page per variable: null/zero percentages on the left axis and the
# monthly average on the right axis.
with PdfPages('combined_charts.pdf') as pdf:
    for variable in variables:
        fig, ax1 = plt.subplots(figsize=(12, 6))
        try:
            # Primary axis: % of nulls and % of zeros.
            ax1.plot(foto_mes, nulos[variable], marker='o', label=f'% Nulls ({variable})', color='red')
            ax1.plot(foto_mes, zeros[variable], marker='^', label=f'% Zeros ({variable})', color='blue')
            ax1.set_xlabel("foto_mes", fontsize=12)
            ax1.set_ylabel("% Values", fontsize=12, color='black')
            ax1.tick_params(axis='y', labelcolor='black')
            # Percentages are never negative, so anchoring at 0 loses nothing.
            ax1.set_ylim(bottom=0)
            ax1.grid(True, linestyle='--', alpha=0.5)

            # Rotate the month labels on the axes object itself rather than
            # through pyplot's implicit "current axes".
            ax1.tick_params(axis='x', labelrotation=45)

            # Secondary axis: monthly averages.
            averages = medias[variable]
            ax2 = ax1.twinx()
            ax2.plot(foto_mes, averages, marker='', label=f'Average ({variable})', color='green')
            ax2.set_ylabel("Average Value", fontsize=12, color='green')
            ax2.tick_params(axis='y', labelcolor='green')
            # Anchor at 0 only when no average is negative; an unconditional
            # bottom=0 would clip negative averages out of the chart.
            lowest_average = averages.min()
            if pd.notna(lowest_average) and lowest_average >= 0:
                ax2.set_ylim(bottom=0)

            fig.suptitle(f"Variable: {variable}", fontsize=14)
            fig.tight_layout()

            # Single legend that merges the entries of both axes.
            lines, labels = ax1.get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax1.legend(lines + lines2, labels + labels2, loc='upper left')

            # Save this figure explicitly instead of "the current figure".
            pdf.savefig(fig)
        finally:
            # Always release the figure, even if plotting this variable fails,
            # so a long run does not accumulate open figures.
            plt.close(fig)

### Charts individuales

In [7]:
# Disabled cell (every line is commented out). If re-enabled it would write
# 'nulos_charts.pdf' with one page per variable showing the percentage of null
# values per foto_mes. The cell under "Nulos, Ceros y medias" plots the same
# `nulos` aggregation together with zeros and averages.
# NOTE(review): the `columns[2:]` slice below assumes an extra "index" column;
# after groupby(...).reset_index() only 'foto_mes' is prepended — verify
# before re-enabling.

# nulos = data.groupby('foto_mes').agg(lambda x: x.isna().mean() * 100)

# nulos = nulos.reset_index()

# with PdfPages('nulos_charts.pdf') as pdf:

#     for variable in nulos.columns[2:]: # to skip index and foto_mes
#         plt.figure(figsize=(10, 5))
#         plt.plot(nulos['foto_mes'], nulos[variable], marker='o', label=variable)
#         plt.title(f"Porcentaje de valores nulos: {variable}")
#         plt.xlabel("foto_mes")
#         plt.ylabel("% de valores nulos")

#         plt.ylim(bottom=0)
        
#         plt.xticks(rotation=45)
        
#         plt.legend()
#         plt.grid(True, linestyle='--', alpha=0.5)
#         plt.tight_layout()
        
#         pdf.savefig()
#         plt.close() 


In [9]:
# Disabled cell (every line is commented out). If re-enabled it would write
# 'zeros_charts.pdf' with one page per variable showing the percentage of
# values equal to 0 per foto_mes. The cell under "Nulos, Ceros y medias" plots
# the same `zeros` aggregation together with nulls and averages.
# NOTE(review): the `columns[2:]` slice below assumes an extra "index" column;
# after groupby(...).reset_index() only 'foto_mes' is prepended — verify
# before re-enabling.

# zeros = data.groupby('foto_mes').agg(lambda x: (x == 0).mean() * 100)

# zeros = zeros.reset_index()

# with PdfPages('zeros_charts.pdf') as pdf:

#     for variable in zeros.columns[2:]: # to skip index and foto_mes
#         plt.figure(figsize=(10, 5))
#         plt.plot(zeros['foto_mes'], zeros[variable], marker='o', label=variable)
#         plt.title(f"Porcentaje de valores == 0: {variable}")
#         plt.xlabel("foto_mes")
#         plt.ylabel("% de valores == 0")

#         plt.ylim(bottom=0)
        
#         plt.xticks(rotation=45)
        
#         plt.legend()
#         plt.grid(True, linestyle='--', alpha=0.5)
#         plt.tight_layout()
        
#         pdf.savefig()
#         plt.close()

In [11]:
# Disabled cell (every line is commented out). If re-enabled it would write
# 'medias_charts.pdf' with one page per variable showing the mean per
# foto_mes. The cell under "Nulos, Ceros y medias" plots the same `medias`
# aggregation together with null and zero percentages.
# NOTE(review): `plt.ylim(bottom=0)` below would hide any negative average,
# and the `columns[2:]` slice assumes an extra "index" column that
# groupby(...).reset_index() does not create — verify before re-enabling.

# medias = data.groupby('foto_mes').mean()

# medias = medias.reset_index()

# with PdfPages('medias_charts.pdf') as pdf:

#     for variable in medias.columns[2:]: # to skip index and foto_mes
#         plt.figure(figsize=(10, 5))
#         plt.plot(medias['foto_mes'], medias[variable], marker='o', label=variable)
#         plt.title(f"Promedio: {variable}")
#         plt.xlabel("foto_mes")
#         plt.ylabel("Promedio")

#         plt.ylim(bottom=0)
        
#         plt.xticks(rotation=45)
        
#         plt.legend()
#         plt.grid(True, linestyle='--', alpha=0.5)
#         plt.tight_layout()
        
#         pdf.savefig()
#         plt.close()