<a href="https://colab.research.google.com/github/christiangarza1505813/Tesina/blob/main/Model/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip -q install pandas pyarrow scikit-learn joblib

import io, os, textwrap, joblib, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_absolute_error, mean_squared_error


In [7]:
# Raw GitHub location of the dataset CSV; replace this URL to run the
# notebook against a different file.
CSV_URL = "https://raw.githubusercontent.com/christiangarza1505813/Tesina/main/Datos/user_clasification.csv"

# Read the whole file with pandas' default parsing options.
df = pd.read_csv(CSV_URL)

In [40]:
# Convert YMD to pandas datetimes so the sort below is chronological.
df['YMD'] = pd.to_datetime(df['YMD'])

# Order rows by account, then by date, and keep a single row per account:
# the first one in that order (keep='first' is the drop_duplicates default).
df_sorted = df.sort_values(['AccountUuid', 'YMD'])
df = df_sorted.drop_duplicates('AccountUuid')

# Summary statistics of the de-duplicated frame, shown as the cell output.
df.describe()

Unnamed: 0,YMD,Plaid,Issued_Card,Transacting,Pmt_method,YearOfBirth,postal_code,lat,lng,density
count,67678,67678.0,67678.0,67678.0,67678.0,67678.0,67678.0,67678.0,67678.0,67678.0
mean,2024-06-23 07:05:41.605839104,0.042362,0.608839,0.31381,0.132672,1988.604968,64218.32368,35.987574,-98.18396,1668.881913
min,2022-08-08 00:00:00,0.0,0.0,0.0,0.0,1900.0,10002.0,19.30612,-166.47125,0.0
25%,2023-12-25 06:00:00,0.0,0.0,0.0,0.0,1983.0,37241.25,32.93417,-115.12313,144.7
50%,2024-05-09 00:00:00,0.0,1.0,0.0,0.0,1990.0,75141.0,35.24922,-95.8701,821.3
75%,2024-12-30 00:00:00,0.0,1.0,1.0,0.0,1996.0,89104.0,39.72389,-84.54956,1931.5
max,2025-09-13 00:00:00,1.0,1.0,1.0,1.0,2007.0,99901.0,67.25435,-72.37223,58289.6
std,,0.201416,0.488014,0.464043,0.339223,10.402508,26328.559487,4.846153,15.499654,3232.997743


# Calidad de los datos


## % de nulos

In [39]:

# Data-quality check: percentage of null or blank values in every column of df.

# Only text-like columns can contain blank (empty or whitespace-only) strings.
obj_cols = df.select_dtypes(include=['object', 'string']).columns

# Blank-string mask: all False by default, filled in for the text columns only.
mask_empty = pd.DataFrame(False, index=df.index, columns=df.columns)
if len(obj_cols):
    mask_empty[obj_cols] = (
        df[obj_cols]
        .astype('string')
        .apply(lambda s: s.str.strip().eq(''))
        # A missing value compares as <NA> rather than False. It is not a blank
        # string (mask_null already accounts for it), so store a plain False
        # and keep the mask a regular boolean frame.
        .fillna(False)
        .astype(bool)
    )

# Null mask: True wherever df holds NaN / NaT / None / <NA>.
mask_null = df.isna()

# One row per column of df: its dtype and the percentage of rows that are
# null or blank, sorted from worst to best.
resumen = pd.DataFrame({
    'Tipo': df.dtypes.astype(str),
    '% nulos o vacíos': ((mask_null | mask_empty).mean() * 100).round(2)
}).sort_values('% nulos o vacíos', ascending=False)

# Render as a bordered black-on-white table. Only the percentage column needs
# a number format; 'Tipo' is already text.
sty = (resumen.style
       .format({'% nulos o vacíos': '{:.2f} %'})
       .set_table_styles([
           {'selector': 'table', 'props': 'border-collapse:collapse; border:1px solid black; background:white; color:black; font-family:"Times New Roman",serif; font-size:12px;'},
           {'selector': 'th',    'props': 'border:1px solid black; padding:4px; text-align:center; font-weight:bold;'},
           {'selector': 'td',    'props': 'border:1px solid black; padding:4px; text-align:right;'},
           {'selector': 'th.row_heading', 'props': 'text-align:left;'}  # column names (the index) are left-aligned
       ])
)
display(sty)

Unnamed: 0,Tipo,% nulos o vacíos
AccountUuid,object,0.00 %
YMD,datetime64[ns],0.00 %
Creation_date,object,0.00 %
Plaid,float64,0.00 %
Issued_Card,float64,0.00 %
Transacting,float64,0.00 %
Pmt_method,float64,0.00 %
YearOfBirth,float64,0.00 %
HaveSSN,object,0.00 %
postal_code,int64,0.00 %


## Datos únicos

In [44]:


# Cardinality check: number of distinct values per column of df, in absolute
# terms and as a percentage of the row count.
n = len(df)
non_null = df.notna().sum()               # non-null count per column; not used in this cell, kept in case later cells read it
uniq_no_na = df.nunique(dropna=True)      # distinct values, NaN excluded
uniq_with_na = df.nunique(dropna=False)   # distinct values, NaN counted as one more value; also unused in this cell

# The percentage is only defined for a non-empty frame; with zero rows every
# column reports 0.0. A plain conditional avoids evaluating the division when
# n == 0 and keeps the column index attached to the result.
resumen = pd.DataFrame({
    'N': n,
    'Distintos': uniq_no_na,
    '% únicos / total': (uniq_no_na / n * 100).round(2) if n > 0 else 0.0
}).sort_values('% únicos / total', ascending=False)

# Render as a bordered black-on-white table ("IEEE"-like look). Only the
# percentage column needs a number format.
sty = (resumen.style
       .format({'% únicos / total': '{:.2f} %'})
       .set_table_styles([
           {'selector': 'table', 'props': 'border-collapse:collapse; border:1px solid black; background:white; color:black; font-family:"Times New Roman",serif; font-size:12px;'},
           {'selector': 'th',    'props': 'border:1px solid black; padding:4px; text-align:center; font-weight:bold;'},
           {'selector': 'td',    'props': 'border:1px solid black; padding:4px; text-align:right;'},
           {'selector': 'th.row_heading', 'props': 'text-align:left;'}  # column names (the index) are left-aligned
       ])
)
display(sty)


Unnamed: 0,N,Distintos,% únicos / total
AccountUuid,67678,67678,100.00 %
postal_code,67678,10474,15.48 %
lng,67678,10452,15.44 %
lat,67678,10446,15.43 %
density,67678,6212,9.18 %
city,67678,5252,7.76 %
YMD,67678,992,1.47 %
Creation_date,67678,998,1.47 %
YearOfBirth,67678,77,0.11 %
state_name,67678,44,0.07 %
