In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report


In [5]:
clientes = pd.read_excel("clientes.xlsx")
ventas = pd.read_excel("ventas.xlsx")
productos = pd.read_excel("productos.xlsx")

clientes.head(), ventas.head(), productos.head()


(   id_cliente   nombre_cliente                     email      ciudad  \
 0           1    Mariana Lopez    mariana.lopez@mail.com  Carlos Paz   
 1           2    Nicolas Rojas    nicolas.rojas@mail.com  Carlos Paz   
 2           3  Hernan Martinez  hernan.martinez@mail.com  Rio Cuarto   
 3           4     Uma Martinez     uma.martinez@mail.com  Carlos Paz   
 4           5  Agustina Flores  agustina.flores@mail.com     Cordoba   
 
   fecha_alta  
 0 2023-01-01  
 1 2023-01-02  
 2 2023-01-03  
 3 2023-01-04  
 4 2023-01-05  ,
    id_venta      fecha  id_cliente    nombre_cliente  \
 0         1 2024-06-19          62  Guadalupe Romero   
 1         2 2024-03-17          49      Olivia Gomez   
 2         3 2024-01-13          20      Tomas Acosta   
 3         4 2024-02-27          36    Martina Molina   
 4         5 2024-06-11          56        Bruno Diaz   
 
                        email     medio_pago  
 0  guadalupe.romero@mail.com        tarjeta  
 1      olivia.gomez@mail

In [11]:
df = clientes.merge(ventas, on="id_cliente", how="left")
df.head()



Unnamed: 0,id_cliente,nombre_cliente_x,email_x,ciudad,fecha_alta,id_venta,fecha,nombre_cliente_y,email_y,medio_pago
0,1,Mariana Lopez,mariana.lopez@mail.com,Carlos Paz,2023-01-01,54.0,2024-03-26,Mariana Lopez,mariana.lopez@mail.com,tarjeta
1,1,Mariana Lopez,mariana.lopez@mail.com,Carlos Paz,2023-01-01,105.0,2024-02-06,Mariana Lopez,mariana.lopez@mail.com,transferencia
2,2,Nicolas Rojas,nicolas.rojas@mail.com,Carlos Paz,2023-01-02,16.0,2024-04-12,Nicolas Rojas,nicolas.rojas@mail.com,efectivo
3,3,Hernan Martinez,hernan.martinez@mail.com,Rio Cuarto,2023-01-03,115.0,2024-02-16,Hernan Martinez,hernan.martinez@mail.com,transferencia
4,4,Uma Martinez,uma.martinez@mail.com,Carlos Paz,2023-01-04,,NaT,,,


In [12]:
df["cliente_activo"] = np.where(df["id_venta"].notna(), 1, 0)


In [15]:
from datetime import datetime

# antigÃ¼edad en dÃ­as
df["fecha_alta"] = pd.to_datetime(df["fecha_alta"])
df["antiguedad_dias"] = (datetime.now() - df["fecha_alta"]).dt.days

# cantidad de compras por cliente
compras_por_cliente = ventas.groupby("id_cliente")["id_venta"].count()
df["compras_totales"] = df["id_cliente"].map(compras_por_cliente).fillna(0)

# medio de pago convertido a nÃºmero (moda del cliente)
medio_pago_map = {"tarjeta": 1, "efectivo": 2, "qr": 3, "transferencia": 4}

moda_pago = ventas.groupby("id_cliente")["medio_pago"].agg(lambda x: x.mode()[0] if len(x) > 0 else None)
moda_pago_num = moda_pago.map(medio_pago_map)
df["medio_pago_num"] = df["id_cliente"].map(moda_pago_num).fillna(0)



In [16]:
X = df[["antiguedad_dias", "compras_totales", "medio_pago_num"]]
y = df["cliente_activo"]


In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [18]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [19]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

y_pred_knn = knn.predict(X_test_scaled)

print("MATRIZ DE CONFUSIÃ“N - KNN")
print(confusion_matrix(y_test, y_pred_knn))
print("\nREPORTE DE CLASIFICACIÃ“N - KNN")
print(classification_report(y_test, y_pred_knn))


MATRIZ DE CONFUSIÃ“N - KNN
[[12  0]
 [ 0 34]]

REPORTE DE CLASIFICACIÃ“N - KNN
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        34

    accuracy                           1.00        46
   macro avg       1.00      1.00      1.00        46
weighted avg       1.00      1.00      1.00        46



In [21]:
# importar por si no estÃ¡
from sklearn.linear_model import LogisticRegression

# MODELO 2: RegresiÃ³n LogÃ­stica
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

y_pred_log = logreg.predict(X_test_scaled)

print("ðŸ“Œ MATRIZ DE CONFUSIÃ“N â€“ RegresiÃ³n LogÃ­stica")
print(confusion_matrix(y_test, y_pred_log))

print("\nðŸ“Œ REPORTE DE CLASIFICACIÃ“N â€“ RegresiÃ³n LogÃ­stica")
print(classification_report(y_test, y_pred_log))



ðŸ“Œ MATRIZ DE CONFUSIÃ“N â€“ RegresiÃ³n LogÃ­stica
[[12  0]
 [ 0 34]]

ðŸ“Œ REPORTE DE CLASIFICACIÃ“N â€“ RegresiÃ³n LogÃ­stica
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        34

    accuracy                           1.00        46
   macro avg       1.00      1.00      1.00        46
weighted avg       1.00      1.00      1.00        46

