In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder


# Load the processed dataset; every later cell reads it through `df_final`.
try:
    df_final = pd.read_csv('../data/Processed/df_final.csv')
except FileNotFoundError:
    # Show the friendly hint, then re-raise. Without the re-raise the notebook
    # would keep running with `df_final` undefined and fail in the next cell
    # with an unrelated NameError.
    print("Error: El archivo 'df_final.csv' no se encontró. Asegúrate de que tu ruta es correcta.")
    raise
else:
    # Only reached when the CSV was read without raising.
    print("DataFrame final cargado exitosamente.")

DataFrame final cargado exitosamente.


In [48]:
# Simulated target: 'retained' is 1 when a row's total_watch_time_hours is
# strictly greater than the column mean, and 0 otherwise.
watch_hours = df_final['total_watch_time_hours']
mean_watch_time = watch_hours.mean()
df_final['retained'] = watch_hours.gt(mean_watch_time).astype(int)

# Check how the target is distributed: per-class counts and the share of 1s.
retained_counts = df_final['retained'].value_counts()
retained_share = df_final['retained'].mean()
print("Distribución de 'retained':")
print(retained_counts)
print(f"Proporción de 'retained': {retained_share:.2f}")

Distribución de 'retained':
retained
0    121807
1    100978
Name: count, dtype: int64
Proporción de 'retained': 0.45


In [36]:
# One-hot encode the 'cluster' column. drop='first' leaves out the indicator
# for the first category, and sparse_output=False returns a dense array.
encoder = OneHotEncoder(drop='first', sparse_output=False)
cluster_encoded = encoder.fit_transform(df_final[['cluster']])

# Wrap the encoded array in a DataFrame labelled with the encoder's own
# output names for the 'cluster' input feature.
cluster_df = pd.DataFrame(
    data=cluster_encoded,
    columns=encoder.get_feature_names_out(['cluster']),
)

In [37]:
# Append the one-hot cluster columns to df_final, aligned by row position
# (both sides get a fresh 0..n-1 index).
# Any column already carrying one of the encoded names is dropped first, so
# re-running this cell replaces the encoded columns instead of appending
# duplicates (duplicate labels would make df_final[features] return repeated
# columns). On a first run nothing is dropped and the result is unchanged.
df_final = pd.concat(
    [
        df_final.drop(columns=cluster_df.columns, errors='ignore').reset_index(drop=True),
        cluster_df.reset_index(drop=True),
    ],
    axis=1,
)


In [38]:
# Columns used as model inputs. 'cluster_1' and 'cluster_2' are presumably the
# one-hot columns produced from 'cluster' -- confirm they match the encoder's
# output names if the set of clusters changes.
features = [
    'age',
    'watch_duration_minutes',
    'completion_percentage',
    'cluster_1',
    'cluster_2',
]
X = df_final.loc[:, features]
# Target variable (y)
y = df_final['retained']

In [39]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit a 100-tree random forest on the training split. fit() returns the
# estimator itself, so construction and training are chained.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
print("\nModelo de Random Forest entrenado exitosamente.")


Modelo de Random Forest entrenado exitosamente.


In [42]:
# Predict labels for the held-out rows and score them against the true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [43]:
# Report the overall accuracy followed by the per-class classification report.
# NOTE(review): the label "Precisión" below refers to accuracy, not to the
# 'precision' column of the report printed underneath.
metrics_report = classification_report(y_test, y_pred)
print("\n--- Métricas de Evaluación del Modelo ---")
print(f"Accuracy (Precisión): {accuracy:.2f}")
print("\nReporte de Clasificación:")
print(metrics_report)


--- Métricas de Evaluación del Modelo ---
Accuracy (Precisión): 0.62

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.64      0.68      0.66     24447
           1       0.58      0.54      0.56     20110

    accuracy                           0.62     44557
   macro avg       0.61      0.61      0.61     44557
weighted avg       0.62      0.62      0.62     44557



In [44]:
# Pair each input column name with the fitted model's importance score and
# list them from most to least important.
print("\n--- Importancia de las Variables del Modelo (Feature Importance) ---")
importance_by_feature = {
    'Variable': features,
    'Importancia': model.feature_importances_,
}
feature_importance = pd.DataFrame(importance_by_feature).sort_values(
    by='Importancia',
    ascending=False,
)
print(feature_importance)


--- Importancia de las Variables del Modelo (Feature Importance) ---
                 Variable  Importancia
2   completion_percentage     0.442449
1  watch_duration_minutes     0.309416
0                     age     0.158399
3               cluster_1     0.063969
4               cluster_2     0.025767
