In [1]:
import re
from datetime import datetime

import nltk
import pandas as pd
from data_analysis_octopus import *
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score, r2_score
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from unidecode import unidecode
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor, LogisticRegression
from xgboost import XGBClassifier


nltk.download('stopwords')
stop_words = set(stopwords.words('spanish'))  # Puedes cambiar 'spanish' por el idioma que necesites


def freq_discrete(df, features):
    for feature in features:
        print(f"Feature: {feature}")
        abs_ = df[feature].value_counts(dropna=False).to_frame().rename(columns={"count": "Absolute frequency"})
        rel_ = df[feature].value_counts(dropna=False, normalize= True).to_frame().rename(columns={"proportion": "Relative frequency"})
        freq = abs_.join(rel_)
        freq["Accumulated frequency"] = freq["Absolute frequency"].cumsum()
        freq["Accumulated %"] = freq["Relative frequency"].cumsum()
        freq["Absolute frequency"] = freq["Absolute frequency"].map(lambda x: "{:,.0f}".format(x))
        freq["Relative frequency"] = freq["Relative frequency"].map(lambda x: "{:,.2%}".format(x))
        freq["Accumulated frequency"] = freq["Accumulated frequency"].map(lambda x: "{:,.0f}".format(x))
        freq["Accumulated %"] = freq["Accumulated %"].map(lambda x: "{:,.2%}".format(x))
        display(freq)


def extract_date(text):
    try:
        date_string = text.split(", ")[0]

        return datetime.strptime(date_string, "%m/%d/%y").date()
    
    except:
        return ""
    
def convert_hour_format(hour_str):
    hour_obj = datetime.strptime(hour_str, '%I:%M %p')
    # Format the object datetime into 24 hours format
    hour_24h = hour_obj.strftime('%H:%M')
    
    return hour_24h


def extract_hour(text):
    try: 
        return convert_hour_format(
            text.split(", ")[1].split(" - ")[0].replace("\u202f", " ")
        )
    
    except:
        return ""
    

def get_weekday(date):
    # Obtener el nombre del día de la semana
    weekday = date.strftime('%A')
    
    # Mapear el nombre del día de la semana a las iniciales
    days = {'Monday': 'L', 'Tuesday': 'M', 'Wednesday': 'X', 'Thursday': 'J',
            'Friday': 'V', 'Saturday': 'S', 'Sunday': 'D'}
    
    # Devolver la inicial del día de la semana
    return days[weekday]


def extract_name(text):
    try:
        return text.split(", ")[1].split("- ")[1].split(":")[0]
    
    except:
        return ""

def extract_message(text):
    try:
        return text.split(", ")[1].split("- ")[1].split(": ")[1]
    except:
        return ""
    
def extract_data(text):
    date = extract_date(text)
    hour = extract_hour(text)
    name = extract_name(text)
    message = extract_message(text)

    return (date, hour, name, message)


def group_hours(data: pd.DataFrame, column: str):
    bins = data["hour"].quantile([0, 0.25, 0.5, 0.75, 1]).values.astype(int)
    labels = [f"{bins[i]}_a_{bins[i+1]}" for i in range(len(bins)-1)]

    return pd.cut(data[column], bins=bins, labels=labels, right=False)


def clean_text(texto):
    # Convertir a minúsculas y quitar acentos
    texto = unidecode(texto.lower())
    
    # Quitar caracteres que no sean letras
    texto = re.sub(r'[^a-zA-Z]', ' ', texto)
    
    # Quitar stopwords
    palabras = texto.split()
    palabras = [palabra for palabra in palabras if palabra not in stop_words]
    
    # Unir las palabras nuevamente en un solo string
    texto_limpio = ' '.join(palabras)
    
    return texto_limpio


def transform_text_with_svd(data, column, n_components=3, vectorizer=None, svd=None):
    # Paso 1: Vectorización TF-IDF
    if vectorizer is None:
        vectorizer = TfidfVectorizer()
        vectorized_message = vectorizer.fit_transform(data[column])
    else:
        vectorized_message = vectorizer.transform(data[column])

    # Paso 2: Aplicar TruncatedSVD en lugar de PCA para matrices dispersas
    # TruncatedSVD es más eficiente para matrices dispersas generadas por TF-IDF
    if svd is None:
        svd = TruncatedSVD(n_components=n_components)
        transformed_svd = svd.fit_transform(vectorized_message)
    else:
        transformed_svd = svd.transform(vectorized_message)

    svd_df = pd.DataFrame(transformed_svd, columns=[f"component_{n}" for n in range(n_components)])
    
    return vectorizer, svd, svd_df


def train_regressor_model(X, y, model, param_grid=None):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    best_model = model()

    if param_grid:
        grid_search = GridSearchCV(best_model, param_grid, cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_

        print("Mejores hiperparámetros encontrados:")
        print(grid_search.best_params_)

    best_model.fit( X_train, y_train)

    # Evaluación del modelo con validación cruzada
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=10, scoring="r2")
    train_score = cv_scores.mean()

    print("R2 train:", train_score)

    y_pred = best_model.predict(X_test)
    test_score = r2_score(y_test, y_pred)
    print("R2 test:", test_score)

    # Obtener las características más importantes
    feature_importances = best_model.coef_
    feature_names = X_train.columns
    feature_importances_df = pd.DataFrame({
        'Característica': feature_names,
        'Importancia': np.abs(feature_importances),
        "Coef": feature_importances
    }).sort_values("Importancia", ascending=False)

    return feature_importances_df


def train_classifier_model(X, y, model, param_grid=None):
    # Dividir los datos en conjuntos de entrenamiento y prueba
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Ajustar y transformar los datos de entrenamiento
    vectorizer, svd, svd_df_train = transform_text_with_svd(X_train, "message")

    # Combinar todas las características del conjunto de entrenamiento
    X_train = pd.concat([svd_df_train, X_train.reset_index(drop=True)], axis=1)
    X_train = X_train.drop(columns=["message"])

    # Transformar los datos de prueba usando los objetos ajustados del conjunto de entrenamiento
    _, _, svd_df_test = transform_text_with_svd(X_test, "message", vectorizer=vectorizer, svd=svd)

    # Combinar todas las características del conjunto de prueba
    X_test = pd.concat([svd_df_test, X_test.reset_index(drop=True)], axis=1)
    X_test = X_test.drop(columns=["message"])
    
    # Crear el clasificador base
    if 'n_jobs' in model.__dict__:
        best_model = model(n_jobs=-1)
    else:
        best_model = model()

    # Si se proporciona un grid de parámetros, realizar GridSearchCV
    if param_grid:
        grid_search = GridSearchCV(best_model, param_grid, cv=5, scoring="f1_macro", n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Obtener los mejores hiperparámetros y el mejor modelo
        best_model = grid_search.best_estimator_
        
        print("Mejores hiperparámetros encontrados GridSearchCV:")
        print(grid_search.best_params_)

    # Entrenar el modelo con los mejores parámetros encontrados o el modelo base
    best_model.fit(X_train, y_train)

    # Evaluación del modelo con validación cruzada
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=10, scoring='f1_macro')
    train_score = cv_scores.mean()


    # Predicción y evaluación en el conjunto de prueba
    y_pred = best_model.predict(X_test)
    print("\nReporte de Clasificación en el conjunto de prueba:")
    print(classification_report(y_test, y_pred))

    # Evaluación del modelo en el conjunto de prueba
    test_score = f1_score(y_test, y_pred, average='macro')

    print("F1 Macro Score de validación cruzada (train):", train_score)
    print("F1 Macro Score en el conjunto de prueba (test):", test_score)

    if hasattr(best_model, 'coef_'):
    # Obtener los coeficientes del modelo
        feature_importances = best_model.coef_[0]
        abs_coef = np.abs(feature_importances)
        coef_abs_tuples = [(feature_importances[i], abs_coef[i]) for i in range(len(feature_importances))]

        # Ordenar la lista de tuplas por el valor absoluto en orden descendente
        sorted_coef_abs_tuples = sorted(coef_abs_tuples, key=lambda x: x[1], reverse=True)
        feature_importances = [coef[0] for coef in sorted_coef_abs_tuples]

    else:
        feature_importances = best_model.feature_importances_

    feature_names = X_train.columns
    feature_importances_df = pd.DataFrame({
        'Característica': feature_names,
        'Importancia': feature_importances
    }).sort_values('Importancia', ascending=False)

    return feature_importances_df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
raw_df = pd.read_csv("../data/WhatsApp-Chat.txt", sep='\t')
raw_df.columns = ["raw_text"]

df = raw_df.copy()
df[["date", "hours_minutes", "user", "message"]] = df["raw_text"].apply(lambda row: pd.Series(extract_data(row)))

df = df[df["date"] != ""]

df["weekday"] = df["date"].apply(lambda row: get_weekday(row))
df["message_lenght"] = df["message"].str.len()
df[["hour", "minutes"]] = df["hours_minutes"].str.split(":", expand=True)

df["hour"] = df["hour"].astype(int)

df["hours_group"] = group_hours(df, column="hour")

df["message"] = df["message"].apply(clean_text)

df = df[df["user"].isin(["User_1", "User_2"])]

df.shape

(2667, 10)

In [5]:
df["user"].value_counts()

user
User_2    1479
User_1    1188
Name: count, dtype: int64

In [3]:
df["hours_group"].value_counts()

hours_group
10_a_14    846
14_a_17    680
17_a_23    621
0_a_10     507
Name: count, dtype: int64

In [4]:
df["weekday"].value_counts()

weekday
V    497
M    425
J    399
L    385
D    373
X    345
S    243
Name: count, dtype: int64

In [6]:
categoricas = [
    # "user",
    "weekday",
    "hours_group",
    # "minutes",
]

numericas = [
    "message_lenght",
]

dummies_df = pd.get_dummies(df[categoricas], dtype=int)
df = pd.concat([df, dummies_df], axis=1)

In [7]:
label_encoder = LabelEncoder()
df['encoded_user'] = label_encoder.fit_transform(df['user'])

In [8]:
mask = [
    "raw_text",
    "date",
    "hours_minutes",
    "user",
    "hour",
    "minutes",
    "weekday",
    # "message",
    "hours_group",
    "encoded_user"
]

X = df[df.columns[df.columns.isin(mask) == False]]
y = df["encoded_user"]

### Modelación

In [40]:
parametros = {
    'max_depth': [None, 3, 5, 7, 10],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random']
}

# Llamada a la función con los parámetros y datos
feature_importances_df = train_classifier_model(X, y, model=DecisionTreeClassifier, param_grid=parametros)
feature_importances_df

Mejores hiperparámetros encontrados GridSearchCV:
{'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 6, 'min_samples_split': 15, 'splitter': 'best'}

Reporte de Clasificación en el conjunto de prueba:
              precision    recall  f1-score   support

           0       0.71      0.73      0.72       442
           1       0.66      0.64      0.64       359

    accuracy                           0.69       801
   macro avg       0.68      0.68      0.68       801
weighted avg       0.69      0.69      0.69       801

F1 Macro Score de validación cruzada (train): 0.6819459010007799
F1 Macro Score en el conjunto de prueba (test): 0.6822659281091716


Unnamed: 0,Característica,Importancia
0,component_0,0.320141
2,component_2,0.301105
3,message_lenght,0.214239
12,hours_group_10_a_14,0.0235
5,weekday_J,0.021502
14,hours_group_17_a_23,0.021269
13,hours_group_14_a_17,0.014541
4,weekday_D,0.014489
8,weekday_S,0.012957
11,hours_group_0_a_10,0.012469


In [44]:
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 300, 400, 500]
}

# Llamada a la función con los parámetros y datos
feature_importances_df = train_classifier_model(X, y, model=LogisticRegression, param_grid=param_grid)
feature_importances_df

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mejores hiperparámetros encontrados GridSearchCV:
{'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}

Reporte de Clasificación en el conjunto de prueba:
              precision    recall  f1-score   support

           0       0.63      0.85      0.72       442
           1       0.68      0.39      0.50       359

    accuracy                           0.64       801
   macro avg       0.66      0.62      0.61       801
weighted avg       0.65      0.64      0.62       801

F1 Macro Score de validación cruzada (train): 0.598757564748194
F1 Macro Score en el conjunto de prueba (test): 0.6109960892569588


Unnamed: 0,Característica,Importancia
0,component_0,1.509771
1,component_1,0.489626
2,component_2,0.355707
7,weekday_M,0.090896
8,weekday_S,0.090675
11,hours_group_0_a_10,0.027328
14,hours_group_17_a_23,-0.004068
13,hours_group_14_a_17,-0.015225
12,hours_group_10_a_14,-0.019697
10,weekday_X,-0.039553


In [45]:
param_grid = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200]
}

feature_importances_df = train_classifier_model(X, y, model=XGBClassifier, param_grid=param_grid)
feature_importances_df

Mejores hiperparámetros encontrados GridSearchCV:
{'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}

Reporte de Clasificación en el conjunto de prueba:
              precision    recall  f1-score   support

           0       0.74      0.78      0.76       442
           1       0.71      0.67      0.69       359

    accuracy                           0.73       801
   macro avg       0.73      0.72      0.72       801
weighted avg       0.73      0.73      0.73       801

F1 Macro Score de validación cruzada (train): 0.7228438774405087
F1 Macro Score en el conjunto de prueba (test): 0.7242597542693519


Unnamed: 0,Característica,Importancia
2,component_2,0.09632
8,weekday_S,0.088129
1,component_1,0.086113
0,component_0,0.083031
3,message_lenght,0.078756
9,weekday_V,0.067603
11,hours_group_0_a_10,0.066868
12,hours_group_10_a_14,0.061286
7,weekday_M,0.058354
4,weekday_D,0.055809


### Linear Regression

In [10]:
dummies_df = pd.get_dummies(df["user"], dtype=int)
df = pd.concat([df, dummies_df], axis=1)


mask = [
    "raw_text",
    "date",
    "hours_minutes",
    "user",
    "message",
    "weekday",
    "message_lenght",
    "hour",
    "minutes",
    "hours_group",
    "encoded_user"
]

X = df[df.columns[df.columns.isin(mask) == False]]
y = df["message_lenght"]

In [11]:
feature_importances_df = train_regressor_model(X, y, LinearRegression, None)
feature_importances_df

R2 train: 0.018022722827743254
R2 test: 0.02114537523611648


Unnamed: 0,Característica,Importancia,Coef
11,User_1,705759200000000.0,-705759200000000.0
12,User_2,705759200000000.0,-705759200000000.0
3,weekday_M,111794400000000.0,-111794400000000.0
4,weekday_S,111794400000000.0,-111794400000000.0
6,weekday_X,111794400000000.0,-111794400000000.0
2,weekday_L,111794400000000.0,-111794400000000.0
0,weekday_D,111794400000000.0,-111794400000000.0
1,weekday_J,111794400000000.0,-111794400000000.0
5,weekday_V,111794400000000.0,-111794400000000.0
9,hours_group_14_a_17,2.28125,-2.28125


In [12]:
param_grid = {
    'alpha': np.logspace(-6, 5, 21),  # Probando 13 valores de alpha de 10^-3 a 10^3
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    'tol': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    'fit_intercept': [True]
}

feature_importances_df = train_regressor_model(X, y, Ridge, param_grid=param_grid)
feature_importances_df

Mejores hiperparámetros encontrados:
{'alpha': 50.11872336272735, 'fit_intercept': True, 'solver': 'lsqr', 'tol': 0.1}
R2 train: 0.023043085258243758
R2 test: 0.024475648762954827


Unnamed: 0,Característica,Importancia,Coef
12,User_2,3.080625,3.080625
11,User_1,3.080625,-3.080625
5,weekday_V,0.779306,0.779306
9,hours_group_14_a_17,0.704017,-0.704017
3,weekday_M,0.455679,-0.455679
8,hours_group_10_a_14,0.282919,0.282919
6,weekday_X,0.22041,-0.22041
7,hours_group_0_a_10,0.213562,0.213562
1,weekday_J,0.203678,0.203678
10,hours_group_17_a_23,0.189727,0.189727


In [13]:
param_grid = {
    'alpha': np.logspace(-5, 5, 20),  # Más valores de alpha entre 10^-5 y 10^5
    'fit_intercept': [True, False],
    'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]  # Más valores para la tolerancia
}


feature_importances_df = train_regressor_model(X, y, Lasso, param_grid=param_grid)
feature_importances_df

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Mejores hiperparámetros encontrados:
{'alpha': 0.1623776739188721, 'fit_intercept': True, 'tol': 0.001}
R2 train: 0.02201790160488141
R2 test: 0.025530430449225094


Unnamed: 0,Característica,Importancia,Coef
11,User_1,5.581849,-5.581849
5,weekday_V,1.515013,1.515013
9,hours_group_14_a_17,0.8608217,-0.8608217
1,weekday_J,0.3330508,0.3330508
7,hours_group_0_a_10,0.141228,0.141228
3,weekday_M,0.126819,-0.126819
12,User_2,6.138811e-16,6.138811e-16
0,weekday_D,0.0,-0.0
2,weekday_L,0.0,-0.0
4,weekday_S,0.0,-0.0


In [14]:
param_grid_sgd = {
    "loss": ["squared_loss", "huber"],
    "penalty": ["l2", "l1", "elasticnet"],
    "alpha": [x/100 for x in range(100)],
    "epsilon": [x/10 for x in range(10)],
    "learning_rate": ["constant", "invscaling", "adaptive"],
}

feature_importances_df = train_regressor_model(X, y, SGDRegressor, param_grid=param_grid)
feature_importances_df

Mejores hiperparámetros encontrados:
{'alpha': 0.04832930238571752, 'fit_intercept': True, 'tol': 0.0001}
R2 train: 0.02240601330541896
R2 test: 0.024340919665617577


Unnamed: 0,Característica,Importancia,Coef
12,User_2,4.795685,4.795685
5,weekday_V,2.37458,2.37458
7,hours_group_0_a_10,1.711336,1.711336
1,weekday_J,1.470408,1.470408
10,hours_group_17_a_23,1.142151,1.142151
11,User_1,1.070623,-1.070623
8,hours_group_10_a_14,0.866019,0.866019
3,weekday_M,0.505218,-0.505218
9,hours_group_14_a_17,0.275856,-0.275856
0,weekday_D,0.265124,0.265124
