#
# Clasificación básica de texto
#

#
# Carga del dataset
#

In [None]:
import pandas as pd

# Load the sentences dataset from the zipped CSV file.
# index_col=False tells pandas not to use the first column as the index.
dataframe = pd.read_csv("sentences.csv.zip", compression="zip", index_col=False)

#
# Exploración de los datos
#

In [ ]:
# Dimensions of the dataset as a (rows, columns) tuple.
dataframe.shape

#
# Data
#

In [ ]:
# Preview the first five rows of the dataset.
dataframe.head(n=5)

#
# Sentimientos (clases)
#

In [ ]:
# Number of rows for each distinct value of the "target" column.
dataframe["target"].value_counts()

#
# Ejemplos de frases positivas
#

In [ ]:
# Select the positive rows once, then print up to the first five phrases.
# head(5) simply yields fewer items when there are fewer than five matches,
# instead of raising IndexError as positional indexing would.
positive_phrases = dataframe.loc[dataframe.target == "positive", "phrase"]
for phrase in positive_phrases.head(5):
    print(phrase)

#
# Ejemplos de frases neutras
#

In [ ]:
# Select the neutral rows once, then print up to the first five phrases.
# head(5) simply yields fewer items when there are fewer than five matches,
# instead of raising IndexError as positional indexing would.
neutral_phrases = dataframe.loc[dataframe.target == "neutral", "phrase"]
for phrase in neutral_phrases.head(5):
    print(phrase)

#
# Particionamiento de datos
#

In [ ]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; phrases are the inputs and the
# "target" column holds the labels.
# NOTE(review): shuffle=False keeps the file order, so the test set is the
# final 30% of the rows -- confirm the CSV is not sorted by class.
X_train, X_test, y_train, y_test = train_test_split(
    dataframe["phrase"], dataframe["target"], shuffle=False, test_size=0.3
)

#
# Preprocesamiento de texto
#

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder; its vocabulary is learned from the training phrases.
vectorizer = CountVectorizer(
    analyzer="word",  # build features from word tokens
    lowercase=True,  # fold case before tokenizing
    token_pattern=r"\b[a-zA-Z]\w+\b",  # tokens start with a letter, >= 2 chars
    stop_words="english",  # drop scikit-learn's built-in English stop words
    min_df=2,  # integer: ignore terms found in fewer than 2 documents
    max_df=0.99,  # float: ignore terms found in more than 99% of documents
    binary=True,  # record presence/absence (1/0) instead of raw counts
)
# fit() builds the vocabulary, i.e. the terms that become the columns of the
# document-term matrix.
vectorizer.fit(X_train)

#
# Columnas de la matriz documento-término
#

In [ ]:
# Terms in the fitted vocabulary, one per column of the document-term matrix.
vectorizer.get_feature_names_out()

#
# Especificación del modelo
#

In [ ]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier; max_iter=1000 caps the solver at 1000
# iterations.
clf = LogisticRegression(max_iter=1000)
# Bare expression so the notebook displays the estimator.
clf

#
# Entrenamiento del modelo
#

In [ ]:
# Encode the training phrases with the fitted vocabulary, then fit the
# classifier on the resulting document-term matrix.
X_train_vectorized = vectorizer.transform(X_train)
clf.fit(X_train_vectorized, y_train)

#
# Evaluación de la precisión del modelo
#

In [ ]:
from sklearn.metrics import accuracy_score

#
# Muestra de entrenamiento
#

In [ ]:
# Accuracy on the training sample (the same data the model was fitted on).
accuracy_score(y_train, clf.predict(X_train_vectorized))

#
# Muestra de prueba
#

In [ ]:
# Encode the held-out phrases with the training vocabulary and predict them;
# `predictions` is reused by the confusion-matrix cell.
X_test_vectorized = vectorizer.transform(X_test)
predictions = clf.predict(X_test_vectorized)

# Accuracy on the test sample.
accuracy_score(y_test, predictions)

#
# Almacenamiento del modelo
#

In [ ]:
import pickle

# Persist the classifier and the vectorizer, each to its own pickle file.
for path, obj in (("clf.pickle", clf), ("vectorizer.pickle", vectorizer)):
    with open(path, "wb") as fh:
        pickle.dump(obj, fh)

#
# Matriz de confusión
#

In [ ]:
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the confusion matrix for the test-set predictions, title the figure,
# and print the underlying matrix.
disp = ConfusionMatrixDisplay.from_predictions(
    y_true=y_test, y_pred=predictions, cmap="Greens"
)
disp.figure_.suptitle("Confusion Matrix")
print(f"Confusion matrix:\n{disp.confusion_matrix}")

#
# Uso del modelo
#

In [ ]:
# Reload both artifacts from disk. Unpickling can execute arbitrary code, so
# only load pickle files that come from a trusted source.
with open("clf.pickle", "rb") as fh:
    new_clf = pickle.load(fh)
with open("vectorizer.pickle", "rb") as fh:
    new_vectorizer = pickle.load(fh)

# Score the reloaded pipeline on every row of the dataset (training and test
# rows together).
accuracy_score(
    dataframe.target,
    new_clf.predict(new_vectorizer.transform(dataframe.phrase)),
)