# Easy training flow

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Creamos un dataset simple con palabras etiquetadas como positivas o negativas
data = {'word': ['happy', 'joy', 'love', 'sad', 'angry', 'hate', 'excited', 'bored', 'delight', 'fear'],
        'label': ['positive', 'positive', 'positive', 'negative', 'negative', 'negative', 'positive', 'negative', 'positive', 'negative']}

df = pd.DataFrame(data)

print(df)

      word     label
0    happy  positive
1      joy  positive
2     love  positive
3      sad  negative
4    angry  negative
5     hate  negative
6  excited  positive
7    bored  negative
8  delight  positive
9     fear  negative


In [7]:
# Importar las librerías necesarias
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import movie_reviews
import random

# Descargar el corpus de nltk (si es la primera vez que lo usas)
nltk.download('movie_reviews')
nltk.download('punkt')

# 1. Cargar el dataset de IMDb desde NLTK
# El dataset de NLTK contiene reseñas de películas etiquetadas como 'pos' (positivo) o 'neg' (negativo)
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Barajar el dataset para que no haya sesgo de orden
random.shuffle(documents)

# Convertir las reseñas de listas de palabras a frases
reviews = [" ".join(words) for words, category in documents]
sentiments = [category for words, category in documents]

# Crear un DataFrame para facilitar el manejo
df = pd.DataFrame({'review': reviews, 'sentiment': sentiments})

# 2. Preprocesar el texto
# Convertimos las frases en vectores numéricos usando TF-IDF (mejor que Bag of Words para frases)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # Usamos las 5000 palabras más comunes
X = vectorizer.fit_transform(df['review'])  # X son las características (las reseñas vectorizadas)
y = df['sentiment']  # y es la variable objetivo (etiquetas de sentimientos)

# 3. Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Entrenar el modelo
model = LogisticRegression()
model.fit(X_train, y_train)

# 5. Evaluar el modelo
y_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Classification Report:\n {classification_report(y_test, y_pred)}")

# 6. Hacer predicciones con frases nuevas
new_reviews = [
    "The movie was fantastic and had a great plot.",
    "I really hated the film, it was too boring and slow.",
    "The acting was mediocre but the story was excellent.",
    "One of the worst movies I have ever seen."
]

# Convertimos las nuevas frases a su representación numérica
X_new = vectorizer.transform(new_reviews)

# Hacemos predicciones
predictions = model.predict(X_new)

# Mostramos las predicciones
for review, prediction in zip(new_reviews, predictions):
    print(f"Review: '{review}' => Sentiment: {prediction}")


[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/david/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to /home/david/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Accuracy: 0.845
Classification Report:
               precision    recall  f1-score   support

         neg       0.86      0.83      0.85       206
         pos       0.83      0.86      0.84       194

    accuracy                           0.84       400
   macro avg       0.85      0.85      0.84       400
weighted avg       0.85      0.84      0.85       400

Review: 'The movie was fantastic and had a great plot.' => Sentiment: pos
Review: 'I really hated the film, it was too boring and slow.' => Sentiment: neg
Review: 'The acting was mediocre but the story was excellent.' => Sentiment: pos
Review: 'One of the worst movies I have ever seen.' => Sentiment: neg


In [9]:
model.predict_proba(X_new)

array([[0.42800558, 0.57199442],
       [0.60284464, 0.39715536],
       [0.43861451, 0.56138549],
       [0.65080868, 0.34919132]])

In [10]:
predictions

array(['pos', 'neg', 'pos', 'neg'], dtype=object)

In [11]:
import joblib

# Guardar el modelo en un archivo local usando joblib
joblib.dump(model, 'model.pkl')  # Guardamos el modelo
joblib.dump(vectorizer, 'vectorizer.pkl')  # Guardamos también el vectorizador

['vectorizer.pkl']