## Imports

In [None]:
import numpy as np
import re
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import FastText
import matplotlib.pyplot as plt
# Download NLTK resources
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# Cargar Dataset

In [None]:
from datasets import load_dataset

# Load the "stanfordnlp/imdb" dataset through the `datasets` library.
dataset = load_dataset("stanfordnlp/imdb")
# Keep only the "train" split, converted to a pandas DataFrame; later cells
# read its `text` and `label` columns.
df = dataset["train"].to_pandas()
print("Número de ejemplos del dataset:", len(df))

In [None]:
df.head(5)

In [None]:
# Show the first 46 characters of review 270 and its label
# (label 0 prints as 'neg', any other value as 'pos').
print(df.text[270][:46])
print('neg' if df.label[270] == 0 else 'pos'  )

In [None]:
# Show characters 103-173 of review 14500 and its label
# (label 0 prints as 'neg', any other value as 'pos').
print(df.text[14500][103:174])
print('neg' if df.label[14500] == 0 else 'pos'  )

In [None]:
import seaborn as sns
# Bar chart with one bar per distinct `label` value, showing how many rows carry it.
sns.countplot(x='label',data=df)


# Preprocesado de texto

In [None]:
# English stop words held in a set, so the membership test done for every
# token in preprocess_text is a hash lookup.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Clean a raw review and return it as a space-separated string of lemmas.

    Steps, in order: HTML tags are replaced by a space, every character that
    is not an ASCII letter or whitespace is deleted, the text is lower-cased,
    split on whitespace, filtered against the module-level ``stop_words`` set,
    and each surviving token is passed through ``lemmatizer.lemmatize``.

    Args:
        text: Raw review text.

    Returns:
        The cleaned tokens joined with single spaces.
    """
    # Replace tags with a space instead of deleting them: markup such as
    # "<br />" can sit directly between two words, and removing it outright
    # would fuse those words into a single bogus token.
    text = re.sub(r'<[^>]*>', ' ', text)
    # Drop digits, punctuation and any other non-letter characters.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Lower-case before the stop-word test below.
    text = text.lower()
    # Whitespace tokenisation, stop-word removal, then lemmatisation.
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

In [None]:
# Peek at 20 of the stop words. The list is built from a set, so which words
# land in positions 30-49 is not guaranteed to be the same on every run.
stop_words_list = list(stop_words)
stop_words_list[30:50]


In [None]:
# Sanity check: draw one review at random and print its preprocessed form
# followed by the untouched original.
random = df.sample(n=1)
print(preprocess_text(random.text.values[0]))
print(random.text.values[0])

In [None]:
from wordcloud import WordCloud

# Preprocess every review whose label is 1; the cloud below is generated
# from this list, so it has to exist before WC.generate is called.
positive_text = [preprocess_text(text) for text in df[df.label == 1].text]

plt.figure(figsize=(10,10))

WC=WordCloud(width=1000,height=500,max_words=1000,min_font_size=10,max_font_size=95,background_color='white',colormap='magma')
positive_words=WC.generate(' '.join(positive_text))
plt.imshow(positive_words,interpolation='bilinear')
plt.axis('off')

# show must be called; a bare `plt.show` only references the function object.
plt.show()

In [None]:
# Preprocess every review whose label is 0 and render the text as a word cloud.
negative_text = [preprocess_text(text) for text in df[df.label == 0].text]
plt.figure(figsize=(10,10))

WC=WordCloud(width=1000,height=500,max_words=1000,min_font_size=10,max_font_size=95,background_color='white',colormap='magma')
negative_text_words=WC.generate(' '.join(negative_text))
plt.imshow(negative_text_words,interpolation='bilinear')
plt.axis('off')
# show must be called; a bare `plt.show` only references the function object.
plt.show()

# Vectorizing


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_test):
    """Convert two document collections into TF-IDF feature matrices.

    The vectorizer is fitted on ``X_train`` only and then applied to both
    collections, so the test documents never influence the fitted state.

    Args:
        X_train: Training documents.
        X_test: Test documents.

    Returns:
        A ``(train_features, test_features)`` tuple holding the transformed
        training and test documents, in that order.
    """
    vectorizer = TfidfVectorizer().fit(X_train)
    return vectorizer.transform(X_train), vectorizer.transform(X_test)

In [None]:
from sentence_transformers import SentenceTransformer

from functools import lru_cache


@lru_cache(maxsize=1)
def _load_sentence_encoder(model_name):
    """Build the SentenceTransformer for ``model_name``, reusing the last one built."""
    return SentenceTransformer(model_name)


def embeddings_features(X_train, X_test):
    """Encode two document collections with a sentence-embedding model.

    The encoder is obtained through a cached loader, so calling this function
    repeatedly (for instance once per train/test split) constructs the model
    only once instead of on every call.

    Args:
        X_train: Training documents.
        X_test: Test documents.

    Returns:
        A ``(train_embeddings, test_embeddings)`` tuple with the result of
        ``encode`` on each collection, in that order.
    """
    encoder = _load_sentence_encoder('sentence-transformers/all-mpnet-base-v2')
    X_train_embeddings = encoder.encode(X_train)
    X_test_embeddings = encoder.encode(X_test)
    return X_train_embeddings, X_test_embeddings

In [None]:
from sklearn.model_selection import train_test_split

# Clean every review once up front so that all splits share the same text.
X_preprocessed = [preprocess_text(text) for text in df.text]

# One (X_train, X_test, y_train, y_test) tuple per random seed.
train_test_sets = []

# Five shuffled splits seeded 0-4. Each one samples 20% of the rows for
# training and 5% for testing; the remaining rows are unused in that split.
# After the loop, X_train/X_test/y_train/y_test stay bound to the last split.
for i in range(5):
    X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, df.label, test_size=0.05, train_size=0.2, random_state=i, shuffle=True)
    train_test_sets.append((X_train, X_test, y_train, y_test))

# Comparador clasificadores

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt


# Adjusting the function to accept pre-split data
def compare_classifiers(X_train, X_test, y_train, y_test):
    """Fit several scikit-learn classifiers and report each one on the test split.

    For every classifier a header is printed, then the classification report,
    and finally its confusion matrix is plotted and shown.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. All results are printed or plotted.
    """
    # Insertion order of the dict fixes the order in which models are reported.
    candidates = {
        'Logistic Regression': LogisticRegression(random_state=42),
        'Naive Bayes': MultinomialNB(),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'KNN': KNeighborsClassifier(n_neighbors=5),
    }

    for title, estimator in candidates.items():
        print(f'----- {title} -----')
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        print('Classification Report:')
        print(classification_report(y_test, predictions))

        print('Confusion Matrix:')
        matrix = confusion_matrix(y_test, predictions)
        # Label the axes with the classes the fitted estimator actually saw.
        ConfusionMatrixDisplay(
            confusion_matrix=matrix,
            display_labels=estimator.classes_,
        ).plot()
        plt.show()
        print('\n')


# The function now expects four parameters:
# X_train: The training data (text)
# X_test: The test data (text)
# y_train: The training labels
# y_test: The test labels

# Example usage (assuming pre-split data):
# compare_classifiers(X_train, X_test, y_train, y_test)



In [26]:
import tensorflow.keras as keras
from keras.layers import Dropout, Conv1D, GlobalMaxPooling1D, Dense, Reshape
import numpy as np

# Define the neural-network architecture: a 5-unit ReLU hidden layer,
# 10% dropout, and a single sigmoid output unit.
# NOTE(review): this is one shared module-level instance that later cells
# pass to train_nn; repeated fit calls presumably continue from the current
# weights -- confirm whether each train/test split should get a fresh model.
model = keras.Sequential([

    keras.layers.Dense(5, activation='relu'),
    keras.layers.Dropout(0.1),
    keras.layers.Dense(1, activation='sigmoid')  # Output layer with sigmoid activation
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

def threshold_search(y_true, y_proba):
    """Scan decision thresholds and keep the one with the highest accuracy.

    Candidate thresholds are 0.00, 0.01, ..., 0.99. A sample is labelled 1
    when its score is strictly greater than the candidate. On ties the lowest
    threshold wins, because a later candidate must strictly beat the current
    best to replace it.

    Args:
        y_true: Ground-truth binary labels.
        y_proba: Predicted scores, compared element-wise against each threshold.

    Returns:
        A ``(best_threshold, best_score)`` tuple; ``(0, 0)`` if no candidate
        reaches an accuracy above zero.
    """
    winner, top_accuracy = 0, 0
    for cutoff in np.arange(0, 1, 0.01):
        hard_labels = (y_proba > cutoff).astype(int)
        accuracy = accuracy_score(y_true, hard_labels)
        if accuracy > top_accuracy:
            winner, top_accuracy = cutoff, accuracy
    return winner, top_accuracy

def train_nn(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` and return hard 0/1 predictions for ``X_test``.

    Training runs for at most 25 epochs with batch size 32 and stops early
    once ``val_loss`` has not improved for 3 epochs. The predicted scores are
    then binarised with the cutoff chosen by ``threshold_search``, using the
    same strict ``score > threshold`` rule that search applies, so the
    returned labels are exactly the ones the reported best accuracy refers to.

    NOTE(review): the test split is used both as validation data for early
    stopping and to pick the threshold, so metrics computed from the returned
    predictions are likely optimistic -- consider a separate validation split.

    Args:
        model: Compiled Keras model; it is trained in place.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        A list of Python ints (0 or 1), one per predicted score.
    """
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
    model.fit(X_train, y_train, epochs=25, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping], verbose=0)

    # Predicted scores for the test data.
    y_proba = model.predict(X_test)

    # Pick the cutoff that maximises accuracy on y_test.
    threshold, _ = threshold_search(y_test, y_proba)

    # Flatten before thresholding so the result is a flat list of ints, and
    # compare with ">" exactly as threshold_search does. The previous
    # per-element rounding used ">=", could yield 2 for a score of 1.0 when
    # the threshold was 0, and relied on int() of one-element arrays.
    return (np.asarray(y_proba).ravel() > threshold).astype(int).tolist()

In [None]:
        '''
        ('Logisti Regression', LogisticRegression(random_state=42)),
        ('Gaussian Naive Bayes', GaussianNB()),
        ('Rando Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('KNN', KNeighborsClassifier(n_neighbors=5)),'''

In [35]:
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

def compare_classifiers(X_train, X_test, y_train, y_test,method='tfidf'):
    """Train the registered classifiers and collect weighted precision/recall/F-score.

    Only the module-level Keras ``model`` is currently registered. It is
    skipped when ``method == 'tfidf'``, so in that case the returned dict is
    empty.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        method: Name of the feature extraction that produced the inputs.

    Returns:
        Dict mapping classifier name to
        ``{'precision': [...], 'recall': [...], 'fscore': [...]}``, each list
        holding the single value measured in this call.
    """
    candidates = [('Neural Network', model)]
    metric_keys = ('precision', 'recall', 'fscore')
    results = {}

    for label, estimator in candidates:
        is_network = label == 'Neural Network'
        if is_network and method == 'tfidf':
            # The network is not evaluated on TF-IDF features.
            continue

        if is_network:
            y_pred = train_nn(estimator, X_train, y_train, X_test, y_test)
        else:
            estimator.fit(X_train, y_train)
            y_pred = estimator.predict(X_test)

        # The fourth element returned (support) is not needed; zip stops
        # after the three metric keys.
        scores = precision_recall_fscore_support(y_test, y_pred, average='weighted')
        bucket = results.setdefault(label, {'precision': [], 'recall': [], 'fscore': []})
        for key, value in zip(metric_keys, scores):
            bucket[key].append(value)

    return results

# Initialize a dictionary to hold all results
def compare_seeds(train_test_sets, method='tfidf'):
    """Evaluate the classifiers on every split and print per-classifier averages.

    Each split's raw text is turned into features with ``tfidf_features``
    when ``method == 'tfidf'`` and with ``embeddings_features`` otherwise,
    then handed to ``compare_classifiers``. The metric lists from all splits
    are concatenated and their means printed.

    Args:
        train_test_sets: Iterable of ``(X_train, X_test, y_train, y_test)``
            tuples, one per seed.
        method: Feature extraction to use.

    Returns:
        None. Results are only printed.
    """
    aggregated = {}

    for seed, split in enumerate(train_test_sets):
        X_train, X_test, y_train, y_test = split
        print(f"---- Random Seed: {seed} ----")

        if method == 'tfidf':
            features = tfidf_features(X_train, X_test)
        else:
            features = embeddings_features(X_train, X_test)
        X_train, X_test = features

        per_seed = compare_classifiers(X_train, X_test, y_train, y_test, method)

        # Fold this seed's metric lists into the running totals.
        for clf_name, metrics in per_seed.items():
            totals = aggregated.setdefault(
                clf_name, {'precision': [], 'recall': [], 'fscore': []}
            )
            for metric_name, metric_values in metrics.items():
                totals[metric_name].extend(metric_values)

    # Report the mean of every metric across all seeds.
    for clf_name, totals in aggregated.items():
        print(f'----- {clf_name} Average Metrics -----')
        for metric_name, values in totals.items():
            print(f'{metric_name.capitalize()} Avg: {np.mean(values):.4f}')
        print('\n')

In [36]:
compare_seeds(train_test_sets,'embeddings')

---- Random Seed: 0 ----
---- Random Seed: 1 ----
---- Random Seed: 2 ----


KeyboardInterrupt: 

In [None]:
# Take the first split (seed 0) and replace its raw text with sentence
# embeddings; X_train / X_test hold embeddings from here on.
X_train, X_test, y_train, y_test = train_test_sets[0]
X_train, X_test = embeddings_features(X_train, X_test)

In [None]:
# Train the shared network on the current split and keep its predictions for X_test.
y_pred=train_nn(model,X_train, y_train, X_test, y_test)




[1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

def find_best_knn_parameters(X_train, y_train):
    """Grid-search KNN hyperparameters with 5-fold cross-validation and print the winner.

    The grid covers the number of neighbours, the weighting scheme and the
    distance metric.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        None. The best parameters and their cross-validated score are printed.
    """
    search_space = {
        'n_neighbors': [7, 9, 11, 13, 15, 20, 35],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    }

    search = GridSearchCV(KNeighborsClassifier(), search_space, cv=5)
    search.fit(X_train, y_train)

    print("Best Parameters: ", search.best_params_)
    print("Best Accuracy: ", search.best_score_)

# `x_train_tfidf` only ever existed as a local inside tfidf_features, so it
# has to be built here: vectorise the raw text of the first split (seed 0).
X_train_text, X_test_text, y_train, y_test = train_test_sets[0]
x_train_tfidf, x_test_tfidf = tfidf_features(X_train_text, X_test_text)
find_best_knn_parameters(x_train_tfidf, y_train)


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

def svm_grid_search(X_train, X_test, y_train, y_test, param_grid):
    """Grid-search SVM hyperparameters and score the best model on the test set.

    Args:
        X_train (array-like): Training data.
        X_test (array-like): Test data.
        y_train (array-like): Training labels.
        y_test (array-like): Test labels.
        param_grid (dict): Hyperparameter values to try in the grid.

    Returns:
        best_params (dict): Best hyperparameters found by the grid search.
        best_score (float): Mean cross-validated accuracy of those hyperparameters.
        test_accuracy (float): Accuracy of the best model on the test set.
        svm_model: SVM fitted on ``X_train`` with the best hyperparameters.
    """
    # 5-fold cross-validated search, scored by accuracy.
    grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # With the default refit=True, GridSearchCV has already retrained the
    # winning configuration on all of X_train, so reuse that estimator
    # instead of fitting a second, identical SVC.
    best_svm_model = grid_search.best_estimator_

    # Accuracy on the held-out test set.
    test_predictions = best_svm_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print("Best Parameters: ", grid_search.best_params_)
    print("Best Accuracy: ", grid_search.best_score_)

    return best_params, best_score, test_accuracy, best_svm_model

# `x_train_tfidf` / `x_test_tfidf` are not defined at module level, so build
# them here from the raw text of the first split (seed 0).
X_train_text, X_test_text, y_train, y_test = train_test_sets[0]
x_train_tfidf, x_test_tfidf = tfidf_features(X_train_text, X_test_text)
svm_grid_search(x_train_tfidf,  x_test_tfidf,y_train, y_test, {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10], 'kernel': ['linear', 'rbf']})



In [None]:
# `embeddings` and `X_test_embedding` were never defined. X_train / X_test
# already hold the sentence embeddings of the first split (computed in the
# cell that calls embeddings_features on train_test_sets[0]). The method is
# passed explicitly because the default 'tfidf' makes compare_classifiers
# skip the neural network and return an empty result.
compare_classifiers(X_train, X_test, y_train, y_test, method='embeddings')