# DATASET SELECCIONADO: Yellow_Submarine.csv
---

In [1]:
#Importar Librerias
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pandas as pd

# Read the dataset from the CSV file that sits next to the notebook.
file_path = "Yellow_Submarine.csv"
dataset = pd.read_csv(filepath_or_buffer=file_path)

# The summary table (count / unique / top / freq per column) is this cell's output.
dataset.describe()



Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,edible,convex,scaly,brown,no,none,free,close,broad,buff,...,smooth,white,white,partial,white,one,pendant,white,several,woods
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [None]:
# Print the column names, non-null counts and dtypes of the loaded DataFrame.
dataset.info()

In [2]:
# Inspect the text-typed (object) columns: non-null counts and dtypes.
dataset.select_dtypes('object').info()

# Remove repeated rows and rows with any missing value.
# drop_duplicates() keeps the first occurrence of each repeated row; the earlier
# keep=False discarded *every* copy, silently throwing away valid samples.
# Rebinding the name (instead of inplace=True) and taking a copy gives an
# independent frame, so the column assignment below acts on a real DataFrame.
dataset = dataset.drop_duplicates().dropna(how='any').copy()

# Encode the target column ('class') as integer codes.
label_encoder = LabelEncoder()
dataset['class'] = label_encoder.fit_transform(dataset['class'])

# One-hot encode every remaining object-typed predictor column.
# drop_first=True omits the first dummy level of each feature.
categorical_features = dataset.drop(columns=['class']).select_dtypes('object').columns
dataset_encoded = pd.get_dummies(dataset, columns=categorical_features, drop_first=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [3]:
# Number of top-ranked features kept for the downstream models.
# NOTE(review): the earlier comments announced "the 10 most important features"
# while the code has always selected 2. The value 2 is preserved so recorded
# results do not change silently; raise it to 10 if that was the real intent.
N_TOP_FEATURES = 2

# Split the encoded frame into predictors (X) and target (y).
X = dataset_encoded.drop(columns=['class'])
y = dataset_encoded['class']

# Fit a random forest purely to rank the predictors by importance.
# NOTE(review): the ranking is fitted on all rows, before the train/test split
# performed in the next cell, so rows later used for testing influence which
# features are selected.
model = RandomForestClassifier(
    n_estimators=100,      # number of trees
    max_depth=10,          # maximum depth of each tree
    min_samples_split=5,   # minimum samples required to split a node
    min_samples_leaf=2,    # minimum samples required in a leaf
    random_state=42,
)
model.fit(X, y)

# Rank the columns by importance and keep the names of the strongest ones.
importances = pd.Series(model.feature_importances_, index=X.columns)
top_features = importances.nlargest(N_TOP_FEATURES).index

# Reduced predictor matrix containing only the selected columns.
# NOTE(review): with N_TOP_FEATURES = 2 and one-hot (dummy) columns, the models
# can only distinguish a handful of input patterns -- presumably why every
# classifier downstream reports identical metrics; confirm this is intended.
X_reduced = X[top_features]

print(top_features)




Index(['odor_none', 'gill-size_narrow'], dtype='object')


In [4]:
# 5. Hold out 30% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced,
    y,
    test_size=0.3,
    random_state=42,
)

# 6. Candidate classifiers, keyed by the column label used in the results table.
forest_settings = dict(
    n_estimators=150,      # number of trees
    max_depth=10,          # maximum depth of each tree
    min_samples_split=5,   # minimum samples required to split a node
    min_samples_leaf=2,    # minimum samples required in a leaf
    random_state=42,
)
models = {
    'Random Forest': RandomForestClassifier(**forest_settings),
    'Logistic Regression': LogisticRegression(max_iter=500),
    'SVM': SVC(),
}

# Fit each classifier on the training split and store its full report
# (as a nested dict) computed on the held-out split.
results = {}
for model_name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    results[model_name] = classification_report(y_test, y_pred, output_dict=True)


def _positive_class_summary(report):
    """Pick the label-'1' precision/recall/F1 and the overall accuracy from a report dict."""
    positive = report['1']
    return {
        'Precision': positive['precision'],
        'Recall': positive['recall'],
        'F1-Score': positive['f1-score'],
        'Accuracy': report['accuracy'],
    }


# One column per model, one row per metric.
results_df = pd.DataFrame({
    model_name: _positive_class_summary(report)
    for model_name, report in results.items()
})



5) Resultados Obtenidos

In [5]:
# Print the metrics table (one row per metric, one column per model) as plain text.
print(results_df)

           Random Forest  Logistic Regression       SVM
Precision       0.812811             0.812811  0.812811
Recall          0.966977             0.966977  0.966977
F1-Score        0.883217             0.883217  0.883217
Accuracy        0.876128             0.876128  0.876128


In [37]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Read the CSV again into a separate frame used by the text-based experiment.
file_path = "Yellow_Submarine.csv"
data = pd.read_csv(file_path)

# Build one space-separated pseudo-sentence per row from four categorical columns.
text_columns = ["cap-shape", "cap-surface", "cap-color", "odor"]
text_parts = data[text_columns].astype(str)
data["text"] = text_parts.agg(' '.join, axis=1)

# Binary target: 1 when the class value is exactly "poisson", 0 for anything else
# (including "edible").
is_positive = data["class"] == "poisson"
data["label"] = is_positive.astype("int64")

# Manually maintained set of English stop words.
stop_words = set(
    """
    i me my myself we our ours ourselves you your yours yourself yourselves
    he him his himself she her hers herself it its itself they them their
    theirs themselves what which who whom this that these those am is are was
    were be been being have has had having do does did doing a an the and
    but if or because as until while of at by for with about against between
    into through during before after above below to from up down in out on
    off over under again further then once here there when where why how all
    any both each few more most other some such no nor not only own same
    so than too very s t can will just don should now
    """.split()
)


def preprocess_text(text):
    """Lower-case ``text``, split it on whitespace and drop unwanted tokens.

    A token survives only if it is purely alphanumeric and is not listed in
    ``stop_words``. The surviving tokens are returned joined by single spaces.
    """
    kept = []
    for token in text.lower().split():
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation and batch shuffling are repeatable between runs
# (some GPU kernels may still be non-deterministic).
tf.keras.utils.set_random_seed(42)

# Clean every pseudo-sentence; on failure, report the error and re-raise it.
try:
    data["processed_text"] = data["text"].apply(preprocess_text)
except Exception as e:
    print(f"Error procesando texto: {e}")
    raise

# NLP task 2: tokenisation for the deep-learning model.
# The tokenizer learns the vocabulary and maps each word to an integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data["processed_text"])

sequences = tokenizer.texts_to_sequences(data["processed_text"])
word_index = tokenizer.word_index

# Pad every sequence to the length of the longest one.
max_length = max(len(seq) for seq in sequences)
data_padded = pad_sequences(sequences, maxlen=max_length)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(data_padded, data["label"], test_size=0.2, random_state=42)

# Simple network: embedding -> LSTM -> single sigmoid unit.
# input_dim is the vocabulary size plus one because word ids start at 1 and
# id 0 is the padding value. The `input_length` argument was removed: it is
# deprecated in Keras 3 and the layer infers the sequence length from its input.
model = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=64),
    LSTM(64, return_sequences=False),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the network.
# NOTE(review): the held-out split is also passed as validation_data, so the
# final evaluation below reports on data already monitored during training.
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

# Evaluate on the held-out split; the value printed as "Precisión" is the
# accuracy metric configured in compile().
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Pérdida: {loss}, Precisión: {accuracy}")



Epoch 1/5




[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9337 - loss: 0.3740 - val_accuracy: 0.9846 - val_loss: 0.0521
Epoch 2/5
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9840 - loss: 0.0521 - val_accuracy: 0.9846 - val_loss: 0.0451
Epoch 3/5
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9857 - loss: 0.0426 - val_accuracy: 0.9846 - val_loss: 0.0387
Epoch 4/5
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9867 - loss: 0.0355 - val_accuracy: 0.9871 - val_loss: 0.0338
Epoch 5/5
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9865 - loss: 0.0352 - val_accuracy: 0.9865 - val_loss: 0.0352
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9887 - loss: 0.0256
Pérdida: 0.03518444299697876, Precisión: 0.9864615201950073
