In [None]:
# Mount Google Drive at /content/drive so the dataset archive stored there can
# be read by later cells. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import zipfile
import tarfile
import pandas as pd

# Location of the dataset archive on the mounted Drive (a .tgz wrapped in a .zip).
file_path = '/content/drive/MyDrive/amazon_review_polarity_csv.tgz.zip'

# Scratch directory for the extracted files; exist_ok keeps re-runs of this
# cell from failing (and avoids the check-then-create race).
temp_dir = '/content/temp_data'
os.makedirs(temp_dir, exist_ok=True)

# Step 1: unpack the outer ZIP into the scratch directory.
print("Descomprimiendo archivo ZIP...")
with zipfile.ZipFile(file_path, 'r') as zip_ref:
    zip_ref.extractall(temp_dir)

# Step 2: locate the inner .tgz. os.listdir() order is arbitrary, so sort the
# names to make the choice deterministic if more than one archive is present.
tgz_file = next(
    (
        os.path.join(temp_dir, name)
        for name in sorted(os.listdir(temp_dir))
        if name.endswith('.tgz')
    ),
    None,
)

if tgz_file:
    print(f"Descomprimiendo archivo TGZ: {tgz_file}")
    with tarfile.open(tgz_file, 'r:gz') as tar:
        # Use the 'data' extraction filter when the runtime provides it: it
        # rejects absolute paths / links escaping the target directory and
        # silences the Python 3.12+ deprecation warning. Older interpreters
        # without the feature fall back to the legacy behaviour.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(temp_dir, filter='data')
        else:
            tar.extractall(temp_dir)
else:
    # Previously this case was silent, which made a later "file not found"
    # on train.csv hard to diagnose.
    print(f"Advertencia: no se encontró ningún archivo .tgz en {temp_dir}")

# Step 3: collect every CSV under the scratch directory (sorted so the printed
# indices are stable between runs).
csv_files = sorted(
    os.path.join(root, name)
    for root, _dirs, files in os.walk(temp_dir)
    for name in files
    if name.endswith('.csv')
)

if csv_files:
    print("\nArchivos CSV encontrados:")
    for i, csv_path in enumerate(csv_files):
        print(f"{i}: {csv_path}")
else:
    print(f"\nNo se encontraron archivos CSV en {temp_dir}")

Descomprimiendo archivo ZIP...
Descomprimiendo archivo TGZ: /content/temp_data/amazon_review_polarity_csv.tgz

Archivos CSV encontrados:
0: /content/temp_data/amazon_review_polarity_csv/test.csv
1: /content/temp_data/amazon_review_polarity_csv/train.csv


In [None]:
# Read the complete train and test CSVs without a header row (header=None, so
# the columns get integer labels). No row limit is applied here.
# NOTE(review): df1/df2 are not referenced by the other cells visible in this
# notebook — confirm they are still needed before paying the load cost.
df1 = pd.read_csv('/content/temp_data/amazon_review_polarity_csv/train.csv', header=None)
df2 = pd.read_csv('/content/temp_data/amazon_review_polarity_csv/test.csv', header=None)

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import seaborn as sns

class AmazonReviewAnalyzer:
    """Compare two sentiment classifiers on the Amazon review polarity CSVs.

    A TF-IDF + logistic-regression baseline and a small embedding-based
    neural network are trained and evaluated with shuffled k-fold
    cross-validation; a classification report is printed for each model
    on every fold.
    """

    def __init__(self, max_features=10000, maxlen=200, n_splits=5):
        """Configure the feature extractors and cross-validation settings.

        Args:
            max_features: Vocabulary cap shared by the TF-IDF vectorizer, the
                Keras tokenizer and the embedding layer's input dimension.
            maxlen: Length every token sequence is padded/truncated to.
            n_splits: Number of cross-validation folds.
        """
        self.max_features = max_features
        self.tfidf = TfidfVectorizer(max_features=max_features)
        self.tokenizer = Tokenizer(num_words=max_features)
        self.maxlen = maxlen
        self.n_splits = n_splits
        # Filled by cross_validate_models(): one dict per fold.
        self.cv_results_ = []

    def load_and_preprocess_data(self, temp_dir, train_size=48000, test_size=12000):
        """Load and preprocess the Amazon reviews dataset.

        Args:
            temp_dir: Directory containing ``amazon_review_polarity_csv/``
                with ``train.csv`` and ``test.csv`` inside it.
            train_size: Maximum number of rows read from ``train.csv``.
            test_size: Maximum number of rows read from ``test.csv``.

        Returns:
            ``(train_data, test_data)`` DataFrames with the columns
            ``sentiment`` (0 = label 1, 1 = label 2), ``title``, ``review``,
            ``full_text`` and ``processed_text``.

        Raises:
            ValueError: If a sentiment label other than 1 or 2 is found.
        """
        train_data = self._load_split(
            f"{temp_dir}/amazon_review_polarity_csv/train.csv", train_size)
        test_data = self._load_split(
            f"{temp_dir}/amazon_review_polarity_csv/test.csv", test_size)
        return train_data, test_data

    def _load_split(self, csv_path, nrows):
        """Read one headerless CSV split and add the derived text/label columns."""
        columns = ["sentiment", "title", "review"]
        # Force the text columns to str so purely numeric titles/reviews are
        # not parsed as numbers (which would break the concatenation below).
        frame = pd.read_csv(csv_path, names=columns, nrows=nrows,
                            dtype={"title": str, "review": str})

        # Fill BOTH columns: a missing review used to turn the whole
        # concatenation into NaN, silently discarding the title as well.
        frame['full_text'] = frame['title'].fillna('') + ' ' + frame['review'].fillna('')
        frame['processed_text'] = frame['full_text'].apply(self._preprocess_text)

        # Convert labels (2 -> 1, 1 -> 0); anything else would become NaN and
        # only fail much later inside model fitting, so reject it here.
        labels = frame['sentiment'].map({2: 1, 1: 0})
        if labels.isna().any():
            unexpected = frame.loc[labels.isna(), 'sentiment'].unique().tolist()
            raise ValueError(
                f"Unexpected sentiment labels in {csv_path}: {unexpected}")
        frame['sentiment'] = labels
        return frame

    def _preprocess_text(self, text):
        """Lower-case ``text``, drop every character that is not an ASCII
        letter or whitespace, collapse whitespace runs and strip the ends.

        Missing values (NaN/None) yield an empty string.
        """
        if pd.isna(text):
            return ""
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def build_dnn_model(self):
        """Create and compile the DNN model.

        Returns:
            A compiled ``tf.keras.Sequential`` binary classifier:
            embedding -> global average pooling -> two dense layers with
            dropout in between -> sigmoid output.
        """
        model = tf.keras.Sequential([
            # input_dim follows the tokenizer's num_words so the two cannot
            # drift apart. The deprecated `input_length` argument is omitted;
            # the sequence length is inferred from the padded input.
            tf.keras.layers.Embedding(self.max_features, 128),
            tf.keras.layers.GlobalAveragePooling1D(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])

        model.compile(optimizer='adam',
                     loss='binary_crossentropy',
                     metrics=['accuracy'])
        return model

    def cross_validate_models(self, data):
        """Perform k-fold cross-validation on both models.

        Args:
            data: DataFrame with ``processed_text`` and ``sentiment`` columns
                (as produced by ``load_and_preprocess_data``).

        Prints a classification report per model and fold. Per-fold
        accuracies are additionally stored in ``self.cv_results_`` as dicts
        with keys ``fold``, ``lr_accuracy`` and ``dnn_accuracy``.
        """
        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        self.cv_results_ = []

        print("\n" + "="*80)
        print(" "*30 + "K-FOLD CROSS VALIDATION RESULTS")
        print("="*80 + "\n")

        for fold, (train_idx, val_idx) in enumerate(kf.split(data)):
            print(f"\nFold {fold + 1}/{self.n_splits}")
            print("="*80)

            # Split data
            X_train = data['processed_text'].iloc[train_idx]
            y_train = data['sentiment'].iloc[train_idx]
            X_val = data['processed_text'].iloc[val_idx]
            y_val = data['sentiment'].iloc[val_idx]

            lr_pred = self._fit_predict_logistic_regression(X_train, y_train, X_val)
            print("\n" + "-"*30 + " LOGISTIC REGRESSION CLASSIFICATION " + "-"*30)
            print(classification_report(y_val, lr_pred))

            dnn_pred = self._fit_predict_dnn(X_train, y_train, X_val)
            print("\n" + "-"*30 + " DEEP NEURAL NETWORK CLASSIFICATION " + "-"*30)
            print(classification_report(y_val, dnn_pred))

            self.cv_results_.append({
                "fold": fold + 1,
                "lr_accuracy": accuracy_score(y_val, lr_pred),
                "dnn_accuracy": accuracy_score(y_val, dnn_pred),
            })

    def _fit_predict_logistic_regression(self, X_train, y_train, X_val):
        """Fit TF-IDF + logistic regression on the training fold; predict the validation fold."""
        print("\nTraining Logistic Regression...")
        # fit_transform re-learns the vocabulary from scratch on each call,
        # so reusing self.tfidf across folds does not leak information.
        X_train_tfidf = self.tfidf.fit_transform(X_train)
        X_val_tfidf = self.tfidf.transform(X_val)

        lr_model = LogisticRegression(max_iter=1000)
        lr_model.fit(X_train_tfidf, y_train)
        return lr_model.predict(X_val_tfidf)

    def _fit_predict_dnn(self, X_train, y_train, X_val):
        """Fit the embedding network on the training fold; predict the validation fold."""
        print("\nTraining DNN...")
        # Tokenizer.fit_on_texts ACCUMULATES word counts across calls, so a
        # shared tokenizer would carry vocabulary statistics from earlier
        # folds (which contain this fold's validation rows). Start fresh.
        self.tokenizer = Tokenizer(num_words=self.max_features)
        self.tokenizer.fit_on_texts(X_train)
        X_train_seq = self.tokenizer.texts_to_sequences(X_train)
        X_val_seq = self.tokenizer.texts_to_sequences(X_val)

        X_train_pad = pad_sequences(X_train_seq, maxlen=self.maxlen)
        X_val_pad = pad_sequences(X_val_seq, maxlen=self.maxlen)

        dnn_model = self.build_dnn_model()
        dnn_model.fit(X_train_pad, y_train,
                    epochs=3,
                    batch_size=32,
                    verbose=1)

        # predict() returns shape (n, 1); flatten to the 1-D label vector
        # that the sklearn metric functions expect.
        return (dnn_model.predict(X_val_pad) > 0.5).astype(int).ravel()

# Instantiate the analyzer with its default settings.
analyzer = AmazonReviewAnalyzer()
# Load the row-capped train/test subsets from the directory extracted earlier.
train_data, test_data = analyzer.load_and_preprocess_data(temp_dir=temp_dir)
# Cross-validate on the training subset only; test_data is not used here.
analyzer.cross_validate_models(data=train_data)


                              K-FOLD CROSS VALIDATION RESULTS


Fold 1/5

Training Logistic Regression...

------------------------------ LOGISTIC REGRESSION CLASSIFICATION ------------------------------
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      4621
           1       0.89      0.89      0.89      4979

    accuracy                           0.88      9600
   macro avg       0.88      0.88      0.88      9600
weighted avg       0.88      0.88      0.88      9600


Training DNN...
Epoch 1/3




[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.6396 - loss: 0.5931
Epoch 2/3
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8686 - loss: 0.3144
Epoch 3/3
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.8949 - loss: 0.2604
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step

------------------------------ DEEP NEURAL NETWORK CLASSIFICATION ------------------------------
              precision    recall  f1-score   support

           0       0.87      0.90      0.88      4621
           1       0.90      0.87      0.89      4979

    accuracy                           0.88      9600
   macro avg       0.88      0.89      0.88      9600
weighted avg       0.89      0.88      0.88      9600


Fold 2/5

Training Logistic Regression...

------------------------------ LOGISTIC REGRESSION CLASSIFICATION ------------------------------
      



[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.6213 - loss: 0.6111
Epoch 2/3
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8666 - loss: 0.3175
Epoch 3/3
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8902 - loss: 0.2710
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step

------------------------------ DEEP NEURAL NETWORK CLASSIFICATION ------------------------------
              precision    recall  f1-score   support

           0       0.92      0.84      0.87      4733
           1       0.85      0.93      0.89      4867

    accuracy                           0.88      9600
   macro avg       0.88      0.88      0.88      9600
weighted avg       0.88      0.88      0.88      9600


Fold 3/5

Training Logistic Regression...

------------------------------ LOGISTIC REGRESSION CLASSIFICATION ------------------------------
      



[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.6602 - loss: 0.5763
Epoch 2/3
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8769 - loss: 0.3004
Epoch 3/3
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8944 - loss: 0.2621
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

------------------------------ DEEP NEURAL NETWORK CLASSIFICATION ------------------------------
              precision    recall  f1-score   support

           0       0.84      0.93      0.88      4741
           1       0.92      0.83      0.87      4859

    accuracy                           0.88      9600
   macro avg       0.88      0.88      0.88      9600
weighted avg       0.88      0.88      0.88      9600


Fold 4/5

Training Logistic Regression...

------------------------------ LOGISTIC REGRESSION CLASSIFICATION ------------------------------
      



[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.6326 - loss: 0.6039
Epoch 2/3
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8748 - loss: 0.3041
Epoch 3/3
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8930 - loss: 0.2642
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step

------------------------------ DEEP NEURAL NETWORK CLASSIFICATION ------------------------------
              precision    recall  f1-score   support

           0       0.91      0.84      0.88      4734
           1       0.86      0.92      0.89      4866

    accuracy                           0.88      9600
   macro avg       0.89      0.88      0.88      9600
weighted avg       0.89      0.88      0.88      9600


Fold 5/5

Training Logistic Regression...

------------------------------ LOGISTIC REGRESSION CLASSIFICATION ------------------------------
      



[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.6328 - loss: 0.6012
Epoch 2/3
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8656 - loss: 0.3200
Epoch 3/3
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8892 - loss: 0.2769
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

------------------------------ DEEP NEURAL NETWORK CLASSIFICATION ------------------------------
              precision    recall  f1-score   support

           0       0.91      0.86      0.88      4745
           1       0.87      0.92      0.89      4855

    accuracy                           0.89      9600
   macro avg       0.89      0.89      0.89      9600
weighted avg       0.89      0.89      0.89      9600

