In [None]:
import pandas as pd
import numpy as np

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
import zipfile
import tarfile
import os

# Ruta de tu archivo zip
zip_path = 'amazon_review_polarity_csv.tgz.zip'
extracted_folder = '/content/Amazon Review Polarity'

# Primero, descomprimimos el archivo ZIP
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder)

# Luego, descomprimimos el archivo TAR.GZ que está dentro del ZIP
tgz_path = os.path.join(extracted_folder, 'amazon_review_polarity_csv.tgz')

with tarfile.open(tgz_path, 'r:gz') as tar_ref:
    tar_ref.extractall(extracted_folder)

print("Archivos descomprimidos exitosamente.")

Archivos descomprimidos exitosamente.


# Your mission: The Classifiers (I)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc  # Garbage collection. Memory Optimization

# Data Processing Libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Sklearn Preprocessing and Validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold

# Tensorflow and Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical

# Enable memory growth for GPU
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

class MemoryEfficientTextProcessor:
    def __init__(self):
        # Download NLTK resources quietly
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)

        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        # Simplified text cleaning to reduce memory usage
        try:
            # Ensure text is string and lowercase
            text = str(text).lower()

            # Remove special characters and digits
            text = re.sub(r'[^a-z\s]', '', text)

            # Tokenize and basic filtering
            tokens = [
                self.lemmatizer.lemmatize(token)
                for token in text.split()
                if token not in self.stop_words and len(token) > 2
            ]

            return ' '.join(tokens)
        except Exception as e:
            print(f"Error processing text: {e}")
            return ''

class AmazonReviewClassifier:
    def __init__(self, max_features=5000, sample_size=50000):
        self.processor = MemoryEfficientTextProcessor()
        self.sample_size = sample_size
        self.tfidf = TfidfVectorizer(
            max_features=max_features,
            stop_words='english'
        )
        self.label_encoder = LabelEncoder()

    def load_and_preprocess_data(self, train_path, test_path):
        """
        Load and preprocess data with memory efficiency

        Args:
            train_path (str): Path to training data
            test_path (str): Path to test data

        Returns:
            tuple: Processed training and test data
        """
        # Load and sample training data
        df_train = pd.read_csv(train_path, header=None,
                                names=("polarity", "title", "text"),
                                usecols=[0, 2])
        df_train = df_train.sample(self.sample_size, random_state=42)

        # Load and sample test data
        df_test = pd.read_csv(test_path, header=None,
                               names=("polarity", "title", "text"),
                               usecols=[0, 2])
        df_test = df_test.sample(self.sample_size // 5, random_state=42)

        # Clean text in batches to reduce memory usage
        def batch_clean_text(df, batch_size=5000):
            cleaned_texts = []
            for i in range(0, len(df), batch_size):
                batch = df['text'].iloc[i:i+batch_size]
                cleaned_batch = batch.apply(self.processor.clean_text)
                cleaned_texts.extend(cleaned_batch)
                # Force garbage collection
                gc.collect()
            return cleaned_texts

        # Clean texts
        X_train_texts = batch_clean_text(df_train)
        X_test_texts = batch_clean_text(df_test)

        # Encode labels
        y_train = self.label_encoder.fit_transform(df_train['polarity'])
        y_test = self.label_encoder.transform(df_test['polarity'])

        return X_train_texts, X_test_texts, y_train, y_test

    def vectorize_texts(self, train_texts, test_texts):
        """
        Vectorize texts using TF-IDF

        Args:
            train_texts (list): Training texts
            test_texts (list): Test texts

        Returns:
            tuple: Vectorized training and test texts
        """
        # Fit on training data and transform both
        X_train_vectorized = self.tfidf.fit_transform(train_texts)
        X_test_vectorized = self.tfidf.transform(test_texts)

        return X_train_vectorized, X_test_vectorized

class LightweightModels:
    @staticmethod
    def logistic_regression(X_train, X_test, y_train, y_test):
        """
        Train and evaluate Logistic Regression with memory optimization
        """
        # Use less memory-intensive solver
        lr_model = LogisticRegression(
            multi_class='ovr',
            max_iter=500,
            solver='saga',  # More memory-efficient
            n_jobs=-1,
            verbose=0
        )

        # Train
        lr_model.fit(X_train, y_train)

        # Predict
        y_pred = lr_model.predict(X_test)

        # Report
        print("\nLogistic Regression Results:")
        print(classification_report(y_test, y_pred))

        return lr_model, y_pred

    @staticmethod
    def lightweight_neural_network(X_train, X_test, y_train, y_test):
        """
        Train a lightweight neural network with memory optimizations
        """
        # Prepare data
        num_classes = len(np.unique(y_train))
        y_train_cat = to_categorical(y_train)
        y_test_cat = to_categorical(y_test)

        # Create lightweight model
        model = Sequential([
            Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
            Dropout(0.3),
            Dense(64, activation='relu'),
            Dropout(0.2),
            Dense(num_classes, activation='softmax')
        ])

        # Compile with efficient optimizer
        model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        # Train with memory-efficient callbacks
        history = model.fit(
            X_train, y_train_cat,
            validation_split=0.2,
            epochs=30,
            batch_size=64,  # Smaller batch size
            callbacks=[
                EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
            ],
            verbose=0
        )

        # Evaluate
        _, accuracy = model.evaluate(X_test, y_test_cat, verbose=0)  # Corrected this line
        y_pred = np.argmax(model.predict(X_test), axis=1)

        print("\nNeural Network Results:")
        print(classification_report(y_test, y_pred))

        return model, y_pred

def main():
    # Paths to your dataset
    train_path = "/content/Amazon Review Polarity/amazon_review_polarity_csv/train.csv"
    test_path = "/content/Amazon Review Polarity/amazon_review_polarity_csv/test.csv"

    # Initialize classifier
    classifier = AmazonReviewClassifier(
        max_features=5000,  # Reduced features
        sample_size=50000   # Smaller sample
    )

    # Load and preprocess data
    X_train_texts, X_test_texts, y_train, y_test = classifier.load_and_preprocess_data(
        train_path, test_path
    )

    # Vectorize texts
    X_train_vectorized, X_test_vectorized = classifier.vectorize_texts(
        X_train_texts, X_test_texts
    )

    # Convert to dense array with lower precision
    X_train_dense = X_train_vectorized.toarray().astype(np.float32)
    X_test_dense = X_test_vectorized.toarray().astype(np.float32)

    # Train and evaluate models
    print("\n--- Logistic Regression ---")
    lr_model, lr_pred = LightweightModels.logistic_regression(
        X_train_dense, X_test_dense, y_train, y_test
    )

    print("\n--- Lightweight Neural Network ---")
    nn_model, nn_pred = LightweightModels.lightweight_neural_network(
        X_train_dense, X_test_dense, y_train, y_test
    )

    # Clear memory
    del X_train_texts, X_test_texts, X_train_vectorized, X_test_vectorized
    gc.collect()

if __name__ == "__main__":
    main()


--- Logistic Regression ---





Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.85      0.84      0.84      4972
           1       0.84      0.85      0.85      5028

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000


--- Lightweight Neural Network ---


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

Neural Network Results:
              precision    recall  f1-score   support

           0       0.83      0.85      0.84      4972
           1       0.85      0.83      0.84      5028

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000

