## Natural Language Processing

## 1. Reading the dataset

In [None]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whatever `pip` is first on the shell PATH.
%pip install --upgrade pip
%pip install "scikit-learn<1.7,>=1.2"

In [None]:
import pandas as pd
# Load the review dataset; the path is relative to the current working directory
dataset_path = "AMAZON-REVIEW-DATA-CLASSIFICATION.csv"
df = pd.read_csv(dataset_path)
print('The shape of the dataset is:', df.shape)

In [None]:
# Preview the first five rows of the dataset
df.head(5)

In [None]:
# Do not truncate long cell contents, so complete review texts are visible.
# (The former bare `pd.options.display.max_rows` expression was removed: it
# only read the option and discarded the value, so it had no effect.)
pd.set_option('display.max_colwidth', None)
df.head()

In [None]:
# Inspect the row with index label 580. A bare last-line expression is used
# instead of print() so the notebook renders the frame with its rich display.
df.loc[[580]]

In [None]:
# Data type of every column
df.dtypes

## 2. Exploratory Data Analysis

In [None]:
# Number of rows per value of the target column
df['isPositive'].value_counts()

In [None]:
# Swap the class labels (0 <-> 1) in the target column ONLY.
# A frame-wide df.replace({0: 1, 1: 0}) would also rewrite every 0 and 1 found
# in the feature columns (e.g. numeric values equal to 0 or 1), corrupting them.
# NOTE: this cell is not idempotent - running it a second time swaps the
# labels back.
df['isPositive'] = df['isPositive'].replace({0: 1, 1: 0})
df['isPositive'].value_counts()

In [None]:
# Count of missing values in each column
df.isna().sum()

## 3. Text Processing: Stop words removal and stemming

In [None]:
# Download the NLTK resources requested below: the 'punkt' and 'punkt_tab'
# tokenizer data and the 'stopwords' corpus. Nothing is installed here;
# the nltk package itself must already be available.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Make sure the NLTK resources used in this cell are available locally
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# NLTK's English stop-word list
stop = stopwords.words('english')

# Words we want to keep in the text even though they may be listed as stop
# words (negations and auxiliary verbs)
excluding = [
    'against', 'not', 'don', "don't", 'ain', 'are', "aren't", 'could', "couldn't",
    'did', "didn't", 'does', "doesn't", 'had', "hadn't", 'has', "hasn't",
    'have', "haven't", 'is', "isn't", 'might', "mightn't", 'must', "mustn't",
    'need', "needn't", 'should', "shouldn't", 'was', "wasn't", 'were', "weren't",
    "won't", 'would', "wouldn't"
]

# Stop words that will actually be removed: the NLTK list minus the kept words
stop_words = list(filter(lambda word: word not in excluding, stop))

# English Snowball stemmer used to reduce tokens to their stems
snow = SnowballStemmer('english')

# Function to clean text
def process_text(texts):
    """Clean, tokenize, filter and stem every text in ``texts``.

    Each entry is lower-cased, has runs of whitespace collapsed and HTML-like
    tags (``<...>``) removed, and is then tokenized with ``word_tokenize``.
    Tokens that are numeric, shorter than three characters, or contained in
    the module-level ``stop_words`` are dropped; the remaining tokens are
    stemmed with the module-level ``snow`` stemmer and joined with single
    spaces.

    Parameters
    ----------
    texts : iterable
        Raw texts. Entries that are not ``str`` (e.g. NaN for missing values)
        are treated as empty strings.

    Returns
    -------
    list of str
        One cleaned string per input entry, in the same order.
    """
    # Compile the patterns once instead of once per sentence.
    whitespace_pattern = re.compile(r'\s+')
    tag_pattern = re.compile(r'<.*?>')
    # Set membership is O(1); the original list lookup was O(len(stop_words))
    # for every token.
    removable = set(stop_words)

    final_text_list = []

    for sent in texts:
        # Handle missing values
        if not isinstance(sent, str):
            sent = ''

        # Basic cleaning. Whitespace is collapsed before tag removal so that a
        # tag spanning a line break is still matched by the non-DOTALL pattern.
        sent = sent.lower().strip()
        sent = whitespace_pattern.sub(' ', sent)
        # Replace tags with a space (not the empty string) so the words on
        # either side do not fuse, e.g. "good<br/>bad" -> "good bad".
        sent = tag_pattern.sub(' ', sent)

        # Tokenization, filtering and stemming
        filtered_sentence = [
            snow.stem(w)
            for w in word_tokenize(sent)
            if (not w.isnumeric()) and (len(w) > 2) and (w not in removable)
        ]

        # Join tokens back into a string
        final_text_list.append(" ".join(filtered_sentence))

    return final_text_list


## 4. Training, Validation, and Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Model inputs and target taken from the full dataset
feature_frame = df[['reviewText', 'summary', 'time', 'log_votes']]
label_series = df['isPositive']

# Stage 1: 80% of the rows go to training; the remaining 20% form a hold-out
# pool that is divided again below
X_train, X_holdout, y_train, y_holdout = train_test_split(
    feature_frame,
    label_series,
    test_size=0.20,
    shuffle=True,
    random_state=324
)

# Stage 2: halve the hold-out pool into equally sized validation and test sets
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=0.5,
    shuffle=True,
    random_state=324
)


In [None]:
# The three splits receive exactly the same text cleaning
splits = (X_train, X_val, X_test)

print('Processing the reviewText fields')
for split in splits:
    split['reviewText'] = process_text(split['reviewText'].tolist())

print('Processing the summary fields')
for split in splits:
    split['summary'] = process_text(split['summary'].tolist())


## 5. Data processing with Pipeline and ColumnTransformer

In [None]:
# Grab model features/inputs and target/output
# Name of the label column
model_target = 'isPositive'

# Input columns, grouped by the kind of preprocessing they receive
numerical_features = ['time', 'log_votes']
text_features = ['summary', 'reviewText']

# All model inputs: numerical columns first, then the text columns
model_features = [*numerical_features, *text_features]

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

### COLUMN TRANSFORMER ###
##########################

# Numerical columns: impute missing values with the column mean, then rescale
# with MinMaxScaler
numerical_steps = [
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('num_scaler', MinMaxScaler()),
]
numerical_processor = Pipeline(numerical_steps)

# First text column (summary): binary bag-of-words capped at 50 terms
summary_vectorizer = CountVectorizer(binary=True, max_features=50)
text_processor_0 = Pipeline([('text_vect_0', summary_vectorizer)])

# Second text column (reviewText): binary bag-of-words capped at 150 terms
review_vectorizer = CountVectorizer(binary=True, max_features=150)
text_processor_1 = Pipeline([('text_vect_1', review_vectorizer)])

# Route each group of columns to its own processor.
# Every entry is (name, pipeline, column(s) it is applied to).
column_routes = [
    ('numerical_pre', numerical_processor, numerical_features),
    ('text_pre_0', text_processor_0, text_features[0]),
    ('text_pre_1', text_processor_1, text_features[1]),
]
data_preprocessor = ColumnTransformer(column_routes)

### DATA PREPROCESSING ###
##########################

def _to_dense(matrix):
    """Return ``matrix`` as a dense array.

    ColumnTransformer returns a scipy sparse matrix only when the combined
    output is sparse enough (see its ``sparse_threshold`` parameter) and a
    plain ndarray otherwise. Calling ``.toarray()`` unconditionally would
    raise AttributeError in the dense case, so only convert when needed.
    """
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


print(
    'Datasets shapes before processing: ',
    X_train.shape,
    X_val.shape,
    X_test.shape
)

# Fit on the training set only, then apply the fitted transformer to the
# validation and test sets.
# NOTE: the X_* names are rebound from DataFrames to arrays here, so this cell
# cannot be re-run without first re-running the split and text-cleaning cells.
X_train = _to_dense(data_preprocessor.fit_transform(X_train))
X_val = _to_dense(data_preprocessor.transform(X_val))
X_test = _to_dense(data_preprocessor.transform(X_test))

print(
    'Datasets shapes after processing: ',
    X_train.shape,
    X_val.shape,
    X_test.shape
)


## 6. Train a classifier and build algorithm

In [None]:
# Use the %pip magic (not !pip) so xgboost is installed into the environment
# of the running kernel.
%pip install xgboost

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Hyperparameters of the XGBoost classifier
xgb_params = dict(
    objective='binary:logistic',  # binary classification
    n_estimators=100,             # number of boosting rounds
    max_depth=5,                  # maximum tree depth
    learning_rate=0.1,            # step size shrinkage
    subsample=0.8,                # fraction of samples per tree
    colsample_bytree=0.8,         # fraction of features per tree
    random_state=324,             # for reproducibility
)

# NOTE: despite its name, `linear_classifier` holds an XGBoost classifier
linear_classifier = xgb.XGBClassifier(**xgb_params)

# Train on the training split
train_inputs = X_train.astype('float32')
train_labels = y_train.values.astype('float32')
linear_classifier.fit(train_inputs, train_labels)

# Predict labels for the validation and test splits
y_val_pred = linear_classifier.predict(X_val.astype('float32'))
y_test_pred = linear_classifier.predict(X_test.astype('float32'))

# Score the predictions against the true labels
val_acc = accuracy_score(y_val, y_val_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print("Validation accuracy:", val_acc)
print("Test accuracy:", test_acc)
