In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import string

In [2]:
data = pd.read_csv('/content/drive/MyDrive/AmazonReviews/Reviews.csv')

In [3]:
# Sample half of the data
sampled_data = data.sample(n=10000, random_state=3)

In [4]:
data.shape

(568454, 10)

In [5]:
sampled_data.shape

(10000, 10)

In [6]:
train_data, test_data = train_test_split(sampled_data, test_size=0.2, random_state=3)

In [7]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
# Define preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])

    # Stemming (you can also consider lemmatization)
    stemmer = PorterStemmer()
    text = ' '.join([stemmer.stem(word) for word in text.split()])

    return text

In [9]:
sampled_data.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [10]:
# Apply preprocessing to train and test data
train_data['preprocessed_text'] = train_data['Text'].apply(preprocess_text)
test_data['preprocessed_text'] = test_data['Text'].apply(preprocess_text)

In [13]:
# Feature Extraction: BOW
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(train_data['preprocessed_text'])
X_test_bow = bow_vectorizer.transform(test_data['preprocessed_text'])

In [14]:
# Feature Extraction: n-grams
ngram_range = (1, 2)
ngram_vectorizer = CountVectorizer(ngram_range=ngram_range)
X_train_ngram = ngram_vectorizer.fit_transform(train_data['preprocessed_text'])
X_test_ngram = ngram_vectorizer.transform(test_data['preprocessed_text'])

In [15]:
# Feature Extraction: TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['preprocessed_text'])
X_test_tfidf = tfidf_vectorizer.transform(test_data['preprocessed_text'])

In [16]:
# Define models
lr_model = LogisticRegression(random_state=42)
svm_model = SVC(kernel='linear', random_state=42)


In [17]:
# Define hyperparameter grids for tuning
lr_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
svm_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

In [19]:
# GridSearchCV for hyperparameter tuning
lr_grid_search = GridSearchCV(lr_model, param_grid=lr_param_grid, cv=5, scoring='accuracy')
svm_grid_search = GridSearchCV(svm_model, param_grid=svm_param_grid, cv=5, scoring='accuracy')

In [20]:
# Fit models with hyperparameter tuning on BOW features
lr_grid_search.fit(X_train_bow, train_data['Score'])
svm_grid_search.fit(X_train_bow, train_data['Score'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [21]:
# Print best hyperparameters
print("Logistic Regression Best Hyperparameters for BOW:", lr_grid_search.best_params_)
print("Support Vector Machine Best Hyperparameters for BOW:", svm_grid_search.best_params_)


Logistic Regression Best Hyperparameters for BOW: {'C': 0.1}
Support Vector Machine Best Hyperparameters for BOW: {'C': 0.1}


In [22]:
# Predictions on the test set for BOW
y_pred_lr_bow = lr_grid_search.predict(X_test_bow)
y_pred_svm_bow = svm_grid_search.predict(X_test_bow)

In [23]:
# Model Evaluation for BOW
accuracy_lr_bow = accuracy_score(test_data['Score'], y_pred_lr_bow)
conf_matrix_lr_bow = confusion_matrix(test_data['Score'], y_pred_lr_bow)

In [24]:
# Model Evaluation for BOW
accuracy_svm_bow = accuracy_score(test_data['Score'], y_pred_svm_bow)
conf_matrix_svm_bow = confusion_matrix(test_data['Score'], y_pred_svm_bow)

In [25]:
print("\nLogistic Regression Accuracy for BOW:", accuracy_lr_bow)
print("Logistic Regression Confusion Matrix for BOW:\n", conf_matrix_lr_bow)


Logistic Regression Accuracy for BOW: 0.6705
Logistic Regression Confusion Matrix for BOW:
 [[  89    9    3    6   89]
 [  22    9   12   12   40]
 [  18   10   22   32   84]
 [   4    6   20   64  219]
 [  13    5   20   35 1157]]


In [26]:
print("\nSupport Vector Machine Accuracy for BOW:", accuracy_svm_bow)
print("Support Vector Machine Confusion Matrix for BOW:\n", conf_matrix_svm_bow)



Support Vector Machine Accuracy for BOW: 0.653
Support Vector Machine Confusion Matrix for BOW:
 [[ 101   15    4    6   70]
 [  25   14   12   11   33]
 [  23   17   22   20   84]
 [   7   11   20   49  226]
 [  22   17   35   36 1120]]


In [27]:
# Fit models with hyperparameter tuning on Tf-Dif features
lr_grid_search.fit(X_train_tfidf, train_data['Score'])
svm_grid_search.fit(X_train_tfidf, train_data['Score'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [28]:
# Print best hyperparameters
print("\nLogistic Regression Best Hyperparameters for n-grams:", lr_grid_search.best_params_)
print("Support Vector Machine Best Hyperparameters for n-grams:", svm_grid_search.best_params_)


Logistic Regression Best Hyperparameters for n-grams: {'C': 1}
Support Vector Machine Best Hyperparameters for n-grams: {'C': 1}


In [29]:
# Predictions on the test set for tdif
y_pred_lr_tfdif = lr_grid_search.predict(X_test_tfidf)
y_pred_svm_tfdif = svm_grid_search.predict(X_test_tfidf)

In [30]:
# Model Evaluation for tdif
accuracy_lr_tfdif = accuracy_score(test_data['Score'], y_pred_lr_tfdif)
conf_matrix_lr_tfdif = confusion_matrix(test_data['Score'], y_pred_lr_tfdif)

In [31]:
# Model Evaluation for tdif
accuracy_svm_tfdif = accuracy_score(test_data['Score'], y_pred_svm_tfdif)
conf_matrix_svm_tfdif = confusion_matrix(test_data['Score'], y_pred_svm_tfdif)

In [33]:
print("\nLogistic Regression Accuracy for TDIF:", accuracy_lr_tfdif)
print("Logistic Regression Confusion Matrix for TDIF:\n", conf_matrix_lr_tfdif)


Logistic Regression Accuracy for TDIF: 0.6655
Logistic Regression Confusion Matrix for TDIF:
 [[  69    4    0    4  119]
 [  13    2    4   13   63]
 [  11    0    7   29  119]
 [   1    1    7   44  260]
 [   4    0    4   13 1209]]


In [41]:
print("\nSVM Accuracy for TDIF:", accuracy_svm_tfdif)
print("SVM Confusion Matrix for TDIF:\n", conf_matrix_svm_tfdif)


SVM Accuracy for TDIF: 0.6765
SVM Confusion Matrix for TDIF:
 [[ 100    3    2    3   88]
 [  21    2    7    7   58]
 [  16    5   12   17  116]
 [   4    2    5   34  268]
 [   7    0    3   15 1205]]


In [38]:
# Fit models with hyperparameter tuning on n-gram features
lr_grid_search.fit(X_train_ngram, train_data['Score'])
svm_grid_search.fit(X_train_ngram, train_data['Score'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [39]:
# Print best hyperparameters
print("\nLogistic Regression Best Hyperparameters for n-grams:", lr_grid_search.best_params_)
print("Support Vector Machine Best Hyperparameters for n-grams:", svm_grid_search.best_params_)


Logistic Regression Best Hyperparameters for n-grams: {'C': 1}
Support Vector Machine Best Hyperparameters for n-grams: {'C': 0.1}


In [40]:
# Predictions on the test set for n-grams
y_pred_lr_ngram = lr_grid_search.predict(X_test_ngram)
y_pred_svm_ngram = svm_grid_search.predict(X_test_ngram)


In [42]:
# Model Evaluation for n-grams
accuracy_lr_ngram = accuracy_score(test_data['Score'], y_pred_lr_ngram)
conf_matrix_lr_ngram = confusion_matrix(test_data['Score'], y_pred_lr_ngram)

In [43]:
accuracy_svm_ngram = accuracy_score(test_data['Score'], y_pred_svm_ngram)
conf_matrix_svm_ngram = confusion_matrix(test_data['Score'], y_pred_svm_ngram)

In [46]:
print("\nLR Accuracy for ngram:", accuracy_lr_ngram)
print("LR Confusion Matrix for ngram:\n", conf_matrix_lr_ngram)


LR Accuracy for ngram: 0.6805
LR Confusion Matrix for ngram:
 [[  97   12    3    5   79]
 [  25   11   12   12   35]
 [  15    7   23   32   89]
 [   3    6   16   71  217]
 [  15    6   16   34 1159]]


In [45]:
print("\nSupport Vector Machine Accuracy for ngram:", accuracy_svm_ngram)
print("Support Vector Machine Confusion Matrix for ngram:\n", conf_matrix_svm_ngram)


Support Vector Machine Accuracy for ngram: 0.6635
Support Vector Machine Confusion Matrix for ngram:
 [[ 100   15    3    3   75]
 [  24   16   12    9   34]
 [  20   16   23   26   81]
 [   5    7   23   63  215]
 [  21    8   27   49 1125]]
