In [None]:
# https://medium.com/@robdelacruz/sentiment-analysis-using-natural-language-processing-nlp-3c12b77a73ec prefer

In [None]:
# The Amazon reviews dataset is constructed by taking review scores 1 and 2 as negative and 4 and 5
# as positive. Samples of score 3 is ignored. In the dataset, class 1 is the negative, and class
#  2 is the positive. Each class has 1,800,000 training samples and 200,000 testing samples.

In [None]:
# The Amazon reviews dataset is constructed by taking review scores 1 and 2 as negative and 4 and
# 5 as positive. Samples of score 3 is ignored. In the dataset, class 1 is the negative, and class 2 is the positive. Each class
#  has 1,800,000 training samples and 200,000 testing samples.

In [None]:
# Preprocessing
# To prepare our data for model training, we need to convert our text data into features that our model will use to train and cast future predictions. We’ll use two preprocessing steps:

# Count vectorizing text
# tf-idf

In [None]:
# The purpose of using tf-idf instead of simply counting the frequency of a token in a document is to reduce the
# influence of tokens that appear very frequently in a given collection of documents. These tokens are less informative
#  than those appearing in only a small fraction of the corpus. Scaling down the impact of these frequently occurring tokens helps improve
# text-based machine-learning models’ accuracy.

In [None]:
# Classification algorithm
# For this project, we will use the logistic regression algorithm to discriminate between positive and negative reviews.
#  Logistic regression is a statistical method used for binary classification, which means it’s designed to predict the
#   probability of a categorical outcome
#  with two possible values.

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import pandas as pd
import re

data = pd.read_csv('/content/amazon_data_review.csv',names=['sentiment', 'title', 'review'])

# Access the corpus and target variables
X = data.review
# Convert sentiment to string labels BEFORE handling missing values
y = data.sentiment.replace({1:'Negative', 2:'Positive', 3: 'Neutral', 4: 'Positive', 5: 'Positive'})

# Handle missing values in 'review' column
X = X.fillna('')  # Fill missing values with empty string

# Handle missing values in 'sentiment' column before train-test split
data.dropna(subset=['sentiment'], inplace=True)

# Access the corpus and target variables again, after dropping NaNs
X = data.review
# No need to replace sentiment values again, it's already done
y = data.sentiment

# train test splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# We'll use this function to replace numbers from the string
preprocessor = lambda text: re.sub(r'[^a-z ]', '', text.lower())

# construct the pipeline with the procedural steps to
# process the data and cast predictions
pipe = Pipeline([
  ('vec', CountVectorizer(stop_words='english', min_df=10, preprocessor=preprocessor)),  # Lower min_df
  ('tfid', TfidfTransformer()),
  ('lr', SGDClassifier(loss='log'))
])

# fit the model to the data
model = pipe.fit(X_train, y_train)



In [None]:
# predict sentiment on the test data frame
y_test_pred = model.predict(X_test)

# create the classification report
report = classification_report(y_test, y_test_pred)
print(report)


              precision    recall  f1-score   support

         1.0       0.12      0.20      0.15         5
         2.0       0.00      0.00      0.00         2
         3.0       0.17      0.22      0.19         9
         4.0       0.43      0.23      0.30        13
         5.0       0.27      0.27      0.27        11

    accuracy                           0.23        40
   macro avg       0.20      0.19      0.18        40
weighted avg       0.27      0.23      0.23        40



In [None]:
test = {
  'This gadget is awesome':'Positive',
  'This gadget is terrible':'Negative',
  'This gadget':'Neutral'
}

predictions = [[text, expected, model.predict([text])[0]] for text, expected in test.items()]
pd.DataFrame(
  predictions,
  columns=['Test case', 'Expected', 'Prediction']
)

Unnamed: 0,Test case,Expected,Prediction
0,This gadget is awesome,Positive,3.0
1,This gadget is terrible,Negative,3.0
2,This gadget,Neutral,3.0


In [None]:
# output is not favourable cause of small data set , i have taken only 200 line from data