In [None]:
import nltk
# Download the NLTK 'stopwords' corpus; the cleaning step later reads it through
# nltk.corpus.stopwords.words("english"). The call's return value is shown as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.corpus import stopwords
import re

# Load dataset
try:
    data = pd.read_csv('/content/Amazon_Unlocked_Mobile.csv', low_memory=False)
except Exception as e:
    # Report the problem, then re-raise: continuing without `data` would only
    # fail on the next line with a NameError that hides the real cause.
    print(f"Error loading file: {e}")
    raise

# Drop rows that lack the two columns this notebook actually uses (the review
# text and its star rating). Rows that are only missing unrelated columns are
# kept so usable reviews are not thrown away.
data = data.dropna(subset=['Reviews', 'Rating'])

## Data cleaning

In [None]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Lazily-built resources shared by every cleanText call. Reading the stopword
# list and constructing the stemmer do not depend on the review being cleaned,
# so they are created once instead of once per review.
_CLEAN_TEXT_CACHE = {}


def _english_stopwords():
    '''Return the English stopword set, reading it from NLTK only on first use.'''
    if 'stops' not in _CLEAN_TEXT_CACHE:
        _CLEAN_TEXT_CACHE['stops'] = frozenset(stopwords.words("english"))
    return _CLEAN_TEXT_CACHE['stops']


def _english_stemmer():
    '''Return a shared English Snowball stemmer, constructing it on first use.'''
    if 'stemmer' not in _CLEAN_TEXT_CACHE:
        _CLEAN_TEXT_CACHE['stemmer'] = SnowballStemmer('english')
    return _CLEAN_TEXT_CACHE['stemmer']


# Define your cleanText function
def cleanText(raw_text, remove_stopwords=False, stemming=False, split_text=False):
    '''
    Convert a raw review to a cleaned review.

    Parameters
    ----------
    raw_text : str
        The review text, possibly containing HTML markup. ``None`` and NaN
        (pandas' missing-value marker) are treated as an empty review.
    remove_stopwords : bool, default False
        Drop English stopwords (NLTK list) from the token list.
    stemming : bool, default False
        Reduce each remaining token to its Snowball (English) stem.
    split_text : bool, default False
        Return the list of tokens instead of a single space-joined string.

    Returns
    -------
    str or list of str
        Lower-cased alphabetic tokens, joined by single spaces unless
        ``split_text`` is true.
    '''
    # Missing values would otherwise raise inside BeautifulSoup; NaN is the
    # only float that is not equal to itself.
    if raw_text is None or (isinstance(raw_text, float) and raw_text != raw_text):
        raw_text = ""

    # Remove HTML
    text = BeautifulSoup(raw_text, 'lxml').get_text()

    # Remove non-alphabet characters (digits and punctuation become separators)
    letters_only = re.sub("[^a-zA-Z]", " ", text)

    # Convert to lower case and split into words
    words = letters_only.lower().split()

    # Optionally remove stopwords
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    # Optionally perform stemming (after stopword removal, as before)
    if stemming:
        stemmer = _english_stemmer()
        words = [stemmer.stem(w) for w in words]

    # Return list of words or joined text
    if split_text:
        return words

    return " ".join(words)

In [None]:
# Clean every review: strip markup, remove stopwords, and stem the tokens
data['clean_reviews'] = data['Reviews'].apply(
    lambda review: cleanText(review, remove_stopwords=True, stemming=True)
)

# Sentiment Labeling: Rating > 3 is positive (1); a rating of 3 or lower,
# including the neutral 3-star reviews, is labelled negative (0)
data['sentiment'] = (data['Rating'] > 3).astype(int)
y = data['sentiment']

# Train-Test Split on the cleaned text, BEFORE any feature fitting, so that
# nothing learned from the held-out reviews can influence training
reviews_train, reviews_test, y_train, y_test = train_test_split(
    data['clean_reviews'], y, test_size=0.2, random_state=42
)

# Feature Extraction using TF-IDF: vocabulary and IDF weights are fitted on
# the training reviews only; the test reviews are merely transformed
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(reviews_train)
X_test = tfidf.transform(reviews_test)

# Model Training (Logistic Regression)
model = LogisticRegression()
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:", confusion_matrix(y_test, y_pred))

  text = BeautifulSoup(raw_text, 'lxml').get_text()
  text = BeautifulSoup(raw_text, 'lxml').get_text()


Accuracy: 0.8978404570334699
Confusion Matrix: [[16706  4129]
 [ 2702 43329]]


In [None]:
# Predict Function for User Input
def predict_review(text):
    '''
    Classify a single free-text review as 'Positive' or 'Negative'.

    The text is cleaned with cleanText (stopword removal and stemming enabled),
    vectorized with the already-fitted `tfidf` vectorizer, and scored by the
    trained `model`.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    str
        'Positive' when the model predicts label 1, otherwise 'Negative'.
    '''
    # Clean the input text using the custom cleanText function
    clean_input = cleanText(text, remove_stopwords=True, stemming=True)

    # Vectorize the cleaned input text using the pre-fitted TF-IDF vectorizer
    vectorized_input = tfidf.transform([clean_input])

    # predict() returns one label per input row; exactly one row was passed in,
    # so take that element explicitly instead of comparing the whole array
    label = model.predict(vectorized_input)[0]

    # Return the result as 'Positive' or 'Negative'
    return 'Positive' if label == 1 else 'Negative'

# Example: classify one clearly negative review
sample_review = "Worst Phone ever. Not worth it"
print(predict_review(sample_review))

Negative


## Manual Testing

### Positive Reviews

In [None]:
# Hand-written reviews that should all be classified as positive
positive_reviews = [
    "I love this phone! The battery life is amazing, and the screen is super clear.",
    "Great camera quality and fast performance. Highly recommended!",
    "Best phone I've ever owned. Everything works perfectly and it feels premium.",
    "The display is vibrant, and the speakers sound great. Definitely worth the price.",
    "Excellent phone for the price, no complaints about its performance.",
]
for review in positive_reviews:
    print(predict_review(review))

Positive
Positive
Positive
Positive
Positive


### Negative Reviews

In [None]:
# Hand-written reviews that should all be classified as negative
negative_reviews = [
    "Terrible battery life, and it keeps freezing. Not worth the money.",
    "The phone overheats quickly and the camera quality is very poor.",
    "I'm really disappointed. The phone lags constantly, and the display is not great.",
    "I had high expectations, but it crashes frequently and the battery drains fast.",
    "Not happy with this purchase. The phone feels cheap and slow.",
]
for review in negative_reviews:
    print(predict_review(review))

Negative
Negative
Negative
Negative
Negative


### Mixed Reviews

In [None]:
# Reviews with both praise and criticism; the binary model has to pick one side
mixed_reviews = [
    "The phone is okay, but the battery life could be better.",
    "It's a decent phone for the price, but the camera isn't the best.",
    "The screen is nice, but the phone gets hot sometimes.",
    "The phone is fast, but the design is a little boring.",
    "I like the phone overall, but the software is a bit buggy.",
]
for review in mixed_reviews:
    print(predict_review(review))


Negative
Positive
Negative
Positive
Negative


## Save the model

In [None]:
import joblib

# Destination files for the two artifacts needed at inference time
model_path = '/content/drive/MyDrive/Mini Project Model Sem 7/sentiment_model_400k.pkl'
vectorizer_path = '/content/drive/MyDrive/Mini Project Model Sem 7/tfidf_vectorizer_400k.pkl'

# Persist the trained classifier
joblib.dump(model, model_path)

# Persist the fitted TF-IDF vectorizer (kept as the cell's last expression so
# the written path is still displayed as the cell output)
joblib.dump(tfidf, vectorizer_path)


['/content/drive/MyDrive/Mini Project Model Sem 7/tfidf_vectorizer_400k.pkl']