In [46]:
import pandas as pd
from textblob import TextBlob
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('freviews.csv')

In [3]:
df.shape

(55, 3)

In [4]:
df.head()

Unnamed: 0,title,rating,review
0,M B motors ballymena,5,I recently bought a car that was no se to me a...
1,This amazing,5,This amazing
2,They made everything so easy and…,5,They made everything so easy and answered all ...
3,Easy to buy and wonderful experience,5,Easy to buy and wonderful experience
4,Very nice people and very fast,5,Very nice people and very fast


In [5]:
df['rating'].value_counts()

rating
5    49
4     3
1     2
2     1
Name: count, dtype: int64

In [6]:
df['rating_sentiment'] = df['rating'].map({1: 'negative', 2: 'negative', 3: 'neutral', 4: 'positive', 5: 'positive'})

In [7]:
df.head()

Unnamed: 0,title,rating,review,rating_sentiment
0,M B motors ballymena,5,I recently bought a car that was no se to me a...,positive
1,This amazing,5,This amazing,positive
2,They made everything so easy and…,5,They made everything so easy and answered all ...,positive
3,Easy to buy and wonderful experience,5,Easy to buy and wonderful experience,positive
4,Very nice people and very fast,5,Very nice people and very fast,positive


In [9]:
df['blob'] = df['review'].apply(lambda review: TextBlob(review).sentiment.polarity)

In [10]:
df.head()

Unnamed: 0,title,rating,review,rating_sentiment,blob
0,M B motors ballymena,5,I recently bought a car that was no se to me a...,positive,0.1
1,This amazing,5,This amazing,positive,0.6
2,They made everything so easy and…,5,They made everything so easy and answered all ...,positive,0.716667
3,Easy to buy and wonderful experience,5,Easy to buy and wonderful experience,positive,0.716667
4,Very nice people and very fast,5,Very nice people and very fast,positive,0.52


In [12]:
sia = SentimentIntensityAnalyzer()

In [13]:
df['nltk'] = df['review'].apply(lambda review: sia.polarity_scores(review)['compound'])

In [14]:
df.head()

Unnamed: 0,title,rating,review,rating_sentiment,blob,nltk
0,M B motors ballymena,5,I recently bought a car that was no se to me a...,positive,0.1,0.8689
1,This amazing,5,This amazing,positive,0.6,0.5859
2,They made everything so easy and…,5,They made everything so easy and answered all ...,positive,0.716667,0.8287
3,Easy to buy and wonderful experience,5,Easy to buy and wonderful experience,positive,0.716667,0.765
4,Very nice people and very fast,5,Very nice people and very fast,positive,0.52,0.4754


In [15]:
def rate_to_sentiment(value):
    if value >= 0.05:
        return "positive"
    elif value <= -0.05:
        return "negative"
    else:
        return "neutral"

In [16]:
df['blob_sentiment'] = df['blob'].apply(rate_to_sentiment)

In [17]:
df.head()

Unnamed: 0,title,rating,review,rating_sentiment,blob,nltk,blob_sentiment
0,M B motors ballymena,5,I recently bought a car that was no se to me a...,positive,0.1,0.8689,positive
1,This amazing,5,This amazing,positive,0.6,0.5859,positive
2,They made everything so easy and…,5,They made everything so easy and answered all ...,positive,0.716667,0.8287,positive
3,Easy to buy and wonderful experience,5,Easy to buy and wonderful experience,positive,0.716667,0.765,positive
4,Very nice people and very fast,5,Very nice people and very fast,positive,0.52,0.4754,positive


In [18]:
df['nltk_sentiment'] = df['nltk'].apply(rate_to_sentiment)

In [19]:
df.head()

Unnamed: 0,title,rating,review,rating_sentiment,blob,nltk,blob_sentiment,nltk_sentiment
0,M B motors ballymena,5,I recently bought a car that was no se to me a...,positive,0.1,0.8689,positive,positive
1,This amazing,5,This amazing,positive,0.6,0.5859,positive,positive
2,They made everything so easy and…,5,They made everything so easy and answered all ...,positive,0.716667,0.8287,positive,positive
3,Easy to buy and wonderful experience,5,Easy to buy and wonderful experience,positive,0.716667,0.765,positive,positive
4,Very nice people and very fast,5,Very nice people and very fast,positive,0.52,0.4754,positive,positive


In [20]:
blob_accuracy = (df['rating_sentiment'] == df['blob_sentiment']).mean()
blob_accuracy

0.7818181818181819

In [21]:
nltk_accuracy = (df['rating_sentiment'] == df['nltk_sentiment']).mean()
nltk_accuracy

0.8909090909090909

In [23]:
df['rating_sentiment'].value_counts()

rating_sentiment
positive    52
negative     3
Name: count, dtype: int64

In [24]:
df['blob_sentiment'].value_counts()

blob_sentiment
positive    44
neutral      9
negative     2
Name: count, dtype: int64

In [25]:
df['nltk_sentiment'].value_counts()

nltk_sentiment
positive    49
neutral      4
negative     2
Name: count, dtype: int64

In [26]:
df.shape

(55, 8)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             55 non-null     object 
 1   rating            55 non-null     int64  
 2   review            55 non-null     object 
 3   rating_sentiment  55 non-null     object 
 4   blob              55 non-null     float64
 5   nltk              55 non-null     float64
 6   blob_sentiment    55 non-null     object 
 7   nltk_sentiment    55 non-null     object 
dtypes: float64(2), int64(1), object(5)
memory usage: 3.6+ KB


In [28]:
# MACHINE LEARNING

In [57]:
vectorizer = TfidfVectorizer(max_features=200, ngram_range=(1, 2))

In [58]:
X = vectorizer.fit_transform(df['review'])

In [59]:
y = df['rating_sentiment']

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [61]:
model = RandomForestClassifier()

In [62]:
model.fit(X_train, y_train)

In [63]:
pred = model.predict(X_test)

In [64]:
accuracy = accuracy_score(y_test, pred)

In [65]:
accuracy

0.8181818181818182

In [66]:
model = LogisticRegression(max_iter=1000, random_state=42)

In [67]:
model.fit(X_train, y_train)

In [68]:
pred = model.predict(X_test)

In [69]:
accuracy = accuracy_score(y_test, pred)
accuracy

0.8181818181818182

In [70]:
y_test

31    positive
5     positive
32    negative
13    positive
19    positive
49    negative
41    positive
26    positive
43    positive
12    positive
52    positive
Name: rating_sentiment, dtype: object

In [71]:
pred

array(['positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive'], dtype=object)