<a href="https://colab.research.google.com/github/devanshmesson/Data-Science/blob/master/Task5_AlphaAI_Devansh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Inputting the dataset

In [3]:
import pandas as pd
# Read the Yelp review CSV into a DataFrame.
data = pd.read_csv("Yelpdata.csv")
# Turn off pandas' chained-assignment warning (the option defaults to 'warn').
pd.set_option("mode.chained_assignment", None)

##Printing the dataset

In [4]:
# Show the loaded DataFrame as the cell output.
data

Unnamed: 0,Rating,Review
0,1,I got 'new' tires from them and within two wee...
1,1,Don't waste your time. We had two different p...
2,1,All I can say is the worst! We were the only 2...
3,1,I have been to this restaurant twice and was d...
4,1,Food was NOT GOOD at all! My husband & I ate h...
...,...,...
194,5,Had the best burger of my life at Tessaro's ov...
195,5,Friendly Service. Fresh Veggies! At only 1 dol...
196,5,"I only come to Pittsburgh twice a year, but I ..."
197,5,Yes the pizza is the best in the burgh. But d...


##Simplifying the dataset
####Considering ratings of 4 or higher as Positive (1)
####Considering ratings below 4 as Negative (0)


In [5]:
# Collapse the star rating into a binary sentiment label:
#   rating >= 4 -> 1 (positive), anything lower -> 0 (negative).
# A single vectorized assignment replaces the per-row chained indexing
# (data['Rating'][i] = ...), which pandas does not guarantee to write back to
# the DataFrame and which silently does nothing under copy-on-write.
# NOTE: this overwrites 'Rating' in place, so reload the CSV before re-running
# this cell; otherwise the already-binarized labels would all become 0.
data['Rating'] = (data['Rating'] >= 4).astype('int64')

##Number of Positive and Negative reviews in the dataset

In [6]:
# Count how many reviews carry each distinct value in the Rating column.
data['Rating'].value_counts()

1    100
0     99
Name: Rating, dtype: int64

##Installing transformers library for loading DistilBert Model

##Loading the pre-trained DistilBERT model and tokenizer

In [7]:
!pip install transformers



##Loading pre-trained DistilBert Model and Tokenizer

In [8]:
import transformers
ModelClass, TokenizerClass = transformers.DistilBertModel, transformers.DistilBertTokenizer

# Model and tokenizer are both restored from the same pre-trained checkpoint.
checkpoint = 'distilbert-base-uncased'
model = ModelClass.from_pretrained(checkpoint)
tokenizer = TokenizerClass.from_pretrained(checkpoint)

# model     -> the pre-trained DistilBert model
# tokenizer -> the pre-trained DistilBert tokenizer, used for tokenizing the sentences

##Data Preprocessing

Tokenization  
Each review must be tokenized and converted to token IDs so that it can be fed to the DistilBERT model.

In [9]:
def encode_review(review):
    """Encode one review with the DistilBert tokenizer.

    Special tokens are added and truncation is enabled, exactly as requested
    through the keyword arguments below.
    """
    return tokenizer.encode(review, add_special_tokens=True, truncation=True)


# One encoded list per row of the 'Review' column.
tokenized_reviews = data['Review'].apply(encode_review)
# Second name bound to the same Series object (an alias, not a copy).
temp = tokenized_reviews

In [10]:
# Each review is now represented as a list of integer token IDs.
tokenized_reviews

0      [101, 1045, 2288, 1005, 2047, 1005, 13310, 201...
1      [101, 2123, 1005, 1056, 5949, 2115, 2051, 1012...
2      [101, 2035, 1045, 2064, 2360, 2003, 1996, 5409...
3      [101, 1045, 2031, 2042, 2000, 2023, 4825, 3807...
4      [101, 2833, 2001, 2025, 2204, 2012, 2035, 999,...
                             ...                        
194    [101, 2018, 1996, 2190, 15890, 1997, 2026, 216...
195    [101, 5379, 2326, 1012, 4840, 2310, 13871, 311...
196    [101, 1045, 2069, 2272, 2000, 6278, 3807, 1037...
197    [101, 2748, 1996, 10733, 2003, 1996, 2190, 199...
198    [101, 2026, 6513, 2003, 2172, 10947, 2000, 199...
Name: Review, Length: 199, dtype: object

In [11]:
# Length of the longest tokenized review (0 when there are no reviews).
max_num = max((len(token_ids) for token_ids in tokenized_reviews), default=0)

##Padding all the lists in "tokenized_reviews" to the same size
Advantage: DistilBERT can then process all the tokenized reviews as one batch, which is faster. A batch must be a rectangular 2D array, so every token list is padded with zeros up to the length of the longest one.

In [12]:
# Right-pad every token list with zeros until it is max_num entries long.
# The Series is updated in place, one row at a time.
for row in range(len(tokenized_reviews)):
  token_ids = tokenized_reviews[row]
  zero_fill = [0] * (max_num - len(token_ids))
  tokenized_reviews[row] = token_ids + zero_fill

##Storing padded tokenized_reviews in a 2D array

In [13]:
import numpy as np
# Stack the equal-length token lists into one 2D array (one row per review).
padded_reviews = np.array(tokenized_reviews.tolist())
padded_reviews

array([[ 101, 1045, 2288, ...,    0,    0,    0],
       [ 101, 2123, 1005, ...,    0,    0,    0],
       [ 101, 2035, 1045, ...,    0,    0,    0],
       ...,
       [ 101, 1045, 2069, ...,    0,    0,    0],
       [ 101, 2748, 1996, ...,    0,    0,    0],
       [ 101, 2026, 6513, ...,    0,    0,    0]])

##Creating an attention mask to avoid confusion due to padding
DistilBERT must ignore the padding in padded_reviews, because the padding tokens carry no information about the review.

In [14]:
# Mask has the same shape as padded_reviews: 1 where the entry is non-zero,
# 0 where the entry is 0 (the value used for padding).
attentionmask = (padded_reviews != 0).astype(int)

##Running the DistilBert Model

In [15]:
import torch 
# torch.as_tensor accepts either the NumPy array or an already-converted tensor,
# so this cell can be re-run without the "copy construct from a tensor" warning
# that torch.tensor() emits once `attentionmask` has been rebound to a tensor.
# The token IDs are pinned to int64 so the dtype does not depend on the
# platform's default NumPy integer width.
inputIDs = torch.as_tensor(padded_reviews, dtype=torch.long)  # input to model
attentionmask = torch.as_tensor(attentionmask)  # lets the model ignore the padding

# Forward pass only: gradient tracking is disabled because nothing is trained here.
with torch.no_grad():
    Output_3D = model(inputIDs, attention_mask=attentionmask)

##Filtering the Output_3D for classification
Keep only the output for the first token of each sentence. This first token is the classification token [CLS], which is added at the very beginning of every sentence.

In [16]:
# First element of the model output, indexed below as a 3-D tensor.
hidden_states = Output_3D[0]
# Keep position 0 of every sequence (the leading [CLS] token) and move to NumPy.
cls_vectors = hidden_states[:, 0, :]
feature_matrix = cls_vectors.numpy()

# Class labels: the Rating column of the dataset.
Sentiment = data['Rating']

##Train/Test split of feature matrix for classification

In [19]:
from sklearn.model_selection import train_test_split
# Fix the shuffle seed so the split, and therefore the reported accuracy,
# is the same on every run instead of changing each time the cell executes.
x_train, x_test, y_train, y_test = train_test_split(
    feature_matrix, Sentiment, random_state=42
)

##Using Naive Bayes for classification

In [56]:
from sklearn import naive_bayes
# Bernoulli Naive Bayes with default settings, trained on the training split.
# NOTE(review): the estimator repr shows binarize=0.0, i.e. each feature is
# thresholded at zero before fitting; worth confirming that this is intended
# for real-valued embedding features.
nb = naive_bayes.BernoulliNB()
nb.fit(X=x_train, y=y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

##Calculate accuracy

In [57]:
from sklearn.metrics import accuracy_score
# Predict labels for the held-out split and report the fraction that match.
prediction = nb.predict(x_test)
accuracy_score(y_true=y_test, y_pred=prediction)

0.8