# Download the package for nlp

In [1]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloadin

True

# Try NLP Libraries

### **word tokenizers**

In [7]:
from nltk.tokenize import word_tokenize
txt = 'iam dana a fullstack developer and ai engineer'
nlp = word_tokenize(txt)
print(nlp)

['iam', 'dana', 'a', 'fullstack', 'developer', 'and', 'ai', 'engineer']


### **Stop words**

In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
txt = 'iam dana a fullstack developer and ai engineer.'
sw = set(stopwords.words('english'))
# print(sw)
words = word_tokenize(txt)
new = []
for w in words:
  if w not in sw:
    new.append(w)

print(new)

['iam', 'dana', 'fullstack', 'developer', 'ai', 'engineer', '.']


### **Stemming**

In [9]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
txt = ['walk', 'walking', 'walked']

for w in txt:
  print(ps.stem(w))

walk
walk
walk


# working on project

In [13]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pandas as pd

In [15]:
data = pd.read_csv('/content/Restaurant_Reviews.tsv', delimiter='\t', quoting=1)
# print(data)
data.head()


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [18]:
filters = set(stopwords.words('english'))
ps = PorterStemmer()


In [19]:
print(len(data))

1000


### **NLP**

In [29]:
reviews = []
for i in range (len(data)):
  cleaned = re.sub('^[a-zA-Z]','',data['Review'][i])
  cleaned = cleaned.lower()
  cleaned = word_tokenize(cleaned)
  cleaned = [ps.stem(word) for word in cleaned if word not in filters]
  cleaned = " ".join(cleaned)
  reviews.append(cleaned)
print(reviews)

['ow ... love place .', 'rust good .', 'ot tasti textur nasti .', 'top late may bank holiday rick steve recommend love .', 'select menu great price .', 'ow get angri want damn pho .', "oneslti n't tast fresh . )", 'potato like rubber could tell made ahead time kept warmer .', 'fri great .', 'great touch .', 'ervic prompt .', 'ould go back .', 'cashier care ever say still end wayyy overpr .', 'tri cape cod ravoli , chicken , cranberri ... mmmm !', 'disgust pretti sure human hair .', 'shock sign indic cash .', 'ighli recommend .', 'aitress littl slow servic .', 'place worth time , let alon vega .', 'id like .', 'burritto blah !', 'food , amaz .', 'ervic also cute .', 'could care less ... interior beauti .', 'perform .', "hat 's right ... .the red velvet cake ... ..ohhh stuff good .", '- never brought salad ask .', 'hole wall great mexican street taco , friendli staff .', 'ook hour get food 4 tabl restaur food luke warm , sever run around like total overwhelm .', 'worst salmon sashimi .',

### **Create The Victor Representation**

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
x = cv.fit_transform(reviews)

In [31]:
y = data.iloc[:,1:2]
y

Unnamed: 0,Liked
0,1
1,0
2,0
3,1
4,1
...,...
995,0
996,0
997,0
998,0


### **Splitting data**

In [33]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

### **Choosing a Classifier**

In [34]:
from xgboost import XGBClassifier
model = XGBClassifier()

In [35]:
model.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier()

### **accuracy**

In [37]:
score = model.score(x_test,y_test)
print(score)

0.725


### **Deployment**

In [46]:
your_review = input('enter your review: ')
l1 = []

enter your review: that's very good


In [47]:
cleaned2 = re.sub('^[a-zA-Z]','',your_review)
cleaned2 = cleaned2.lower()
cleaned2 = word_tokenize(cleaned2)
cleaned2 = [ps.stem(word) for word in cleaned2 if word not in filters]
cleaned2 = " ".join(cleaned2)
l1.append(cleaned2)
print(l1)

["hat 's good"]


In [48]:
x_new = cv.transform(l1)

In [49]:
sent = model.predict(x_new)

In [50]:
print(sent)

[1]


In [51]:
if 0 in sent:
  print('bad review')
else:
  print('nice review')

nice review
