<a href="https://colab.research.google.com/github/chandan110791/NLP/blob/main/POS_Tagging_spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing libraries

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
import spacy 
# Only token text, lemma and POS tag are used below, so the parser and NER
# components are switched off when loading the model.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

In [None]:
from google.colab import drive
# Make Google Drive available under /content/gdrive for the copy in the next cell.
drive.mount("/content/gdrive")



Mounted at /content/gdrive


In [None]:
# Copy the reviews file from the mounted Drive into the working directory.
!cp '/content/gdrive/MyDrive/ML/DataSets/Samsung.txt' Samsung.txt

### Read reviews data

In [None]:
# Read the whole reviews file into a single string. The context manager closes
# the handle even if reading raises, which the manual open()/close() pair did not.
with open("Samsung.txt", "r", encoding="utf-8") as con:
    samsung_reviews = con.read()

In [None]:
# Number of newline-separated entries: one more than the number of "\n" characters.
samsung_reviews.count("\n") + 1

46355

### The dataset is a text file where each review is on a new line

In [None]:
# Peek at the first four reviews.
samsung_reviews.split("\n")[:4]

["I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!",
 'nice phone, nice up grade from my pantach revue. Very clean set up and easy set up. never had an android phone but they are fantastic to say the least. perfect size for surfing and social media. great phone samsung',
 'Very pleased',
 'It works good but it goes slow sometimes but its a very good phone I love it']

### Will our hypothesis hold on real world data? `Product features---POS_NOUN`

In [None]:
# Run the spaCy pipeline on the first review; review1 is the parsed document.
review1 = nlp(samsung_reviews.split("\n")[0])

### Let's run the NLP pipeline on part of one review in our dataset

In [None]:
# Show surface form, lemma and coarse POS tag for the first ten tokens.
for token in review1[:10]:
    print(f"{token.text} --- {token.lemma_} --- {token.pos_}")

I --- I --- PRON
feel --- feel --- VERB
so --- so --- SCONJ
LUCKY --- lucky --- NOUN
to --- to --- PART
have --- have --- AUX
found --- find --- VERB
this --- this --- PRON
used --- use --- VERB
( --- ( --- PUNCT


#### Real-world data is usually messy; observe the words `found` and `used`

In [None]:
# Collect the per-token attributes of the parsed review into parallel lists.
pos = [token.pos_ for token in review1]
lemma = [token.lemma_ for token in review1]
text = [token.text for token in review1]

In [None]:
# One row per token; columns appear in the order text / lemma / pos.
nlp_table = pd.DataFrame(dict(text=text, lemma=lemma, pos=pos))
nlp_table.head()

Unnamed: 0,text,lemma,pos
0,I,I,PRON
1,feel,feel,VERB
2,so,so,SCONJ
3,LUCKY,lucky,NOUN
4,to,to,PART


In [None]:
# Frequency of each noun lemma in this review, most common first.
is_noun = nlp_table['pos'] == 'NOUN'
nlp_table.loc[is_noun, 'lemma'].value_counts()

phone      3
lucky      1
line       1
one        1
year       1
upgrade    1
honesty    1
seller     1
Name: lemma, dtype: int64

#### It seems possible that if we extract all the nouns from the reviews and look at the top 5 most frequent lemmatised noun forms, we will be able to identify `What people are talking about?`

### Let's repeat this experiment on a larger set of reviews

In [None]:
# Lower-cased lemma of every NOUN token in the first 1000 reviews.
nouns = [
    token.lemma_.lower()
    for line in samsung_reviews.split("\n")[:1000]
    for token in nlp(line)
    if token.pos_ == "NOUN"
]

### Let's add some way of keeping track of time

In [None]:
from tqdm import tqdm
nouns = []
first_1000 = samsung_reviews.split("\n")[:1000]
# tqdm wraps the iterable and renders a progress bar with the elapsed time.
for line in tqdm(first_1000):
    nouns.extend(t.lemma_.lower() for t in nlp(line) if t.pos_ == "NOUN")
# Five most frequent noun lemmas.
pd.Series(nouns).value_counts().head(5)

100%|██████████| 1000/1000 [00:11<00:00, 88.14it/s]


phone      1217
battery      92
time         92
screen       86
price        86
dtype: int64

In [None]:
# Total number of newline-separated entries in the dataset.
samsung_reviews.count("\n") + 1

46355

### Did you notice anything? What do you think will be the time taken to process all the reviews?

In [None]:
# Rough full-run estimate in seconds: whole batches of 1000 reviews times seconds per batch.
# NOTE(review): 17 s per batch is presumably from an earlier run; the progress bar
# shown above reports about 11 s for 1000 reviews - confirm.
(46355 // 1000) * 17

782

In [None]:
# Convert the estimate from seconds to whole minutes.
782 // 60

13

## Summary
- POS tag based rule seems to be working well
- We need to figure out a way to reduce the time taken to process reviews

In [None]:
from tqdm import tqdm
nouns = []
# Iterate over every review. The former [0:len(samsung_reviews.split("\n"))] slice
# only split the text a second time and copied the whole list.
for review in tqdm(samsung_reviews.split("\n")):
    doc = nlp(review)
    for tok in doc:
        if tok.pos_=="NOUN":
            nouns.append(tok.lemma_.lower())
# Display the five most frequent noun lemmas (the table shown under this cell).
pd.Series(nouns).value_counts().head(5)

100%|██████████| 46355/46355 [03:25<00:00, 225.24it/s]


phone      43651
battery     4362
product     3912
time        3828
screen      3816
dtype: int64


Summary:

- The most frequently used lemmatised noun forms tell us which product features people are talking about in product reviews.
- To process the review data faster, spaCy lets us run only the pipeline components we need: pass the unneeded ones (here `parser` and `ner`) to the `disable` parameter of `spacy.load()`.



In [None]:
# Wrap the collected noun lemmas in a Series so they can be counted.
noundPD = pd.Series(data=nouns)

In [None]:
# The 100 most frequent noun lemmas, most common first.
noundPD.value_counts().iloc[:100]

phone         43651
battery        4362
product        3912
time           3828
screen         3816
              ...  
number          491
experience      491
pocket          474
company         469
edge            466
Length: 100, dtype: int64

In [None]:
import re

In [None]:
# One word, whitespace, the literal "battery", whitespace, one word.
# Raw string: "\w" and "\s" are invalid escape sequences in a normal string
# literal and trigger a warning on recent Python versions.
pattern = re.compile(r"\w+\sbattery\s\w+")

In [None]:
# Small sample sentence for trying the pattern before running it on the full text.
s1 = "The battery was great"

In [None]:
# All "<word> battery <word>" matches in the sample sentence.
pattern.findall(s1)

['The battery was']

In [None]:
# First match as a plain string.
pattern.findall(s1)[0]

'The battery was'

In [None]:
# Break the first match into its words on single spaces.
sample_match = pattern.findall(s1)[0]
sample_match.split(" ")

['The', 'battery', 'was']

In [None]:
# Every "<word> battery <word>" occurrence across the whole reviews text.
prexis_suffixes = pattern.findall(samsung_reviews)

In [None]:
# Words of the first match, split on single spaces.
first_match = prexis_suffixes[0]
first_match.split(" ")

['that', 'battery', 'life']

In [None]:
# Word in front of the keyword: everything before the first space.
prexis_suffixes[0].split(" ", 1)[0]

'that'

In [None]:
# Word after the keyword: everything after the last space.
prexis_suffixes[0].rsplit(" ", 1)[-1]

'life'

In [None]:
# Lower-cased word before and after "battery" for every regex match.
prefixes = []
suffixes = []
for match in prexis_suffixes:
  # The pattern's \s matches any whitespace (tabs, newlines too), so split on
  # any whitespace; split(" ") would leave e.g. "phone\nbattery" as one piece.
  words = match.split()
  prefixes.append(words[0].lower())
  suffixes.append(words[-1].lower())

In [None]:
# Replace each word list with the index of its five most frequent entries.
prefixes, suffixes = [
    pd.Series(words).value_counts().head(5).index
    for words in (prefixes, suffixes)
]

In [None]:
# Lay out the top prefixes and suffixes on either side of the keyword.
battery_context = {
    'prefixes': prefixes,
    'keyword': ['battery'] * len(prefixes),
    'suffixes': suffixes,
}
pd.DataFrame(battery_context)

Unnamed: 0,prefixes,keyword,suffixes
0,the,battery,life
1,good,battery,is
2,great,battery,and
3,and,battery,lasts
4,long,battery,was


In [None]:
# Common English function words to drop from the context words.
stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
# Re-derive the context words from the raw regex matches. `prefixes`/`suffixes`
# were already cut down to their top-5 indexes by an earlier cell, so filtering
# those left fewer than five entries of unequal length and raised ValueError.
prefixes = [m.split()[0].lower() for m in prexis_suffixes]
suffixes = [m.split()[-1].lower() for m in prexis_suffixes]
prefixes = [p for p in prefixes if p not in stop_words]
suffixes = [s for s in suffixes if s not in stop_words]
prefixes=pd.Series(prefixes).value_counts().head(5).index
suffixes=pd.Series(suffixes).value_counts().head(5).index
# Series columns are aligned on their index, so if one side still has fewer
# entries it is padded with NaN instead of raising; the scalar keyword fills every row.
pd.DataFrame({'prefixes':pd.Series(prefixes),'keyword':'battery','suffixes':pd.Series(suffixes)})

ValueError: ignored

In [None]:
# `pref`/`suff` are not defined in any visible cell; take them from the filtered
# `prefixes`/`suffixes` of the previous cell. NOTE(review): confirm this is the
# intended source. Series columns are aligned on their index, so unequal lengths
# are padded with NaN instead of raising ValueError.
pref = pd.Series(prefixes)
suff = pd.Series(suffixes)
pd.DataFrame({'prefixes':pref,'keyword':'battery','suffixes':suff})

ValueError: ignored

In [None]:
def get_context(reviews,keyword,top_n=5):
  """Return the most common words found directly before and after `keyword`.

  Every "<word> <keyword> <word>" occurrence in `reviews` is collected, the
  surrounding words are lower-cased, entries of the module-level `stop_words`
  list are dropped, and the `top_n` most frequent prefixes and suffixes are
  tabulated side by side.

  Args:
    reviews: Text to search (a single string).
    keyword: Term to look for. It is inserted into the regular expression
      unescaped, so regex metacharacters in it keep their special meaning.
      Matching is case-sensitive.
    top_n: How many of the most frequent prefixes/suffixes to keep.

  Returns:
    A DataFrame with columns 'prefixes', 'keyword' and 'suffixes'. Prefixes and
    suffixes are ranked independently; row i holds the i-th most common of each.
    If one side has fewer entries than the other, it is padded with None rather
    than raising ValueError.
  """
  # Raw f-string: "\w" and "\s" are invalid escapes in a normal string literal.
  pattern = re.compile(rf"\w+\s{keyword}\s\w+")
  excluded = set(stop_words)

  prefixes = []
  suffixes = []
  for match in pattern.findall(reviews):
    # \s matches any whitespace, so split on any whitespace as well;
    # split(" ") would keep e.g. "phone\nbattery" together as one piece.
    words = match.split()
    prefixes.append(words[0].lower())
    suffixes.append(words[-1].lower())

  prefixes = [p for p in prefixes if p not in excluded]
  suffixes = [s for s in suffixes if s not in excluded]
  # dtype="object" keeps an empty result well-defined on every pandas version.
  prefix_return = pd.Series(prefixes, dtype="object").value_counts().head(top_n).index.tolist()
  suffix_return = pd.Series(suffixes, dtype="object").value_counts().head(top_n).index.tolist()

  # Pad the shorter column so the frame can always be built.
  n_rows = max(len(prefix_return), len(suffix_return))
  prefix_return += [None] * (n_rows - len(prefix_return))
  suffix_return += [None] * (n_rows - len(suffix_return))

  return pd.DataFrame({'prefixes':prefix_return,'keyword':[f'{keyword}']*n_rows,'suffixes':suffix_return})

In [None]:
# Words most often seen around "battery" in the reviews.
get_context(reviews=samsung_reviews, keyword="battery")

Unnamed: 0,prefixes,keyword,suffixes
0,good,battery,life
1,great,battery,lasts
2,long,battery,last
3,new,battery,doesn
4,removable,battery,runs


In [None]:
# Words most often seen around "screen" in the reviews.
get_context(reviews=samsung_reviews, keyword="screen")

Unnamed: 0,prefixes,keyword,suffixes
0,touch,screen,protector
1,big,screen,size
2,great,screen,resolution
3,large,screen,protectors
4,home,screen,quality
