In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate the current Colab user, then give PyDrive the application-default
# credentials so that `drive` can be used further down to fetch files by id.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Stopwords-Removal
* Get list of commonly used stop words from NLTK library
* Remove commonly used stop words from text documents
* Remove custom stop words from text documents

In [None]:
import re, nltk, pandas as pd, numpy as np
from nltk.tokenize import RegexpTokenizer

In [None]:
# Fetch the NLTK resources into the local nltk_data directory.
# 'stopwords' backs nltk.corpus.stopwords.words(...) used below.
nltk.download('stopwords')
# NOTE(review): the visible cells tokenize with RegexpTokenizer only, so 'punkt'
# may not be needed here -- confirm before removing.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

***Common stopwords from nltk***


In [None]:
# English stopword list shipped with NLTK (needs the 'stopwords' corpus downloaded).
# The entries are lower-case, so text must be lower-cased before comparing against it.
stopwords = nltk.corpus.stopwords.words('english')
stopwords[:50]  # preview the first 50 entries

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be']

In [None]:
# Number of entries in the NLTK English stopword list.
len(stopwords)

179

In [None]:
# Tokenizer that emits every maximal run of word characters (letters, digits, underscore),
# which drops punctuation and whitespace.
# The pattern is a raw string: in a normal string literal `\w` is an invalid escape
# sequence, which raises a DeprecationWarning (SyntaxWarning from Python 3.12).
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
# Sample sentence used to demonstrate tokenization.
doc = 'I visited my grandparents last week; We had a good time together'

# Lower-case first so the tokens can be compared with the lower-case stopword entries.
doc_lower = doc.lower()
tokens = tokenizer.tokenize(doc_lower)
tokens

['i',
 'visited',
 'my',
 'grandparents',
 'last',
 'week',
 'we',
 'had',
 'a',
 'good',
 'time',
 'together']

In [None]:
# Despite its name, `stop_list` holds the tokens that are KEPT,
# i.e. those that do not appear in the NLTK stopword list.
stop_list = [
  token
  for token in tokens
  if token not in stopwords
]
stop_list

['visited', 'grandparents', 'last', 'week', 'good', 'time', 'together']

All the common stopwords have been removed; only the more informative words remain.

In [None]:
# Two sample documents (a list of strings).
docs = [
  'I visited my grandparents last week; We had a good time together',
  'nlp engineers spend most of their time on text cleaning',
]

# For each document: lower-case, tokenize, and keep the non-stopword tokens.
# The result is a list of token lists, one per document.
doc_list = [
  [word for word in tokenizer.tokenize(text.lower()) if word not in stopwords]
  for text in docs
]
doc_list

[['visited', 'grandparents', 'last', 'week', 'good', 'time', 'together'],
 ['nlp', 'engineers', 'spend', 'time', 'text', 'cleaning']]

The result is a list of lists: one list per document, containing only its important (non-stopword) tokens.

### Combine elements in a single string

In [None]:
# Two sample documents (a list of strings).
docs = [
  'I visited my grandparents last week; We had a good time together',
  'nlp engineers spend most of their time on text cleaning',
]

# Same cleaning as before, but the surviving tokens of each document are
# re-joined with single spaces, giving one cleaned string per document.
docs_list = [
  ' '.join(word for word in tokenizer.tokenize(text.lower()) if word not in stopwords)
  for text in docs
]
docs_list

['visited grandparents last week good time together',
 'nlp engineers spend time text cleaning']

* The remaining tokens of each document are joined back into a single string
* Each string can be seen as the cleaned version of the original document

# Removing stopwords from text column

In [None]:
# Reference the Drive file by its id through the authenticated `drive` client,
# then save its content locally under the name read by pd.read_csv below.
downloaded = drive.CreateFile({'id':'12CUjW29tTTxYAcPhxuKb_qSn0UTzc4BR'}) # replace the id with id of file you want to access
downloaded.GetContentFile('imdb_sentiment.csv') # writes the content to this local path

In [None]:
# Load the downloaded reviews; the preview shows the columns `review` (text) and `sentiment`.
# NOTE(review): the extra 'Unnamed: 0' column looks like a saved row index --
# passing index_col=0 would likely absorb it; confirm nothing relies on that column.
data = pd.read_csv('imdb_sentiment.csv')
data.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [None]:
# Build the lookup set once. Testing membership against the `stopwords` list scans
# it for every token of every review; a set gives the same answer in constant time.
stopword_set = set(stopwords)

# One cleaned string per review: lower-case, tokenize, drop stopwords, re-join with spaces.
doc_list2 = [
  ' '.join(tok for tok in tokenizer.tokenize(review.lower()) if tok not in stopword_set)
  for review in data['review']
]
doc_list2[:10]

['slow moving aimless movie distressed drifting young man',
 'sure lost flat characters audience nearly half walked',
 'attempting artiness black white clever camera angles movie disappointed became even ridiculous acting poor plot lines almost non existent',
 'little music anything speak',
 'best scene movie gerardo trying find song keeps running head',
 'rest movie lacks art charm meaning emptiness works guess empty',
 'wasted two hours',
 'saw movie today thought good effort good messages kids',
 'bit predictable',
 'loved casting jimmy buffet science teacher']

# Removing custom stopwords
* In addition to the 179 standard NLTK stopwords, we can add our own custom stopwords to the list (e.g. `movie` and `little`).


In [None]:
# Extra words to drop in addition to the NLTK list.
custom_stopwords = ['movie', 'little']

# np.hstack concatenates the two lists into a single 1-D NumPy array (not a Python list):
# the NLTK stopwords followed by the custom ones.
all_stopwords = np.hstack([stopwords, custom_stopwords])
print(len(all_stopwords))

# `tok in ndarray` does an elementwise comparison over the whole array for every token.
# Convert once to a set of plain Python strings for constant-time lookups with the same result.
all_stopwords_set = set(all_stopwords.tolist())

# One cleaned string per review: lower-case, tokenize, drop standard + custom stopwords, re-join.
doc_list3 = [
  ' '.join(tok for tok in tokenizer.tokenize(review.lower()) if tok not in all_stopwords_set)
  for review in data['review']
]

doc_list3[:10]

181


['slow moving aimless distressed drifting young man',
 'sure lost flat characters audience nearly half walked',
 'attempting artiness black white clever camera angles disappointed became even ridiculous acting poor plot lines almost non existent',
 'music anything speak',
 'best scene gerardo trying find song keeps running head',
 'rest lacks art charm meaning emptiness works guess empty',
 'wasted two hours',
 'saw today thought good effort good messages kids',
 'bit predictable',
 'loved casting jimmy buffet science teacher']