In [None]:
import pandas as pd, spacy, nltk, re

In [None]:
# Tokenizer data used by word_tokenize. NLTK 3.8.2+ loads the 'punkt_tab'
# resource instead of the pickled 'punkt' models, so fetch both to keep the
# notebook runnable on old and new NLTK versions.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Sample sentence for the tokenization examples. Note the ';', which the
# tokenizers below handle differently.
doc = (
    'I visited my grandparents last week; '
    'We had a good time together'
)

# Manual Process


In [None]:
# Naive tokenization: lowercase the text, then split on runs of whitespace.
# Punctuation stays attached to its word (e.g. 'week;').
tokens = doc.lower().split(sep=None)
tokens

['i',
 'visited',
 'my',
 'grandparents',
 'last',
 'week;',
 'we',
 'had',
 'a',
 'good',
 'time',
 'together']

In [None]:
# Strip punctuation: delete every character that is neither a word character
# (\w) nor whitespace (\s). The raw string passes the backslashes to the regex
# engine untouched and avoids Python's invalid-escape-sequence warning.
doc_cleaned = re.sub(r'[^\w\s]', '', doc.lower())  # drops the ';' in this sentence
doc_cleaned

'i visited my grandparents last week we had a good time together'

In [None]:
# Split the cleaned doc on any run of whitespace. split(' ') would emit empty
# strings wherever two spaces end up next to each other (for example when a
# punctuation mark surrounded by spaces has been removed).
tokens = doc_cleaned.split()
tokens

['i',
 'visited',
 'my',
 'grandparents',
 'last',
 'week',
 'we',
 'had',
 'a',
 'good',
 'time',
 'together']

# Automated process of tokenization

In [None]:
from nltk.tokenize import word_tokenize

# NLTK's word tokenizer applied to the lowercased sentence, using the
# (default) English models.
tokens = word_tokenize(doc.lower(), language='english')
tokens

['i',
 'visited',
 'my',
 'grandparents',
 'last',
 'week',
 ';',
 'we',
 'had',
 'a',
 'good',
 'time',
 'together']

In the list above, ';' is kept as a separate token, whereas the manual process removed it entirely.

# **RegexpTokenizer** - Use it when tokens should be extracted with a regular expression

In [None]:
from nltk.tokenize import RegexpTokenizer

# Every run of word characters becomes a token; anything else (spaces,
# punctuation) is discarded.
tokenizer = RegexpTokenizer(pattern=r'\w+')
tokens = tokenizer.tokenize(doc.lower())
tokens

['i',
 'visited',
 'my',
 'grandparents',
 'last',
 'week',
 'we',
 'had',
 'a',
 'good',
 'time',
 'together']

In [None]:
# Tweet-style sample containing a handle, repeated punctuation, a run of
# emojis and a hashtag.
doc2 = (
    '@john This product is really cool!!!'
    '😀😃😄😁😆😅'
    ' #awesome'
)

In [None]:
# Run the general-purpose word tokenizer on the tweet, with its default
# settings written out explicitly.
tokens = word_tokenize(doc2, language='english', preserve_line=False)
tokens

['@',
 'john',
 'This',
 'product',
 'is',
 'really',
 'cool',
 '!',
 '!',
 '!',
 '😀😃😄😁😆😅',
 '#',
 'awesome']

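* `@john` and `#awesome` should each have stayed together as a single token, but `word_tokenize` split off the `@` and the `#`.
* Every other token was separated as expected, except the emojis, which were all lumped together into one token.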

In [None]:
from nltk.tokenize import TweetTokenizer

# Tweet-aware tokenizer with its default options spelled out: case is kept,
# repeated characters are not shortened, and @handles are not stripped.
tweet_tokenizer = TweetTokenizer(
    preserve_case=True,
    reduce_len=False,
    strip_handles=False,
)
token1 = tweet_tokenizer.tokenize(doc2)
token1

['@john',
 'This',
 'product',
 'is',
 'really',
 'cool',
 '!',
 '!',
 '!',
 '😀',
 '😃',
 '😄',
 '😁',
 '😆',
 '😅',
 '#awesome']

* Now `@john` and `#awesome` are kept together as single tokens, as they should be, and each emoji is its own separate token.

# Tokenization - CSV file read

The `imdb_sentiment.csv` file used in this section is downloaded from Google Drive in the cells below.

In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate the Colab user, then build a PyDrive client whose credentials
# are the application-default Google credentials.
# NOTE(review): presumably authenticate_user() is what makes those default
# credentials available — confirm; the call order below must be preserved.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)  # Drive client used by the download cell

In [None]:
# Create a PyDrive file handle for the given Drive file id and write its
# contents to the local file 'imdb_sentiment.csv', which is read with pandas
# in the next cell.
downloaded = drive.CreateFile({'id':'12CUjW29tTTxYAcPhxuKb_qSn0UTzc4BR'}) # replace the id with the id of the file you want to access
downloaded.GetContentFile('imdb_sentiment.csv') 

In [None]:
import pandas as pd

# The first CSV column is a saved, unnamed row index; without index_col it is
# loaded as a spurious 'Unnamed: 0' data column. Use it as the DataFrame index
# so only 'review' and 'sentiment' remain as columns.
data = pd.read_csv('imdb_sentiment.csv', index_col=0)
data.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [None]:
# Tokenize the first rows returned by head() of the lowercased reviews and
# print each review next to its tokens. The pattern is a raw string so '\w'
# reaches the regex engine as-is instead of being an invalid string escape.
docs = data['review'].str.lower()
tokenizer = RegexpTokenizer(r'\w+')
for review in docs.head():
  tokens = tokenizer.tokenize(review)
  print(review)
  print(tokens)
  print('-'*50)

a very, very, very slow-moving, aimless movie about a distressed, drifting young man.  
['a', 'very', 'very', 'very', 'slow', 'moving', 'aimless', 'movie', 'about', 'a', 'distressed', 'drifting', 'young', 'man']
--------------------------------------------------
not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  
['not', 'sure', 'who', 'was', 'more', 'lost', 'the', 'flat', 'characters', 'or', 'the', 'audience', 'nearly', 'half', 'of', 'whom', 'walked', 'out']
--------------------------------------------------
attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  
['attempting', 'artiness', 'with', 'black', 'white', 'and', 'clever', 'camera', 'angles', 'the', 'movie', 'disappointed', 'became', 'even', 'more', 'ridiculous', 'as', 'the', 'acting', 'was', 'poor', 'and', 'the', 'plot', 'and', 'lines', 'almost', 

In [None]:
# Build one token list per review for the rows returned by head().
# The pattern is a raw string so '\w' is not treated as an (invalid) string
# escape by Python.
docs = data['review'].str.lower()
tokenizer = RegexpTokenizer(r'\w+')
docs_cleaned = [tokenizer.tokenize(review) for review in docs.head()]
docs_cleaned

[['a',
  'very',
  'very',
  'very',
  'slow',
  'moving',
  'aimless',
  'movie',
  'about',
  'a',
  'distressed',
  'drifting',
  'young',
  'man'],
 ['not',
  'sure',
  'who',
  'was',
  'more',
  'lost',
  'the',
  'flat',
  'characters',
  'or',
  'the',
  'audience',
  'nearly',
  'half',
  'of',
  'whom',
  'walked',
  'out'],
 ['attempting',
  'artiness',
  'with',
  'black',
  'white',
  'and',
  'clever',
  'camera',
  'angles',
  'the',
  'movie',
  'disappointed',
  'became',
  'even',
  'more',
  'ridiculous',
  'as',
  'the',
  'acting',
  'was',
  'poor',
  'and',
  'the',
  'plot',
  'and',
  'lines',
  'almost',
  'non',
  'existent'],
 ['very', 'little', 'music', 'or', 'anything', 'to', 'speak', 'of'],
 ['the',
  'best',
  'scene',
  'in',
  'the',
  'movie',
  'was',
  'when',
  'gerardo',
  'is',
  'trying',
  'to',
  'find',
  'a',
  'song',
  'that',
  'keeps',
  'running',
  'through',
  'his',
  'head']]

* The result is a list of lists: each row of `docs` becomes one inner list of tokens, e.g. the 1st row becomes the 1st list and the 2nd row becomes the 2nd list.

# Use spacy to get individual tokens

In [None]:
import spacy

# The sample sentence, this time tokenized with spaCy.
nw_doc = 'I visited my grandparents last week; We had a good time together'

# Load the small English pipeline and run it on the lowercased text; iterating
# over the resulting document yields its tokens one by one.
nlp = spacy.load('en_core_web_sm')
spacy_doc = nlp(nw_doc.lower())

for token in spacy_doc:
  print(token)

i
visited
my
grandparents
last
week
;
we
had
a
good
time
together


* spaCy tokenized the whole document automatically; note that `;` is kept as its own token.