# Tokenization

In [1]:
import re
import nltk
import pandas as pd

In [2]:
# Download the 'punkt' tokenizer package into the local nltk_data directory (network access required).
# NOTE(review): newer NLTK releases appear to load 'punkt_tab' for word_tokenize — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Sample sentence for the tokenizer comparisons; the ';' shows how each approach treats punctuation.
doc = 'I visited my grandparents last week; We had a good time together'

In [5]:
# Naive baseline: lower-case and split on single spaces; punctuation stays glued to words ('week;').
tokens = doc.lower().split(' ')
tokens

['i',
 'visited',
 'my',
 'grandparents',
 'last',
 'week;',
 'we',
 'had',
 'a',
 'good',
 'time',
 'together']

In [6]:
# Strip punctuation: delete every character that is neither a word character
# (\w: letters, digits, underscore) nor whitespace (\s).
# The pattern is a raw string so the backslashes reach the regex engine intact
# (a plain '\w' is an invalid string escape and triggers a warning on recent
# Pythons). There is deliberately no '+' inside the brackets: within a character
# class '+' is a literal plus sign, not a quantifier, so it would exempt '+'
# characters from removal.
doc_cleaned = re.sub(r'[^\w\s]', '', doc.lower())
doc_cleaned

'i visited my grandparents last week we had a good time together'

In [7]:
# Punctuation-free tokens: remove everything that is not a word character or
# whitespace, then split the cleaned text into words.
# Raw-string pattern avoids invalid '\w' / '\s' string escapes; no '+' inside the
# character class, where it would be a literal plus sign rather than a quantifier.
doc_cleaned = re.sub(r'[^\w\s]', '', doc.lower())
# split() with no argument splits on any run of whitespace, so punctuation that
# was removed from between two spaces (e.g. "lost - the") cannot leave '' tokens.
tokens = doc_cleaned.split()
tokens

['i',
 'visited',
 'my',
 'grandparents',
 'last',
 'week',
 'we',
 'had',
 'a',
 'good',
 'time',
 'together']

In [8]:
from nltk.tokenize import word_tokenize
# NLTK's word tokenizer keeps punctuation but splits it off: ';' becomes its own token.
tokens = word_tokenize(doc.lower())
tokens

['i',
 'visited',
 'my',
 'grandparents',
 'last',
 'week',
 ';',
 'we',
 'had',
 'a',
 'good',
 'time',
 'together']

In [9]:
from nltk.tokenize import RegexpTokenizer
# Tokens are the runs of word characters matched by \w+, so punctuation never becomes a token.
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(doc.lower())
tokens

['i',
 'visited',
 'my',
 'grandparents',
 'last',
 'week',
 'we',
 'had',
 'a',
 'good',
 'time',
 'together']

In [10]:
# Tweet-style sample: an @mention, repeated '!', a run of emoji and a #hashtag.
doc = '@john This product is really cool!!!😀😃😄😁😆😅 #awesome'

In [11]:
# General-purpose tokenizer on the tweet: '@' and '#' are split from their words and the emoji run stays one token.
tokens = word_tokenize(doc)
tokens

['@',
 'john',
 'This',
 'product',
 'is',
 'really',
 'cool',
 '!',
 '!',
 '!',
 '😀😃😄😁😆😅',
 '#',
 'awesome']

In [12]:
from nltk.tokenize import TweetTokenizer
# Social-media-aware tokenizer: keeps '@john' and '#awesome' intact and emits each emoji as its own token.
tweet_tokenizer = TweetTokenizer()
tokens = tweet_tokenizer.tokenize(doc)
tokens

['@john',
 'This',
 'product',
 'is',
 'really',
 'cool',
 '!',
 '!',
 '!',
 '😀',
 '😃',
 '😄',
 '😁',
 '😆',
 '😅',
 '#awesome']

In [13]:
## Tokenizing IMDB movie reviews
# Read the review/sentiment CSV directly from GitHub (needs network access) and preview the first two rows.
url = 'https://raw.githubusercontent.com/skathirmani/datasets/master/imdb_sentiment.csv'
imdb = pd.read_csv(url)
imdb.head(2)

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0


In [15]:
# Lower-case every review, then show the first five alongside their word tokens.
docs = imdb['review'].str.lower()
tokenizer = RegexpTokenizer(r'\w+')
for doc in docs.head():
    tokens = tokenizer.tokenize(doc)
    # One newline-separated print: raw text, token list, divider.
    print(doc, tokens, '-------', sep='\n')


a very, very, very slow-moving, aimless movie about a distressed, drifting young man.  
['a', 'very', 'very', 'very', 'slow', 'moving', 'aimless', 'movie', 'about', 'a', 'distressed', 'drifting', 'young', 'man']
-------
not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  
['not', 'sure', 'who', 'was', 'more', 'lost', 'the', 'flat', 'characters', 'or', 'the', 'audience', 'nearly', 'half', 'of', 'whom', 'walked', 'out']
-------
attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  
['attempting', 'artiness', 'with', 'black', 'white', 'and', 'clever', 'camera', 'angles', 'the', 'movie', 'disappointed', 'became', 'even', 'more', 'ridiculous', 'as', 'the', 'acting', 'was', 'poor', 'and', 'the', 'plot', 'and', 'lines', 'almost', 'non', 'existent']
-------
very little music or anything to speak of.  
['very', 'litt

In [16]:
# Build the tokenized corpus for the first five reviews: one list of word tokens per review.
docs = imdb['review'].str.lower()
tokenizer = RegexpTokenizer(r'\w+')
# A comprehension keeps its loop variable local, so the notebook-level `doc`
# sample string and `tokens` list are not overwritten as a side effect of this cell.
docs_cleaned = [tokenizer.tokenize(review) for review in docs.head()]
docs_cleaned

[['a',
  'very',
  'very',
  'very',
  'slow',
  'moving',
  'aimless',
  'movie',
  'about',
  'a',
  'distressed',
  'drifting',
  'young',
  'man'],
 ['not',
  'sure',
  'who',
  'was',
  'more',
  'lost',
  'the',
  'flat',
  'characters',
  'or',
  'the',
  'audience',
  'nearly',
  'half',
  'of',
  'whom',
  'walked',
  'out'],
 ['attempting',
  'artiness',
  'with',
  'black',
  'white',
  'and',
  'clever',
  'camera',
  'angles',
  'the',
  'movie',
  'disappointed',
  'became',
  'even',
  'more',
  'ridiculous',
  'as',
  'the',
  'acting',
  'was',
  'poor',
  'and',
  'the',
  'plot',
  'and',
  'lines',
  'almost',
  'non',
  'existent'],
 ['very', 'little', 'music', 'or', 'anything', 'to', 'speak', 'of'],
 ['the',
  'best',
  'scene',
  'in',
  'the',
  'movie',
  'was',
  'when',
  'gerardo',
  'is',
  'trying',
  'to',
  'find',
  'a',
  'song',
  'that',
  'keeps',
  'running',
  'through',
  'his',
  'head']]

In [18]:
# Re-assign the sample sentence to `doc` (the preceding cells rebound that name to other text).
doc = 'I visited my grandparents last week; We had a good time together'

In [19]:
import spacy
# Load spaCy's small English pipeline by name; the "en_core_web_sm" model package must be installed beforehand.
nlp = spacy.load("en_core_web_sm")

# Run the pipeline on the lower-cased text and print its tokens one per line
# (';' comes out as a separate token).
spacy_doc = nlp(doc.lower())
for token in spacy_doc:
  print(token)

i
visited
my
grandparents
last
week
;
we
had
a
good
time
together
