In [26]:
# Import necessary modules
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import regexp_tokenize
import re

# import nltk

# nltk.download("punkt")

scene_one = "In this exercise, you'll utilize re.search() and re.match() to find specific tokens. Both search and match expect regex patterns, similar to those you defined in an earlier exercise. You'll apply these regex library methods to the same Monty Python text from the nltk corpora. You have both scene_one and sentences available from the last exercise; now you can use them with [re.search()] and re.match() to extract and match more text."

# Split scene_one into sentences: sentences
sentences = sent_tokenize(scene_one)
sentences

["In this exercise, you'll utilize re.search() and re.match() to find specific tokens.",
 'Both search and match expect regex patterns, similar to those you defined in an earlier exercise.',
 "You'll apply these regex library methods to the same Monty Python text from the nltk corpora.",
 'You have both scene_one and sentences available from the last exercise; now you can use them with [re.search()] and re.match() to extract and match more text.']

In [27]:
# Use word_tokenize to tokenize the fourth sentence: tokenized_sent
tokenized_sent = word_tokenize(sentences[3])

tokenized_sent

['You',
 'have',
 'both',
 'scene_one',
 'and',
 'sentences',
 'available',
 'from',
 'the',
 'last',
 'exercise',
 ';',
 'now',
 'you',
 'can',
 'use',
 'them',
 'with',
 '[',
 're.search',
 '(',
 ')',
 ']',
 'and',
 're.match',
 '(',
 ')',
 'to',
 'extract',
 'and',
 'match',
 'more',
 'text',
 '.']

In [None]:

# Make a set of unique tokens in the entire scene: unique_tokens
unique_tokens = set(word_tokenize(scene_one))

# Print the unique tokens result
print(unique_tokens)

In [15]:
# Search for the first occurrence of "coconuts" in scene_one: match
match = re.search("specific", scene_one)

# Print the start and end indexes of match
print(match.start(), match.end(), match)

68 76 <re.Match object; span=(68, 76), match='specific'>


In [17]:
# Write a regular expression to search for anything in square brackets: pattern1
pattern1 = r"\[.*]"

# Use re.search to find the first text in square brackets
print(re.search(pattern1, scene_one).group()[1:-1])

re.search()


In [19]:
my_str = 'match lowercase spaces nums like 12, but no commas'

re.match('[a-z0-9 ]+', my_str)

<re.Match object; span=(0, 35), match='match lowercase spaces nums like 12'>

In [22]:
match_digits_and_words = "(\d+|\w+|\.)"

re.findall(match_digits_and_words, "He has 11 cats.")

['He', 'has', '11', 'cats', '.']

In [24]:
my_string = "SOLDIER #1: Found them? In Mercea? The coconut's tropical!"
pattern = r"(\w+|#\d|\?|!)"
regexp_tokenize(my_string, pattern)

['SOLDIER',
 '#1',
 'Found',
 'them',
 '?',
 'In',
 'Mercea',
 '?',
 'The',
 'coconut',
 's',
 'tropical',
 '!']

In [25]:
from nltk.tokenize import TweetTokenizer
tweets = ['This is the best #nlp exercise ive found online! #python',
          '#NLP is super fun! <3 #learning',
          'Thanks @datacamp :) #nlp #python']

# Use the TweetTokenizer to tokenize all tweets into one list
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)

[['This', 'is', 'the', 'best', '#nlp', 'exercise', 'ive', 'found', 'online', '!', '#python'], ['#NLP', 'is', 'super', 'fun', '!', '<3', '#learning'], ['Thanks', '@datacamp', ':)', '#nlp', '#python']]


## Bag-of-Word

In [30]:
from collections import Counter

Counter(word_tokenize(scene_one.lower())).most_common(10)

[('you', 5),
 ('and', 5),
 ('(', 4),
 (')', 4),
 ('to', 4),
 ('.', 4),
 ('exercise', 3),
 ('the', 3),
 ('in', 2),
 (',', 2)]

In [31]:
tokens = [w for w in word_tokenize(scene_one.lower()) if w.isalpha()]
Counter(tokens).most_common(10)

[('you', 5),
 ('and', 5),
 ('to', 4),
 ('exercise', 3),
 ('the', 3),
 ('in', 2),
 ('both', 2),
 ('match', 2),
 ('regex', 2),
 ('text', 2)]

In [34]:
from nltk.corpus import stopwords

no_stops = [t for t in tokens if t not in stopwords.words("english")]

Counter(no_stops).most_common(10)

[('exercise', 3),
 ('match', 2),
 ('regex', 2),
 ('text', 2),
 ('utilize', 1),
 ('find', 1),
 ('specific', 1),
 ('tokens', 1),
 ('search', 1),
 ('expect', 1)]

In [38]:
# Import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

# Retain alphabetic words: alpha_only
alpha_only = [t for t in tokens if t.isalpha()]

# Remove all stop words: no_stops
no_stops = [t for t in alpha_only if t not in stopwords.words("english")]

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]

# Create the bag-of-words: bow
bow = Counter(lemmatized)

# Print the 10 most common tokens
bow.most_common(10)

[('exercise', 3),
 ('match', 2),
 ('regex', 2),
 ('text', 2),
 ('utilize', 1),
 ('find', 1),
 ('specific', 1),
 ('token', 1),
 ('search', 1),
 ('expect', 1)]

## Gensim

In [42]:
from gensim.corpora.dictionary import Dictionary

my_documents = [
    "The movie was about a spaceship and aliens.",
    "I really liked the movie!",
    "Awesome action scenes, but boring characters.",
    "The movie was awful! I hate alien films.",
    "Space is cool! I liked the movie.",
    "More space films, please!",
]

tokenized_docs = [word_tokenize(doc.lower()) for doc in my_documents]

dictionary = Dictionary(tokenized_docs)

dictionary.token2id

{'.': 0,
 'a': 1,
 'about': 2,
 'aliens': 3,
 'and': 4,
 'movie': 5,
 'spaceship': 6,
 'the': 7,
 'was': 8,
 '!': 9,
 'i': 10,
 'liked': 11,
 'really': 12,
 ',': 13,
 'action': 14,
 'awesome': 15,
 'boring': 16,
 'but': 17,
 'characters': 18,
 'scenes': 19,
 'alien': 20,
 'awful': 21,
 'films': 22,
 'hate': 23,
 'cool': 24,
 'is': 25,
 'space': 26,
 'more': 27,
 'please': 28}

## Creating a bag-of-word corpus

In [40]:
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(5, 1), (7, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(0, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)],
 [(0, 1),
  (5, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1)],
 [(0, 1), (5, 1), (7, 1), (9, 1), (10, 1), (11, 1), (24, 1), (25, 1), (26, 1)],
 [(9, 1), (13, 1), (22, 1), (26, 1), (27, 1), (28, 1)]]

## Tf-idf with gensim

Term frequency - inverse document frequency. Allows you to determine the most important words in each document. Each corpus may have shared words beyond just stopwords

In [43]:
from gensim.models.tfidfmodel import TfidfModel

tfidf = TfidfModel(corpus)
tfidf[corpus[1]]

[(5, 0.1746298276735174),
 (7, 0.1746298276735174),
 (9, 0.1746298276735174),
 (10, 0.29853166221463673),
 (11, 0.47316148988815415),
 (12, 0.7716931521027908)]

In [45]:
# Sort the weights from highest to lowest: sorted_tfidf_weights
tfidf_weights = tfidf[1]
sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

# Print the top 5 weighted words
for term_id, weight in sorted_tfidf_weights[:5]:
    print(dictionary.get(term_id), weight)

TypeError: 'int' object is not iterable