In [1]:
import nltk

## Example: part-of-speech tagging with `pos_tag`

In [2]:
# Sample sentence used as input for the part-of-speech tagging demo.
uc1 = "The University of Chicago is a private research university in Chicago, Illinois"

In [3]:
# Split the sample sentence into word/punctuation tokens.
text = nltk.tokenize.word_tokenize(uc1)
# Tag every token; the cell displays a list of (token, tag) pairs.
nltk.pos_tag(text)

[('The', 'DT'),
 ('University', 'NNP'),
 ('of', 'IN'),
 ('Chicago', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('private', 'JJ'),
 ('research', 'NN'),
 ('university', 'NN'),
 ('in', 'IN'),
 ('Chicago', 'NNP'),
 (',', ','),
 ('Illinois', 'NNP')]

In [4]:
# Print the description and example words for the 'NNP' tag seen in the output above.
nltk.help.upenn_tagset('NNP')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


## Get raw text from a URL and run NLP on it
- Preprocessing
- Get token frequencies
- Get bigram frequencies
- Get trigram frequencies

In [5]:
from bs4 import BeautifulSoup
import urllib.request

# Page whose visible text is analysed in the rest of the notebook.
url = "http://www.zhaimobile.com"

# Open the response as a context manager so the underlying HTTP connection is
# closed as soon as the body has been read, instead of lingering in the kernel.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page.read(), "lxml")

# Drop all markup and keep only the text content of the parsed page.
raw = soup.get_text()


In [6]:
# Tokenize the text scraped from the page.
raw_words = nltk.word_tokenize(raw)
# Clean words: keep purely alphabetic tokens longer than one character and
# lower-case them. str.isalpha() already implies str.isalnum(), so one check
# covers both filters.
words = [token.lower() for token in raw_words if len(token) > 1 and token.isalpha()]
# Stop words
stopwords = set(nltk.corpus.stopwords.words('english'))
words = [token for token in words if token not in stopwords]
# Lemma or Stem; use Lemmatizer for better results
wnl = nltk.WordNetLemmatizer()
cleaned_words = list(map(wnl.lemmatize, words))


In [7]:
# Count how often each cleaned token occurs.
fdist = nltk.FreqDist(cleaned_words)
# Display the ten most frequent tokens with their counts.
fdist.most_common(10)

[('development', 8),
 ('data', 6),
 ('web', 6),
 ('skill', 6),
 ('game', 6),
 ('mo', 5),
 ('zhai', 5),
 ('science', 5),
 ('software', 5),
 ('solid', 5)]

In [8]:
# Materialise the pairs of adjacent cleaned tokens as a list.
bgs = list(nltk.bigrams(cleaned_words))
# Count the pairs and display the ten most frequent ones.
fdist_bgs = nltk.FreqDist(bgs)
fdist_bgs.most_common(10)

[(('mo', 'zhai'), 3),
 (('data', 'science'), 3),
 (('programming', 'skill'), 3),
 (('side', 'project'), 3),
 (('science', 'instructor'), 2),
 (('software', 'engineer'), 2),
 (('browser', 'support'), 2),
 (('support', 'video'), 2),
 (('video', 'tag'), 2),
 (('tag', 'suggest'), 2)]

In [9]:
# Materialise the triples of adjacent cleaned tokens as a list.
tgs = list(nltk.trigrams(cleaned_words))
# Count the triples and display the ten most frequent ones.
fdist_tgs = nltk.FreqDist(tgs)
fdist_tgs.most_common(10)

[(('data', 'science', 'instructor'), 2),
 (('browser', 'support', 'video'), 2),
 (('support', 'video', 'tag'), 2),
 (('video', 'tag', 'suggest'), 2),
 (('tag', 'suggest', 'upgrade'), 2),
 (('suggest', 'upgrade', 'browser'), 2),
 (('python', 'swift', 'sql'), 2),
 (('enterprise', 'web', 'application'), 2),
 (('skill', 'enthusiastic', 'heart'), 2),
 (('knowledge', 'programming', 'skill'), 2)]

## Exploring TextBlob

In [10]:
from textblob import TextBlob

In [11]:
# Wrap the unprocessed page text (not the cleaned token list) in a TextBlob.
blob = TextBlob(raw)

In [12]:
# Display the first ten (token, part-of-speech tag) pairs.
blob.tags[:10]

[('Mo', 'NNP'),
 ('Zhai', 'NNP'),
 ('|', 'NNP'),
 ('翟墨', 'NNP'),
 ('Mo', 'NNP'),
 ('Zhai', 'NNP'),
 ('Data', 'NNP'),
 ('Science', 'NNP'),
 ('Instructor', 'NNP'),
 ('Software', 'NNP')]

In [13]:
# Display the first ten noun phrases extracted from the page text.
blob.noun_phrases[:10]

WordList(['mo zhai', '| 翟墨', 'mo zhai data', 'instructor software engineer', 'zhai @ uchicago.edu', 'mo', 'portfolio contact linkedin github twitter email résumé', 'mo crescat', 'scientia vita excolatur', 'loading coverr'])

In [15]:
# Split the blob into sentences.
b_sentences = blob.sentences
# Print two sample sentences (indices 24 and 25).
print (b_sentences[24:26])

[Sentence(".NET
[C#, ASP.NET, Entity Framework, Sql Server]
Implemented variety web services (APIs) applications for mobile apps."), Sentence("Working on business web applicaion and side projects using MVC and Enitty Framowrk.")]


#### To process the cleaned-up NLTK tokens with TextBlob, we first have to join the list of words into a single string

In [16]:
# Take a shallow copy of the cleaned tokens so the original list stays untouched.
words_list = list(cleaned_words)
# TextBlob expects a plain string, so glue the tokens together with single spaces.
words_string = ' '.join(words_list)

In [17]:
# Rebinds `blob` (previously built from the raw page text) to a TextBlob over
# the cleaned, space-joined tokens; later cells operate on this version.
blob = TextBlob(words_string)
# Display the first ten (token, part-of-speech tag) pairs of the cleaned text.
blob.tags[:10]

[('mo', 'NN'),
 ('zhai', 'NN'),
 ('翟墨', 'NNP'),
 ('mo', 'NN'),
 ('zhai', 'NN'),
 ('data', 'NNS'),
 ('science', 'NN'),
 ('instructor', 'NN'),
 ('software', 'NN'),
 ('engineer', 'NN')]

In [18]:
# Display the first ten tokens of the cleaned blob.
blob.words[:10]

WordList(['mo', 'zhai', '翟墨', 'mo', 'zhai', 'data', 'science', 'instructor', 'software', 'engineer'])

In [19]:
# Display the first ten bigrams (runs of 2 consecutive words).
blob.ngrams(2)[:10]

[WordList(['mo', 'zhai']),
 WordList(['zhai', '翟墨']),
 WordList(['翟墨', 'mo']),
 WordList(['mo', 'zhai']),
 WordList(['zhai', 'data']),
 WordList(['data', 'science']),
 WordList(['science', 'instructor']),
 WordList(['instructor', 'software']),
 WordList(['software', 'engineer']),
 WordList(['engineer', 'zhai'])]

In [20]:
# Display the first ten trigrams (runs of 3 consecutive words).
blob.ngrams(3)[:10]

[WordList(['mo', 'zhai', '翟墨']),
 WordList(['zhai', '翟墨', 'mo']),
 WordList(['翟墨', 'mo', 'zhai']),
 WordList(['mo', 'zhai', 'data']),
 WordList(['zhai', 'data', 'science']),
 WordList(['data', 'science', 'instructor']),
 WordList(['science', 'instructor', 'software']),
 WordList(['instructor', 'software', 'engineer']),
 WordList(['software', 'engineer', 'zhai']),
 WordList(['engineer', 'zhai', 'mo'])]

In [21]:
# Display the first ten 4-grams (runs of 4 consecutive words).
blob.ngrams(4)[:10]

[WordList(['mo', 'zhai', '翟墨', 'mo']),
 WordList(['zhai', '翟墨', 'mo', 'zhai']),
 WordList(['翟墨', 'mo', 'zhai', 'data']),
 WordList(['mo', 'zhai', 'data', 'science']),
 WordList(['zhai', 'data', 'science', 'instructor']),
 WordList(['data', 'science', 'instructor', 'software']),
 WordList(['science', 'instructor', 'software', 'engineer']),
 WordList(['instructor', 'software', 'engineer', 'zhai']),
 WordList(['software', 'engineer', 'zhai', 'mo']),
 WordList(['engineer', 'zhai', 'mo', 'portfolio'])]

## Languages and Translation

In [22]:
# Ask TextBlob which language the cleaned text is written in.
# NOTE(review): TextBlob's changelog marks detect_language() as deprecated and
# later removed — confirm it exists in the installed TextBlob version.
blob.detect_language()

'en'

In [23]:
# Translate a short English sentence to Spanish (target code "es").
# NOTE(review): TextBlob's changelog marks translate() as deprecated — confirm
# it is available in the installed TextBlob version.
b = TextBlob("Simple is better than complex")
b.translate(to="es")

TextBlob("Simple es mejor que complejo")

In [24]:
# Translate the same English sentence to French (target code "fr").
# NOTE(review): TextBlob's changelog marks translate() as deprecated — confirm
# it is available in the installed TextBlob version.
b = TextBlob("Simple is better than complex")
b.translate(to="fr")

TextBlob("Simple vaut mieux que complexe")

## TOPWORDS - Using TF-IDF

In [25]:
# Document 1: the unprocessed text scraped from the web page.
document1 = TextBlob(raw)

# Document 2: a short passage about the snake genus Python.
document2 = TextBlob("""Python, from the Greek word (πύθων/πύθωνας), is a genus of
nonvenomous pythons[2] found in Africa and Asia. Currently, 7 species are
recognised.[2] A member of this genus, P. reticulatus, is among the longest
snakes known.""")

# Document 3: a short passage about the Colt Python revolver.
# The text uses a literal "&" rather than the HTML entity "&amp;", which would
# otherwise leak a bogus "amp" token into the word list and the TF-IDF scores.
document3 = TextBlob("""The Colt Python is a .357 Magnum caliber revolver formerly
manufactured by Colt's Manufacturing Company of Hartford, Connecticut.
It is sometimes referred to as a "Combat Magnum".[1] It was first introduced
in 1955, the same year as Smith & Wesson's M29 .44 Magnum. The now discontinued
Colt Python targeted the premium revolver market segment. Some firearm
collectors and writers such as Jeff Cooper, Ian V. Hogg, Chuck Hawks, Leroy
Thompson, Renee Smeets and Martin Dougherty have described the Python as the
finest production revolver ever made.""")

# Corpus over which TF-IDF is computed.
bloblist = [document1, document2, document3]

In [26]:
import math
def tf(word, blob):
    """Return the term frequency of ``word`` in ``blob``.

    The frequency is ``blob.words.count(word)`` divided by the number of
    entries in ``blob.words``.

    Args:
        word: The term to look up.
        blob: Any object exposing a list-like ``words`` attribute.

    Returns:
        The relative frequency as a float, or ``0.0`` when the document has
        no words at all (instead of raising ``ZeroDivisionError``).
    """
    words = blob.words
    total = len(words)
    if total == 0:
        # An empty document cannot contain the word; avoid dividing by zero.
        return 0.0
    return words.count(word) / total

def n_containing(word, bloblist):
    """Count how many documents in ``bloblist`` have ``word`` in their ``words``."""
    hits = 0
    for document in bloblist:
        if word in document.words:
            hits += 1
    return hits

def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` across ``bloblist``.

    Computed as ``log(N / (1 + n))`` where ``N`` is the number of documents and
    ``n`` the number of documents containing ``word``. The ``1 +`` keeps the
    denominator non-zero for unseen words; the result is zero or negative when
    the word appears in all but one, or in all, of the documents.
    """
    document_count = len(bloblist)
    smoothed_matches = 1 + n_containing(word, bloblist)
    return math.log(document_count / smoothed_matches)

def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of ``word`` for ``blob`` within ``bloblist``.

    The score is the product of ``tf(word, blob)`` and ``idf(word, bloblist)``.
    """
    term_frequency = tf(word, blob)
    inverse_document_frequency = idf(word, bloblist)
    return term_frequency * inverse_document_frequency

In [26]:
# For every document, score each of its words by TF-IDF and print the 20
# highest-scoring ones. Note that the loop variable rebinds the notebook-level
# name `blob`.
for i, blob in enumerate(bloblist):
    print("Top words in document {}".format(i + 1))
    # Repeated words collapse onto one dictionary key, keeping first-seen order.
    scores = {}
    for word in blob.words:
        scores[word] = tfidf(word, blob, bloblist)
    # Highest score first; ties keep their first-seen order (stable sort).
    sorted_words = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    for word, score in sorted_words[:20]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in document 1
	Word: I, TF-IDF: 0.00636
	Word: Development, TF-IDF: 0.00636
	Word: development, TF-IDF: 0.00636
	Word: C, TF-IDF: 0.00557
	Word: on, TF-IDF: 0.00557
	Word: Data, TF-IDF: 0.00477
	Word: data, TF-IDF: 0.00477
	Word: web, TF-IDF: 0.00477
	Word: skills, TF-IDF: 0.00477
	Word: for, TF-IDF: 0.00477
	Word: Game, TF-IDF: 0.00477
	Word: Skills, TF-IDF: 0.00477
	Word: Web, TF-IDF: 0.00477
	Word: game, TF-IDF: 0.00477
	Word: Mo, TF-IDF: 0.00398
	Word: Zhai, TF-IDF: 0.00398
	Word: Science, TF-IDF: 0.00398
	Word: Software, TF-IDF: 0.00398
	Word: zhai, TF-IDF: 0.00398
	Word: solid, TF-IDF: 0.00398
Top words in document 2
	Word: genus, TF-IDF: 0.02192
	Word: 2, TF-IDF: 0.02192
	Word: A, TF-IDF: 0.02192
	Word: Greek, TF-IDF: 0.01096
	Word: word, TF-IDF: 0.01096
	Word: πύθων/πύθωνας, TF-IDF: 0.01096
	Word: nonvenomous, TF-IDF: 0.01096
	Word: pythons, TF-IDF: 0.01096
	Word: found, TF-IDF: 0.01096
	Word: Africa, TF-IDF: 0.01096
	Word: Asia, TF-IDF: 0.01096
	Word: Currently, TF-I