# Text service

one of the most popular task is text classification.

popular examples like
- domain classification
- sentiment analysis (sentiment label)

## example: name - sex classification

In [15]:
import collections
from nltk.corpus import names
import random

In [5]:
girl_names = names.words('female.txt')
boy_names = names.words('male.txt')

In [4]:
girl_names[:10]

['Abagael',
 'Abagail',
 'Abbe',
 'Abbey',
 'Abbi',
 'Abbie',
 'Abby',
 'Abigael',
 'Abigail',
 'Abigale']

In [6]:
len(boy_names)

2943

In [7]:
girl_names_ending_in_a = [name for name in girl_names if name.endswith('a')]
len(girl_names_ending_in_a)

1773

In [8]:
girl_names_ending_letters = collections.Counter([name[-1] for name in girl_names])

In [10]:
girl_names_ending_letters.most_common()

[('a', 1773),
 ('e', 1432),
 ('y', 461),
 ('n', 386),
 ('i', 317),
 ('l', 179),
 ('h', 105),
 ('s', 93),
 ('t', 68),
 ('r', 47),
 ('d', 39),
 ('o', 33),
 ('m', 13),
 ('g', 10),
 ('x', 10),
 ('b', 9),
 ('u', 6),
 ('w', 5),
 ('z', 4),
 ('k', 3),
 ('v', 2),
 ('p', 2),
 ('f', 2),
 (' ', 1),
 ('j', 1)]

In [11]:
boy_names_ending_letters = collections.Counter([name[-1] for name in boy_names])
boy_names_ending_letters.most_common()

[('n', 478),
 ('e', 468),
 ('y', 332),
 ('s', 230),
 ('d', 228),
 ('r', 190),
 ('l', 187),
 ('o', 165),
 ('t', 164),
 ('h', 93),
 ('m', 70),
 ('k', 69),
 ('i', 50),
 ('g', 32),
 ('a', 29),
 ('f', 25),
 ('c', 25),
 ('b', 21),
 ('p', 18),
 ('w', 17),
 ('v', 16),
 ('u', 12),
 ('z', 11),
 ('x', 10),
 ('j', 3)]

In [12]:
def extract_features(name: str):
    return {
        'last_letter': name[-1]
    }

In [18]:
boy_names_data = [(extract_features(name), 'boy') for name in boy_names]
girl_names_data = [(extract_features(name), 'girl') for name in girl_names]

In [19]:
data_agg = boy_names_data + girl_names_data
random.shuffle(data_agg)

In [22]:
cutoff = int(0.75 * len(data_agg))
train, test = data_agg[:cutoff], data_agg[cutoff+1:]

In [25]:
import nltk

In [26]:
# decision tree
name_tree = nltk.DecisionTreeClassifier.train(train)

In [30]:
examples = ['David', 'Alex', 'Alexander', 'Alexa', 'Becca', 'Rosy']
for x in examples:
    print('{}: {}'.format(x, name_tree.classify(extract_features(x))))

David: boy
Alex: girl
Alexander: boy
Alexa: girl
Becca: girl
Rosy: girl


In [31]:
# test accuracy
print(nltk.classify.accuracy(name_tree, test))

0.7682619647355163


In [33]:
# tree
print(name_tree.pretty_format())

last_letter= ? ........................................ girl
last_letter=a? ........................................ girl
last_letter=b? ........................................ boy
last_letter=c? ........................................ boy
last_letter=d? ........................................ boy
last_letter=e? ........................................ girl
last_letter=f? ........................................ boy
last_letter=g? ........................................ boy
last_letter=h? ........................................ girl
last_letter=i? ........................................ girl
last_letter=j? ........................................ boy
last_letter=k? ........................................ boy
last_letter=l? ........................................ boy
last_letter=m? ........................................ boy
last_letter=n? ........................................ boy
last_letter=o? ........................................ boy
last_letter=p? ....................

In [59]:
import string

def extract_features2(name: str):
    features = {
        'last_letter': name[-1],
        'vowel_count': len([c for c in name if c.lower() in 'aeiou']),
        'first_letter': name[0],
    }
    for c in string.ascii_lowercase:
        features['contains_' + c] = c in name
        features['count_' + c] = name.lower().count(c) 
    return features

In [66]:
def data_prep_split_train(func_feature, boy_names, girl_names):
    data = [(func_feature(name), 'boy') for name in boy_names] + [(func_feature(name), 'girl') for name in girl_names]
    random.shuffle(data)
    cutoff = int(0.75 * len(data))
    train, test = data[:cutoff], data[cutoff+1:]
    return nltk.DecisionTreeClassifier.train(train), train, test

In [67]:
name_tree2, train, test = data_prep_split_train(extract_features2, boy_names, girl_names)

In [68]:
nltk.classify.accuracy(name_tree2, test)

0.7803526448362721

In [48]:
# tree
print(name_tree2.pretty_format())

last_letter= ? ........................................ girl
last_letter=a? ........................................ girl
last_letter=b? ........................................ boy
  vowel_count=2? ...................................... boy
  vowel_count=0? ...................................... girl
  vowel_count=1? ...................................... boy
last_letter=c? ........................................ boy
last_letter=d? ........................................ boy
last_letter=e? ........................................ girl
  vowel_count=2? ...................................... girl
  vowel_count=3? ...................................... girl
  vowel_count=4? ...................................... girl
  vowel_count=5? ...................................... girl
  vowel_count=6? ...................................... girl
  vowel_count=1? ...................................... boy
last_letter=f? ........................................ boy
last_letter=g? ................

## Scikit-learn ML

In [85]:
from nltk.corpus import names
from sklearn.model_selection import train_test_split

def extract_features(name): 
    """Get the features used for name classification """
    return {
        'last_letter': name[-1] 
    }

def extract_features_big(name: str):
    features = {
        'last_letter': name[-1],
        'vowel_count': len([c for c in name if c.lower() in 'aeiou']),
        'first_letter': name[0],
    }
    for c in string.ascii_lowercase:
        features['contains_' + c] = c in name
        features['count_' + c] = name.lower().count(c) 
    return features

# Get the names
boy_names = names.words('male.txt') 
girl_names = names.words('female.txt')

# Build the dataset
boy_names_dataset = [(extract_features_big(name), 'boy') for name in boy_names] 
girl_names_dataset = [(extract_features_big(name), 'girl') for name in girl_names]

# Put all the names together
data = boy_names_dataset + girl_names_dataset 

# Split the data in features and classes
X, y = list(zip(*data))

- build dataset

In [87]:
# split and randomize
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, shuffle=True) 

- tree in sklearn

In [89]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
dict_vec = DictVectorizer()
name_tree = DecisionTreeClassifier()

In [90]:
dict_vec.fit(X_train)
X_train_vectorized = dict_vec.transform(X_train)

In [92]:
name_tree.fit(X_train_vectorized, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [94]:
name_tree.score(X_train_vectorized, y_train)

0.9652567975830816

In [93]:
X_test_vectorized = dict_vec.transform(X_test)
name_tree.score(X_test_vectorized, y_test)

0.7472306143001007

## try on existing corpus

In [95]:
from nltk.corpus import reuters
print(reuters.categories())

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [97]:
from sklearn.datasets import fetch_20newsgroups
news20 = fetch_20newsgroups(subset='train') 

  return f(*args, **kwds)
Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [102]:
news20.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## build sample dataset
- download data from pocket

In [103]:
CATEGORIES = { 'business': [
        "Business",
        "Marketing",
        "Management"
    ],
'family': [
        "Family",
        "Children",
        "Parenting"
    ], 'politics': [
        "Politics",
        "Presidential Elections",
        "Politicians",
        "Government",
        "Congress"
    ],
'sport': [
        "Baseball",
        "Basketball",
        "Running",
        "Sport",
        "Skiing",
        "Gymnastics",
        "Tenis",
        "Football",
        "Soccer"
    ], 'health': [
        "Health",
        "Weightloss",
        "Wellness",
        "Well being",
        "Vitamins",
        "Healthy Food",
        "Healthy Diet"
    ],
'economics': [
        "Economics",
        "Finance",
        "Accounting"
    ], 'celebrities': [
        "Celebrities",
        "Showbiz"
    ],
'medical': [
        "Medicine",
        "Doctors",
        "Health System",
        "Surgery",
        "Genetics",
        "Hospital"
    ],
'science & technology': [
        "Galaxy",
        "Physics",
        "Technology",
        "Science"
    ],
'information technology': [
        "Artificial Intelligence",
        "Search Engine",
        "Software",
        "Hardware",
        "Big Data",
        "Analytics",
        "Programming"
    ],
'education': [
        "Education",
        "Students",
        "University"
    ], 'media': [
        "Newspaper",
        "Reporters",
        "Social Media"
    ],
'cooking': [
        "Cooking",
        "Gastronomy",
        "Cooking Recipes",
        "Paleo Cooking",
        "Vegan Recipes"
    ], 'religion': [
        "Religion",
        "Church",
        "Spirituality"
    ],
'legal': [
        "Legal",
        "Lawyer",
        "Constitution"
    ], 'history': [
        "Archeology",
        "History",
        "Middle Ages"
    ],
'nature & ecology': [
        "Nature",
        "Ecology",
        "Endangered Species",
        "Permaculture"
    ], 'travel': [
        "Travel",
        "Tourism",
        "Globetrotter"
    ],
'meteorology': [
        "Tornado",
        "Meteorology",
        "Weather Prediction"
    ], 'automobiles': [
        "Automobiles",
        "Motorcycles",
        "Formula 1",
        "Driving"
    ],
'art & traditions': [
        "Art",
        "Artwork",
        "Traditions",
        "Artisan",
        "Pottery",
        "Painting",
        "Artist"
    ],
'beauty & fashion': [
        "Beauty",
        "Fashion",
        "Cosmetics",
        "Makeup"
    ],
'relationships': [
        "Relationships",
        "Relationship Advice",
        "Marriage",
        "Wedding"
    ], 'astrology': [
        "Astrology",
        "Zodiac",
        "Zodiac Signs",
        "Horoscope"
    ],
'diy': [
'Gardening', 'Construction', 'Decorating', 'Do it Yourself', 'Furniture'
    ]
}

In [110]:
import uuid
import atexit
import urllib
import random
import requests
import pandas as pd
from time import sleep, time
from bs4 import BeautifulSoup
from newspaper import Article, ArticleException

In [112]:

POCKET_BASE_URL = 'https://getpocket.com/explore/%s'
df = pd.DataFrame(columns=['title', 'excerpt', 'url', 'file_name', "keyword", "category"])

@atexit.register
def save_dataframe():
    """ Before exiting, make sure we save the dataframe to a CSV file """ 
    dataframe_name = "dataframe_{0}.csv".format(time()) 
    df.to_csv(dataframe_name, index=False)

# shuffle categories
categories = list(CATEGORIES.items()) 
random.shuffle(categories)

In [116]:
POCKET_BASE_URL % urllib.parse.quote_plus('david')

'https://getpocket.com/explore/david'

In [128]:
for category_name, keywords in categories:
    print('exploring category={}'.format(category_name))
    for kw in keywords:
        result = requests.get(POCKET_BASE_URL % urllib.parse.quote_plus(kw))
        soup = BeautifulSoup(result.content, features='html')
        media_items = soup.find_all(attrs={"class": 'media_item'})
        for item_html in media_items:
            title_html = item_html.find_all(attrs={'class': 'title'})[0] 
            title = title_html.text
            
            url = title_html.a['data-saveurl']
            print("Indexing article: \"{0}\" from \"{1}\"".format(title, url))
            
            excerpt = item_html.find_all(attrs={'class': 'excerpt'})[0].text
            try:
                article = Article(url)
                article.download()
                article.parse()
                content = article.text
            except ArticleException as e:
                print("Encoutered exception when parsing \"{0}\": \"{1}\"".format(url, str(e))) 
                continue
            
            if not content:
                print("Couldn't extract content from \"{0}\"".format(url)) 
                continue
                
            file_name = "{0}.txt".format(str(uuid.uuid4()))
            with open('/Users/shihaozhang/Desktop/ML_tech/nlp_for_hackers/{0}'.format(file_name), 'w+') as text_file:
                text_file.write(content)

            # Append the row in our dataframe
            df.loc[len(df)] = [title, excerpt, url, file_name, kw, category_name] 
            # Need to sleep in order to not get blocked
            sleep(random.randint(5, 15))            

exploring category=legal
Indexing article: "The Lawyer Whose Clients Didn’t Exist" from "https://www.theatlantic.com/magazine/archive/2020/05/bp-oil-spill-shrimpers-settlement/609082/"
Indexing article: "Uber Says Engineer Is on His Own for $180 Million to Google" from "https://www.bloomberg.com/news/articles/2020-04-18/uber-says-guilty-engineer-on-his-own-for-180-million-to-google"
Indexing article: "Lawyers Get Pay Cuts, Furloughs as Firms Grapple With Downturn" from "https://www.bloomberg.com/news/articles/2020-04-18/lawyers-get-pay-cuts-furloughs-as-firms-grapple-with-downturn"
Indexing article: "Nobel laureates condemn 'judicial harassment' of environmental lawyer" from "https://www.theguardian.com/world/2020/apr/18/nobel-laureates-condemn-judicial-harassment-of-environmental-lawyer"
Indexing article: "Covid-19 checkpoints targeting out-of-state residents draw complaints and legal scrutiny" from "https://www.washingtonpost.com/local/trafficandcommuting/covid-19-checkpoints-targeti

KeyboardInterrupt: 

- load data from file instead of downloading (took too long)

## Text Feature Extractor

- bigram / 3gram

In [129]:
text = """
How much wood does a woodchuck chuck if a woodchuck could chuck wood
"""

In [132]:
w2g_features = collections.Counter(nltk.bigrams(text.lower().split()))
w3g_features = collections.Counter(nltk.trigrams(text.lower().split()))

In [134]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(lowercase=True)

In [136]:
vectorizer.fit([text])
print(vectorizer.transform([text]))

  (0, 0)	2
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	2
  (0, 7)	2


In [139]:
vectorizer.vocabulary_

{'how': 3,
 'much': 5,
 'wood': 6,
 'does': 2,
 'woodchuck': 7,
 'chuck': 0,
 'if': 4,
 'could': 1}

In [144]:
# out of vocab text
result = vectorizer.transform(["Unseen words", "BLT sandwich"]) 
print("matrix: {}" .format(result))
print(result.shape)

matrix: 
(2, 8)


In [150]:
result.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0]])

## Bayes

In [169]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

pd.set_option('max_colwidth', 100000)

print(MultinomialNB.__doc__[:415])


    Naive Bayes classifier for multinomial models

    The multinomial Naive Bayes classifier is suitable for classification with
    discrete features (e.g., word counts for text classification). The
    multinomial distribution normally requires integer feature counts. However,
    in practice, fractional counts such as tf-idf may also work.

    Read more in the :ref:`User Guide <multinomial_naive_bayes>`.




- data collection & train test split

In [155]:
base_url = '/Users/shihaozhang/Desktop/ML_tech/nlp_for_hackers'

In [157]:
data = pd.read_csv('{}/text_analysis_data.csv'.format(base_url))

In [161]:
data.columns

Index(['Unnamed: 0', 'title', 'excerpt', 'url', 'file_name', 'keyword',
       'category'],
      dtype='object')

In [175]:
text_samples, labels = [], []
for idx, row in data.iterrows():
    with open('./clean_data/{0}'.format(row['file_name']), 'r') as text_file:
        text = text_file.read()
        text_samples.append(text)
        labels.append(row['category'])

In [184]:
X_train, X_test, y_train, y_test = train_test_split(text_samples, labels, test_size=0.2, shuffle=True)

- featurization

In [187]:
vectorizer = CountVectorizer(lowercase=True)
vectorizer.fit(X_train)
X_train_vectorized = vectorizer.transform(X_train)

In [197]:
# train
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [198]:
X_test_vectorizaed = vectorizer.transform(X_test)
print('train accuracy: {}'.format(model.score(X_train_vectorized, y_train)))
print('test accuracy: {}'.format(model.score(X_test_vectorizaed, y_test)))

train accuracy: 0.811314791403287
test accuracy: 0.6449778900821226


## classifying using MultinomialNB

In [199]:
import random
random_choice = random.randint(0, len(X_test))
text, label = X_test[random_choice], y_test[random_choice]

In [212]:
for i in range(2):
    random_choice = random.randint(0, len(X_test))
    text, label = X_test[random_choice], y_test[random_choice]
    print(text, '\n------------------------------------')
    print('label={}'.format(label))

Humanity has conquered the world. It's hard to appreciate what that means, but the video above, by WorldPopulationHistory.org, shows just how incredible the growth and expansion of humanity has been over the past 2,000 years.

Here are some of the notable moments in the video:

The map begins at 1:18, showing human population a little more than 2,000 years ago, with each yellow dot representing 1 million people in an area. At this point, there are 170 million people on Earth.

At 3:20, the Mongol invasion of China begins in the early 13th century, killing huge segments of the population. The Mongol conquests are still considered one of the deadliest wars in history, killing tens of millions of people at a time when the world population was much smaller — around 360 million.

At 3:30, in the 14th century, the Black Death spreads around the world, killing more than 20 million people in Europe — nearly one-third of the continent's population — and 75 million around the world, when the glo

In [206]:
text_vectorized = vectorizer.transform([text])
print(model.predict(text_vectorized))

['meteorology']


- persisting model

In [208]:
import time
from sklearn.externals import joblib

timestamp = int(time.time())
# Save the vectorizer
joblib.dump(vectorizer, './text_analysis_vectorizer_%s.joblib' % timestamp)
# Save the classifier
joblib.dump(model, './text_analysis_classifier_%s.joblib' % timestamp)

['./text_analysis_classifier_1587325013.joblib']

- loading model

In [210]:
# Load the vectorizer
vectorizer = joblib.load('./text_analysis_vectorizer_%s.joblib' % timestamp) 
# Load the classifier
classifier = joblib.load('./text_analysis_classifier_%s.joblib' % timestamp)
# Test the loaded cobmponent by classifying a text
print(classifier.predict(vectorizer.transform([text])))

['meteorology']
