# Exercises


Take the work we did in the lessons further:

What other types of models (i.e. different classification algorithms) could you use?
How do the models compare when trained on term frequency data alone, instead of TF-IDF values?

In [5]:
import re
import unicodedata
from pprint import pprint

import nltk
import pandas as pd

import acquire as a
import prepare as prep

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [21]:
def clean(text: str) -> list:
    """Normalize, tokenize, and lemmatize a piece of text.

    Processing steps, in order:

    1. Unicode-normalize with NFKD so accented letters decompose into a base
       letter plus combining marks, then drop every non-ASCII byte. Without
       the normalization step an accented letter is removed entirely
       ("café" -> "caf"); with it only the accent is lost ("café" -> "cafe").
    2. Lowercase the text.
    3. Remove every character that is neither a word character nor
       whitespace, then split on whitespace.
    4. Drop tokens found in NLTK's English stopword list and lemmatize the
       remaining tokens with the WordNet lemmatizer. The stopword check is
       applied to the token *before* it is lemmatized.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list
        Lowercase, lemmatized tokens in their original order, with
        stopwords removed. An empty string yields an empty list.

    Notes
    -----
    The NLTK stopword and WordNet data must be available locally; the
    lemmatizer and stopword set are rebuilt on every call.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = set(nltk.corpus.stopwords.words('english'))
    text = (unicodedata.normalize('NFKD', text)  # split accents off base letters
             .encode('ascii', 'ignore')          # discard the non-ASCII remainder
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split()  # strip punctuation, tokenize
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

In [6]:
# Categories requested from the acquire helper. The resulting frame's
# `category` column is used further down as the classification target.
categories = [
    "business",
    "sports",
    "technology",
    "entertainment",
    "science",
    "world",
]
news_df = a.get_all_news_articles(categories)

In [7]:
# Run the project's text-prep helper over the `content` column.
# NOTE(review): ['said'] looks like a list of extra stopwords -- confirm
# against prepare.py.
# NOTE(review): this rebinds `news_df` to the helper's result, so re-running
# the cell feeds already-prepared data back into prepare_data.
news_df = prep.prepare_data(news_df, 'content', ['said'])

In [8]:
# Preview the first rows of the prepared frame.
news_df.head()

Unnamed: 0,title,content,category,clean,stemmed,lemmatized
0,Need to have commemorative coins depicting Nee...,After javelin thrower Neeraj Chopra won a gold...,business,javelin thrower neeraj chopra gold medal tokyo...,javelin thrower neeraj chopra gold medal tokyo...,javelin thrower neeraj chopra gold medal tokyo...
1,Binance US CEO quits 3 months after joining ar...,"Brian Brooks, CEO of the US arm of world's lar...",business,brian brooks ceo us arm worlds largest cryptoc...,brian brook ceo us arm world largest cryptocur...,brian brook ceo u arm world largest cryptocurr...
2,Microsoft Co-founder Paul Allen's superyacht l...,"A 414-foot superyacht, Octopus, once owned by ...",business,414foot superyacht octopus owned microsoft cof...,414foot superyacht octopu own microsoft cofoun...,414foot superyacht octopus owned microsoft cof...
3,Melinda French Gates now owns $5.7 billion in ...,Melinda French Gates has received stocks that ...,business,melinda french gates received stocks worth 57 ...,melinda french gate receiv stock worth 57 bill...,melinda french gate received stock worth 57 bi...
4,Intend to pursue all avenues for Reliance deal...,"Future Retail has said that it ""intends to pur...",business,future retail intends pursue available avenues...,futur retail intend pursu avail avenu conclud ...,future retail intends pursue available avenue ...


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Term-frequency features built from the lemmatized text: one row per
# article, one column per vocabulary term, each cell a raw count.
cv = CountVectorizer()
bag_of_words = cv.fit_transform(news_df["lemmatized"])
bag_of_words  # show the sparse-matrix summary (shape, dtype, stored elements)

<150x2593 sparse matrix of type '<class 'numpy.int64'>'
	with 4713 stored elements in Compressed Sparse Row format>

In [10]:
# Expand the sparse matrix into a dense one to look at the raw counts. It is
# almost entirely zeros because each article uses only a small part of the
# vocabulary.
bag_of_words.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [14]:
# Label the count matrix with its vocabulary: one column per term, one row
# per article.
# `get_feature_names_out()` replaces `get_feature_names()`, which
# scikit-learn deprecated in 1.0 and removed in 1.2. `toarray()` returns a
# plain ndarray rather than the `np.matrix` produced by `todense()`.
pd.DataFrame(bag_of_words.toarray(), columns=cv.get_feature_names_out()).head()

Unnamed: 0,10,100,108,108th,109172,109run,11,110,11000,112,...,young,youngster,youre,youth,youve,zaranj,zealand,zelda,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features from the same lemmatized text: term counts re-weighted so
# that terms appearing in many articles contribute less.
tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(news_df.lemmatized)

# `get_feature_names_out()` replaces `get_feature_names()`, which
# scikit-learn deprecated in 1.0 and removed in 1.2. `toarray()` returns a
# plain ndarray rather than the `np.matrix` produced by `todense()`.
pd.DataFrame(tfidfs.toarray(), columns=tfidf.get_feature_names_out()).head()

Unnamed: 0,10,100,108,108th,109172,109run,11,110,11000,112,...,young,youngster,youre,youth,youve,zaranj,zealand,zelda,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.123891,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Bag of Ngrams

In [18]:
# Count bigrams only: ngram_range=(2, 2) sets both the minimum and maximum
# n-gram length to two words.
# Note: this rebinds `cv` and `bag_of_words`, replacing the unigram
# vectorizer and matrix created earlier in the notebook.
cv = CountVectorizer(ngram_range=(2, 2))
bag_of_words = cv.fit_transform(news_df["lemmatized"])

In [19]:
# Label the bigram count matrix with its vocabulary.
# `get_feature_names_out()` replaces `get_feature_names()`, which
# scikit-learn deprecated in 1.0 and removed in 1.2. `toarray()` returns a
# plain ndarray rather than the `np.matrix` produced by `todense()`.
pd.DataFrame(bag_of_words.toarray(), columns=cv.get_feature_names_out()).head()

Unnamed: 0,10 amazon,10 esa,10 kg,100 billion,100 suggests,100 suspect,108 ongoing,108th minute,109run inning,11 woman,...,zaranj sheberghan,zealand government,zealand new,zealand threeyear,zealand visa,zelda game,zone 45,zone across,zone reserve,zoom tv
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Model inputs: term counts built from the raw `content` column after it has
# been passed through clean() and re-joined into a single string per article.
# The target is the article's category.
# NOTE(review): the vectorizer is fitted on every article before the
# train/test split, so the vocabulary is learned from the test rows as well.
cv = CountVectorizer()
X = cv.fit_transform(
    news_df["content"].apply(lambda doc: ' '.join(clean(doc)))
)
y = news_df["category"]

In [24]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)

In [25]:
# Fit a shallow decision tree (at most 5 levels) on the training counts.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so equally good splits can otherwise be chosen differently
# from run to run and the reported scores would not be reproducible.
tree = DecisionTreeClassifier(max_depth=5, random_state=12)
tree.fit(X_train, y_train)

# Mean accuracy on the data the tree was trained on.
tree.score(X_train, y_train)

0.5416666666666666

In [26]:
# Accuracy is the share of predictions that match the true label. Computing
# it by hand here reproduces the value tree.score() reports for the training
# set.
(tree.predict(X_train) == y_train).mean()

0.5416666666666666

In [27]:
# Mean accuracy on the held-out test split; compare it with the training
# accuracy to judge how well the tree generalizes.
tree.score(X_test, y_test)

0.26666666666666666