In [1]:
from pprint import pprint

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
from prepare import basic_clean, lemmatize

from acquire import get_articles

In [4]:
# Load the articles into a DataFrame using the project's acquire helper.
df = get_articles()

In [6]:
# Use the 'Unnamed: 0' column as the row index (presumably the index that was
# written out when the articles were saved — verify in acquire.get_articles).
# Rebinding instead of mutating in place, and skipping the step when the index
# is already set, keeps this cell safe to re-run: the original raised KeyError
# on a second execution because the column had already been consumed.
if df.index.name != 'Unnamed: 0':
    df = df.set_index('Unnamed: 0')

In [8]:
# Pull out a single article's text (entry 0) to walk through by hand.
document = df['content'][0]

In [13]:
# NOTE(review): this cell ran as In [13], i.e. after the cleaning cells that
# follow it (In [10], In [12]), so the text displayed below appears to be the
# cleaned and lemmatized version rather than the raw article.
document

'after a u jury found that elon musk did not defame british cave explorer vernon unsworth by calling him a pedo guy on twitter the tesla ceo said my faith in humanity is restored the 48yearold billionaire argued that he did not intend to call unsworth a paedophile but instead wa using pedo guy to mean creepy old guy'

In [10]:
# Normalize the text with the project's basic_clean helper
# (presumably lowercasing and stripping punctuation — see prepare.py to confirm).
document = basic_clean(document)

In [12]:
# Run the project's lemmatize helper over the cleaned text
# (exact behavior is defined in prepare.py).
document = lemmatize(document)

In [14]:
# Split on whitespace so each token becomes one entry of a Series.
words = pd.Series(document.split())

In [18]:
# Number of tokens in the document (repeated words are counted each time).
words.shape

(59,)

In [16]:
# Peek at ten randomly chosen tokens.
# NOTE(review): no random_state is passed, so a different sample is drawn on every run.
words.sample(10)

6           elon
54            to
47    paedophile
11       british
21           guy
13      explorer
22            on
43            to
25         tesla
12          cave
dtype: object

In [17]:
# Term-frequency table for the single document:
#   raw_count           - occurrences of each word
#   frequency           - raw_count divided by the total number of words
#   augmented_frequency - frequency divided by the largest frequency
# The lambda argument is named `tf` so it does not shadow the notebook-level `df`.
(
    words.value_counts()
    .to_frame('raw_count')
    .assign(frequency=lambda tf: tf['raw_count'] / tf['raw_count'].sum())
    .assign(augmented_frequency=lambda tf: tf['frequency'] / tf['frequency'].max())
)

Unnamed: 0,raw_count,frequency,augmented_frequency
guy,3,0.050847,1.0
a,3,0.050847,1.0
that,2,0.033898,0.666667
did,2,0.033898,0.666667
not,2,0.033898,0.666667
to,2,0.033898,0.666667
pedo,2,0.033898,0.666667
the,2,0.033898,0.666667
unsworth,2,0.033898,0.666667
tesla,1,0.016949,0.333333


In [21]:
# TF-IDF vectorizer created with no arguments, i.e. the library defaults.
tfidf = TfidfVectorizer()

In [23]:
# Work on just the article text column for the corpus-wide TF-IDF.
just_content = df.content

In [29]:
# Apply the basic_clean helper to every article.
# NOTE(review): just_content is rebound to its own transform, so re-running
# this cell feeds already-cleaned text back through basic_clean.
just_content = just_content.apply(basic_clean)

In [30]:
# Apply the lemmatize helper to every cleaned article.
# NOTE(review): just_content is rebound to its own transform, so re-running
# this cell feeds already-lemmatized text back through lemmatize.
just_content = just_content.apply(lemmatize)

In [31]:
# Fit the vectorizer on the prepared article texts and compute their TF-IDF
# matrix; the result is converted to a dense table in the next cell.
tfidfs = tfidf.fit_transform(just_content.values)

In [32]:
# Show the TF-IDF matrix as a labelled table with one column per vocabulary
# term.
# - get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#   get_feature_names_out() is its replacement. The hasattr fallback keeps the
#   cell working on older installations that predate the new method.
# - toarray() yields a plain ndarray, whereas todense() returns the legacy
#   np.matrix type; the resulting DataFrame holds the same values either way.
pd.DataFrame(
    tfidfs.toarray(),
    columns=(
        tfidf.get_feature_names_out()
        if hasattr(tfidf, 'get_feature_names_out')
        else tfidf.get_feature_names()
    ),
)

Unnamed: 0,0550,10,100,1000,10yearsi,11,12,12billion,13,135946,...,young,younger,your,yourself,yourstory,zero,zilingo,zima,zomato,zurich
0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
1,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
2,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
3,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
4,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
5,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
6,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
7,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.151703
8,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.142724,0.000000,0.0,0.000000,0.0,0.000000
9,0.0,0.000000,0.0,0.170873,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
