In [1]:
from pprint import pprint

import pandas as pd
import numpy as np
import acquire
from prepare import basic_clean, tokenize, remove_stopwords, lemmatize
import re

%matplotlib inline
import matplotlib.pyplot as plt

* Take the work we did in the lessons further:
    * What other types of models (i.e. different classifcation algorithms) could you use?
    * How do the models compare when trained on term frequency data alone, instead of TF-IDF values?

In [2]:
spam_df = pd.read_csv("spam.csv")
spam_df.head()

Unnamed: 0,id,label,text
0,0,ham,"Go until jurong point, crazy.. Available only ..."
1,1,ham,Ok lar... Joking wif u oni...
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,3,ham,U dun say so early hor... U c already then say...
4,4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
spam_df.text = spam_df.text.apply(basic_clean)
spam_df.text = spam_df.text.apply(tokenize)
spam_df.text = spam_df.text.apply(remove_stopwords, extra_words=["r", "u", "2", "ltgt"])
spam_df.text = spam_df.text.apply(lemmatize)

spam_df.head()

Unnamed: 0,id,label,text
0,0,ham,go jurong point crazy available bugis n great ...
1,1,ham,ok lar joking wif oni
2,2,spam,free entry wkly comp win fa cup final tkts 21s...
3,3,ham,dun say early hor c already say
4,4,ham,nah ' think go usf life around though


# Raw count, TF

In [4]:
# join all the words for the text column into a giant string
all_text = " ".join(spam_df.text)

In [9]:
# Create a list of individual words
word_list = pd.Series(re.sub(r"[^\w\s]", "", all_text).split())

In [14]:
# Make dataframe with raw count and frequencies
word_freq_df = (pd.DataFrame({'raw_count': word_list.value_counts()})
 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))

word_freq_df

Unnamed: 0,raw_count,frequency,augmented_frequency
call,600,0.012287,1.000000
get,397,0.008130,0.661667
ur,384,0.007864,0.640000
go,304,0.006226,0.506667
4,288,0.005898,0.480000
...,...,...,...
5ish,1,0.000020,0.001667
okors,1,0.000020,0.001667
thenwill,1,0.000020,0.001667
2optoutd3wv,1,0.000020,0.001667


* Augmented frequency is the word's frequency divided by the max frequency of any word within the bag of words
* TF == frequency

# IDF
* Inverse document frequency
    * A word that appears more frequently across multiple documents will have a lower IDF than a word that appears frequently within a few documents

In [29]:
# Calculate IDF
def idf(word, document_series):
    n_occurences = sum([1 for doc in document_series if word in doc])
    
    return np.log(len(document_series) / n_occurences)

In [25]:
word_freq_df = word_freq_df.reset_index()
word_freq_df = word_freq_df.rename(columns={'index' : 'word'})

In [30]:
word_freq_df['idf'] = word_freq_df.word.apply(idf, document_series=spam_df.text)
word_freq_df

Unnamed: 0,word,raw_count,frequency,augmented_frequency,idf
0,call,600,0.012287,1.000000,2.170311
1,get,397,0.008130,0.661667,2.435194
2,ur,384,0.007864,0.640000,1.845587
3,go,304,0.006226,0.506667,1.613394
4,4,288,0.005898,0.480000,2.134786
...,...,...,...,...,...
8840,5ish,1,0.000020,0.001667,8.625509
8841,okors,1,0.000020,0.001667,8.625509
8842,thenwill,1,0.000020,0.001667,8.625509
8843,2optoutd3wv,1,0.000020,0.001667,8.625509


# TF-IDF
* The product of term frequency and inverse document frequency

In [31]:
word_freq_df['tf_idf'] = word_freq_df.frequency * word_freq_df.idf
word_freq_df

Unnamed: 0,word,raw_count,frequency,augmented_frequency,idf,tf_idf
0,call,600,0.012287,1.000000,2.170311,0.026667
1,get,397,0.008130,0.661667,2.435194,0.019798
2,ur,384,0.007864,0.640000,1.845587,0.014513
3,go,304,0.006226,0.506667,1.613394,0.010044
4,4,288,0.005898,0.480000,2.134786,0.012591
...,...,...,...,...,...,...
8840,5ish,1,0.000020,0.001667,8.625509,0.000177
8841,okors,1,0.000020,0.001667,8.625509,0.000177
8842,thenwill,1,0.000020,0.001667,8.625509,0.000177
8843,2optoutd3wv,1,0.000020,0.001667,8.625509,0.000177


# TF-IDF with scikit-learn

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(word_freq_df.word)

In [35]:
pd.DataFrame(tfidfs.todense(), columns=tfidf.get_feature_names())

Unnamed: 0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zebra,zed,zero,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8840,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8841,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modeling