In [64]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline  

In [75]:
# Toy corpus: seven short messages used as the documents to vectorize.
e0 = (
    "Hi, we'd like to follow up on the project proposal by 5pm later. "
    "Please email us by then."
)
e1 = (
    "It's raining in Metro Manila, we need to bring umbrellas. "
    "It'll be good to be prepared."
)
e2 = "I solemnly swear that I am up to no good."
e3 = (
    "You won PHP 100000000 in the Super Lotto! "
    "Please give us your SSS ID and TIN information to claim."
)
e4 = (
    "7 items from your Steam wishlist are on sale. "
    "Please click the link in your email."
)
e5 = (
    "Someone once counseled not to dwell on the things "
    "you didn’t get after praying."
)
e6 = "A symbol of peace who saves people lives with a smile."

In [76]:
# Collect the sample messages, in order, as the corpus fed to the vectorizers.
email_list = [
    e0,
    e1,
    e2,
    e3,
    e4,
    e5,
    e6,
]

In [77]:
# Bag-of-words model: count term occurrences, ignoring English stop words.
vectorizer = CountVectorizer(stop_words="english")

# Learn the vocabulary from the emails and return the document-term matrix.
bag_of_words = vectorizer.fit_transform(raw_documents=email_list)

In [78]:
# display the sparse matrix summary (shape, dtype, stored-element count)
bag_of_words

<7x46 sparse matrix of type '<class 'numpy.int64'>'
	with 48 stored elements in Compressed Sparse Row format>

In [79]:
def bag2df(bag, feat_names):
    """Convert a sparse document-term matrix into a labelled DataFrame.

    Parameters
    ----------
    bag : sparse matrix
        Matrix with one row per document; must expose ``shape`` and
        ``toarray()``.
    feat_names : sequence of str
        Column labels, one per column of ``bag``.

    Returns
    -------
    pandas.DataFrame
        Dense frame indexed ``Doc0``, ``Doc1``, ... with ``feat_names`` as
        its columns.
    """
    # Take the row count from the shape: iterating a sparse matrix builds a
    # separate one-row matrix per document just to count them.
    doc_names = ['Doc{:d}'.format(idx) for idx in range(bag.shape[0])]
    return pd.DataFrame(data=bag.toarray(), index=doc_names,
                        columns=feat_names)

In [70]:
# retrieve the terms found in the corpus
# (get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement)
tokens = vectorizer.get_feature_names_out()

# create a dataframe from the matrix
bag2df(bag_of_words, tokens)

Unnamed: 0,100000000,5pm,bring,claim,counseled,didn,dwell,follow,good,hi,...,sss,steam,super,swear,symbol,things,tin,umbrellas,wishlist,won
Doc0,0,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
Doc1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
Doc2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
Doc3,1,0,0,1,0,0,0,0,0,0,...,1,0,1,0,0,0,1,0,0,1
Doc4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
Doc5,0,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Doc6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [80]:
# TF-IDF weighting of the emails, ignoring English stop words.
tfidf_vec = TfidfVectorizer(stop_words="english")
transformed = tfidf_vec.fit_transform(raw_documents=email_list)

In [81]:
# Invert the fitted vocabulary (term -> column index) into column index -> term.
index_value = {col: term for term, col in tfidf_vec.vocabulary_.items()}

# One dict per document, mapping each stored term to its TF-IDF weight.
fully_indexed = [
    {index_value[col]: weight for col, weight in zip(doc.indices, doc.data)}
    for doc in transformed
]

In [85]:
# show the per-document {term: tf-idf weight} dictionaries
fully_indexed

[{'hi': 0.3606316736641664,
  'like': 0.3606316736641664,
  'follow': 0.3606316736641664,
  'project': 0.3606316736641664,
  'proposal': 0.3606316736641664,
  '5pm': 0.3606316736641664,
  'later': 0.3606316736641664,
  'email': 0.29935525993587564},
 {'raining': 0.3392454547140992,
  'metro': 0.3392454547140992,
  'manila': 0.3392454547140992,
  'need': 0.3392454547140992,
  'bring': 0.3392454547140992,
  'umbrellas': 0.3392454547140992,
  'll': 0.3392454547140992,
  'good': 0.2816028615738706,
  'prepared': 0.3392454547140992},
 {'good': 0.5062023856012858,
  'solemnly': 0.6098192948782316,
  'swear': 0.6098192948782316},
 {'won': 0.31622776601683794,
  'php': 0.31622776601683794,
  '100000000': 0.31622776601683794,
  'super': 0.31622776601683794,
  'lotto': 0.31622776601683794,
  'sss': 0.31622776601683794,
  'id': 0.31622776601683794,
  'tin': 0.31622776601683794,
  'information': 0.31622776601683794,
  'claim': 0.31622776601683794},
 {'email': 0.32095270940344806,
  'items': 0.3866