In [87]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline  

In [88]:
# Toy corpus: seven short sample messages used by the vectorizer demos below.
# Long literals are split with implicit string concatenation; the resulting
# strings are unchanged.
e0 = (
    "Hi, we'd like to follow up on the project proposal by 5pm later. "
    "Please email us by then."
)
e1 = (
    "It's raining in Metro Manila, we need to bring umbrellas. "
    "It'll be good to be prepared."
)
e2 = 'I solemnly swear that I am up to no good.'
e3 = (
    "You won PHP 100000000 in the Super Lotto! "
    "Please give us your SSS ID and TIN information to claim."
)
e4 = (
    "7 items from your Steam wishlist are on sale. "
    "Please click the link in your email."
)
e5 = (
    "Someone once counseled not to dwell on the things "
    "you didn’t get after praying."
)
e6 = 'A symbol of peace who saves people lives with a smile.'

In [89]:
# Collect the sample messages, in order, into the corpus fed to the vectorizers.
email_list = [e0, e1, e2, e3, e4, e5, e6]

In [90]:
# Bag-of-words model: raw term counts with English stop words filtered out.
vectorizer = CountVectorizer(stop_words="english")

# Learn the vocabulary from the emails and build the sparse document-term
# matrix in a single pass.
bag_of_words = vectorizer.fit_transform(raw_documents=email_list)

In [91]:
# Display the sparse document-term matrix summary (shape, dtype, stored entries).
bag_of_words

<7x46 sparse matrix of type '<class 'numpy.int64'>'
	with 48 stored elements in Compressed Sparse Row format>

In [92]:
def bag2df(bag, feat_names):
    """Convert a sparse document-term matrix into a labelled DataFrame.

    Parameters
    ----------
    bag : sparse matrix
        Document-term matrix exposing ``.shape`` and ``.toarray()``
        (one row per document, one column per term).
    feat_names : sequence of str
        Column labels, one per column of ``bag``.

    Returns
    -------
    pandas.DataFrame
        Dense copy of ``bag`` indexed by ``Doc0``, ``Doc1``, ... with
        ``feat_names`` as the columns.
    """
    # Read the row count from the shape: iterating a sparse matrix just to
    # count its rows slices out every row as a throw-away sub-matrix.
    doc_names = [f"Doc{row_idx:d}" for row_idx in range(bag.shape[0])]
    return pd.DataFrame(data=bag.toarray(), index=doc_names, columns=feat_names)

In [93]:
# Retrieve the terms (column labels) learned from the corpus.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    tokens = list(vectorizer.get_feature_names_out())
else:
    tokens = vectorizer.get_feature_names()

# Render the document-term matrix as a labelled DataFrame.
bag2df(bag_of_words, tokens)

Unnamed: 0,100000000,5pm,bring,claim,click,counseled,didn,dwell,email,follow,...,sss,steam,super,swear,symbol,things,tin,umbrellas,wishlist,won
Doc0,0,1,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
Doc1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
Doc2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Doc3,1,0,0,1,0,0,0,0,0,0,...,1,0,1,0,0,0,1,0,0,1
Doc4,0,0,0,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
Doc5,0,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
Doc6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [80]:
# TF-IDF model with English stop words filtered out; fit it on the emails and
# keep the resulting weighted sparse document-term matrix.
tfidf_vec = TfidfVectorizer(stop_words="english")
transformed = tfidf_vec.fit_transform(raw_documents=email_list)

In [81]:
# Invert the fitted vocabulary (term -> column index) into column index -> term.
index_value = {col_idx: term for term, col_idx in tfidf_vec.vocabulary_.items()}

# For every document row, map each stored column back to its term so the
# TF-IDF weights can be read as {term: weight}.
fully_indexed = [
    {index_value[col_idx]: weight
     for col_idx, weight in zip(doc_row.indices, doc_row.data)}
    for doc_row in transformed
]

In [114]:
# Display the column-index -> term lookup built from the TF-IDF vocabulary.
index_value

{11: 'hi',
 16: 'like',
 9: 'follow',
 29: 'project',
 30: 'proposal',
 1: '5pm',
 15: 'later',
 8: 'email',
 31: 'raining',
 22: 'metro',
 21: 'manila',
 23: 'need',
 2: 'bring',
 43: 'umbrellas',
 19: 'll',
 10: 'good',
 28: 'prepared',
 35: 'solemnly',
 39: 'swear',
 45: 'won',
 26: 'php',
 0: '100000000',
 38: 'super',
 20: 'lotto',
 36: 'sss',
 12: 'id',
 42: 'tin',
 13: 'information',
 3: 'claim',
 14: 'items',
 37: 'steam',
 44: 'wishlist',
 32: 'sale',
 4: 'click',
 17: 'link',
 5: 'counseled',
 7: 'dwell',
 41: 'things',
 6: 'didn',
 27: 'praying',
 40: 'symbol',
 24: 'peace',
 33: 'saves',
 25: 'people',
 18: 'lives',
 34: 'smile'}

In [113]:
# Display the raw (term, column index) pairs of the fitted TF-IDF vocabulary.
tfidf_vec.vocabulary_.items()

dict_items([('hi', 11), ('like', 16), ('follow', 9), ('project', 29), ('proposal', 30), ('5pm', 1), ('later', 15), ('email', 8), ('raining', 31), ('metro', 22), ('manila', 21), ('need', 23), ('bring', 2), ('umbrellas', 43), ('ll', 19), ('good', 10), ('prepared', 28), ('solemnly', 35), ('swear', 39), ('won', 45), ('php', 26), ('100000000', 0), ('super', 38), ('lotto', 20), ('sss', 36), ('id', 12), ('tin', 42), ('information', 13), ('claim', 3), ('items', 14), ('steam', 37), ('wishlist', 44), ('sale', 32), ('click', 4), ('link', 17), ('counseled', 5), ('dwell', 7), ('things', 41), ('didn', 6), ('praying', 27), ('symbol', 40), ('peace', 24), ('saves', 33), ('people', 25), ('lives', 18), ('smile', 34)])

In [103]:
# Pair every stored column index with its TF-IDF weight, flattened across all
# rows of the sparse matrix (row boundaries are not shown here).
list(zip(transformed.indices, transformed.data))

[(11, 0.3606316736641664),
 (16, 0.3606316736641664),
 (9, 0.3606316736641664),
 (29, 0.3606316736641664),
 (30, 0.3606316736641664),
 (1, 0.3606316736641664),
 (15, 0.3606316736641664),
 (8, 0.29935525993587564),
 (31, 0.3392454547140992),
 (22, 0.3392454547140992),
 (21, 0.3392454547140992),
 (23, 0.3392454547140992),
 (2, 0.3392454547140992),
 (43, 0.3392454547140992),
 (19, 0.3392454547140992),
 (10, 0.2816028615738706),
 (28, 0.3392454547140992),
 (10, 0.5062023856012858),
 (35, 0.6098192948782316),
 (39, 0.6098192948782316),
 (45, 0.31622776601683794),
 (26, 0.31622776601683794),
 (0, 0.31622776601683794),
 (38, 0.31622776601683794),
 (20, 0.31622776601683794),
 (36, 0.31622776601683794),
 (12, 0.31622776601683794),
 (42, 0.31622776601683794),
 (13, 0.31622776601683794),
 (3, 0.31622776601683794),
 (8, 0.32095270940344806),
 (14, 0.386650005027498),
 (37, 0.386650005027498),
 (44, 0.386650005027498),
 (32, 0.386650005027498),
 (4, 0.386650005027498),
 (17, 0.386650005027498),
 (5

In [85]:
# Display the per-document {term: TF-IDF weight} dictionaries.
fully_indexed

[{'hi': 0.3606316736641664,
  'like': 0.3606316736641664,
  'follow': 0.3606316736641664,
  'project': 0.3606316736641664,
  'proposal': 0.3606316736641664,
  '5pm': 0.3606316736641664,
  'later': 0.3606316736641664,
  'email': 0.29935525993587564},
 {'raining': 0.3392454547140992,
  'metro': 0.3392454547140992,
  'manila': 0.3392454547140992,
  'need': 0.3392454547140992,
  'bring': 0.3392454547140992,
  'umbrellas': 0.3392454547140992,
  'll': 0.3392454547140992,
  'good': 0.2816028615738706,
  'prepared': 0.3392454547140992},
 {'good': 0.5062023856012858,
  'solemnly': 0.6098192948782316,
  'swear': 0.6098192948782316},
 {'won': 0.31622776601683794,
  'php': 0.31622776601683794,
  '100000000': 0.31622776601683794,
  'super': 0.31622776601683794,
  'lotto': 0.31622776601683794,
  'sss': 0.31622776601683794,
  'id': 0.31622776601683794,
  'tin': 0.31622776601683794,
  'information': 0.31622776601683794,
  'claim': 0.31622776601683794},
 {'email': 0.32095270940344806,
  'items': 0.3866

In [111]:
# Scratch example: build one single-entry {position: word} dict per word.
somelist = []
l = "my string is nice".split()
# enumerate() yields each word's actual position. list.index() would return
# the FIRST match, giving repeated words the wrong key, and it rescans the
# list on every iteration.
for position, x in enumerate(l):
    somelist.append({position: x})

In [112]:
# Display the list of single-entry {index: word} dictionaries.
somelist

[{0: 'my'}, {1: 'string'}, {2: 'is'}, {3: 'nice'}]

In [None]:
# Build one record per email, keeping the "subject"/"body"/"status" keys.
# The entries of email_list are plain strings, which have no .sub(), .body()
# or .status() methods, so the previous loop raised AttributeError on its
# first iteration; it also rebound the result on every pass instead of
# accumulating one dict per email.
# NOTE(review): the raw strings carry no subject or status, so those fields
# are None placeholders here — confirm where they are meant to come from.
email_dict_list = [
    {"subject": None, "body": email, "status": None}
    for email in email_list
]