# Spam Classification

The objective of this notebook is to build a model capable of classifying an email as spam or not spam (ham).

The dataset used was from http://www2.aueb.gr/users/ion/data/enron-spam/

## Imports

In [1]:
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
import string
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from textblob import TextBlob
import email
from email.message import EmailMessage
from email.parser import BytesParser, Parser
from email.policy import default
import ast
import re

[nltk_data] Downloading package stopwords to /Users/phrc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/phrc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Functions

In [None]:
def _extractBodyText(body):
    """
    Return the text of an email body part, or None when there is none

    Parameters
    ----------
    body: email.message.EmailMessage or None
        The part returned by ``EmailMessage.get_body()``

    Returns
    ----------
    str or None
        The decoded content for text/plain, the serialized part for
        text/html, the serialized html sub-part for multipart bodies and
        None when no usable text part exists
    """
    if body is None:
        return None
    # get_content_maintype()/get_content_subtype() fall back to text/plain when
    # the Content-Type header is missing, instead of failing on a None header.
    maintype = body.get_content_maintype()
    subtype = body.get_content_subtype()
    if maintype == 'text':
        if subtype == 'plain':
            return str(body.get_content())
        if subtype == 'html':
            return str(body)
        return None
    if maintype == 'multipart':
        # preferencelist must be a tuple: ('html') is just the string 'html'.
        htmlPart = body.get_body(preferencelist=('html',))
        if htmlPart is not None:
            return str(htmlPart)
    return None


def createDf(path):
    """
    Read all email files and convert to a dataframe

    The directory tree under ``path`` is walked recursively and every file is
    parsed as an email message. The current working directory is not changed.

    Parameters
    ----------
    path: str
        Path to a root directory to be read

    Returns
    ----------
    dataframe
        a dataframe with one row per file. Columns are the lower-cased header
        names found in the emails plus:
            file: path of the file the row was read from
            messageType: content type of the whole message
            messageStr: extracted body text (missing when no text body exists)
            parseError: True when parsing the file raised an exception
    """
    data = []
    for root, _dirs, files in os.walk(path):
        for file in files:
            filePath = os.path.join(root, file)
            # The path is recorded first so rows flagged with parseError can
            # still be traced back to their file.
            dictTemp = {'file': filePath}
            with open(filePath, "rb") as openFile:
                try:
                    message = email.message_from_binary_file(openFile, policy=default)
                    for key in message.keys():
                        dictTemp[key.lower()] = message[key]
                    dictTemp['messageType'] = message.get_content_type()
                    messageStr = _extractBodyText(message.get_body())
                    if messageStr is not None:
                        dictTemp['messageStr'] = messageStr
                    dictTemp['parseError'] = False
                except Exception:
                    # Best effort: a malformed email is flagged, not fatal.
                    dictTemp['parseError'] = True
            data.append(dictTemp)
    return pd.DataFrame(data)

# Patterns are compiled once instead of on every call; the function is applied
# to every row of the dataset.
_HEADER_LINE_PATTERNS = (
    re.compile(r'charset.*\n'),
    re.compile(r'content-.*\n'),
    re.compile(r'received: from.*\n'),
)
_HTML_TAG_PATTERN = re.compile(r'<.*?>')
# A translation table removes every character of string.punctuation. Building
# a regex class from the raw string turned "\]" into an escaped "]", so the
# backslash itself was never removed.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def emailTextCleanner(text):
    """
    Remove:
        Html Tags
        Email headers
        Ponctuation
        break lines and tabs

    And convert the string to lower case

    The steps run in this order: lower-casing, removal of header-like lines
    (from "charset", "content-" or "received: from" up to the end of the
    line), replacement of line breaks and tabs by spaces, removal of html
    tags and finally removal of punctuation.

    Parameters
    ----------
    text: str
        Any other object is converted with str() first

    Returns
    ----------
    str
        The cleaned text, or "" when the input cannot be converted to a string
    """
    try:
        text = str(text).lower()
    except Exception:
        print(type(text))
        return ""

    for pattern in _HEADER_LINE_PATTERNS:
        text = pattern.sub('', text)
    # Header lines are matched up to "\n", so line breaks go only afterwards.
    text = text.replace('\n', ' ').replace('\t', ' ')
    text = _HTML_TAG_PATTERN.sub('', text)
    return text.translate(_PUNCTUATION_TABLE)

def lemmatizeList(words):
    """
    Lemmatize every string of a list, in place

    Each word is lemmatized as a verb first and the result is then
    lemmatized as a noun.

    Parameters
    ----------
    words: list of str
        The list is modified in place

    Returns
    ----------
    list of str
        The same list object that was received
    """
    lemmatizer = WordNetLemmatizer()
    for position, word in enumerate(words):
        asVerb = lemmatizer.lemmatize(word, 'v')
        words[position] = lemmatizer.lemmatize(asVerb, 'n')
    return words

def revomeWordsWithOneCharacter(words):
    """
    Drop the words that have fewer than two characters

    Parameters
    ----------
    words: list of str

    Returns
    ----------
    list of str
        A new list with the remaining words in their original order
    """
    return [word for word in words if len(word) > 1]

def removeDigits(words):
    """
    Drop the words made only of digits

    Parameters
    ----------
    words: list of str

    Returns
    ----------
    list of str
        A new list with the remaining words in their original order
    """
    return [word for word in words if not word.isdigit()]

## Dataset generation

In [None]:
# Root folders holding the raw spam and ham email files.
# NOTE(review): these are absolute paths on the original author's machine;
# point them to a local copy of the dataset before running.
spamPath = '/Users/phrc/Documents/Projects/pythonProject/SpamEmailClassifier/emails/spam/'
hamPath = '/Users/phrc/Documents/Projects/pythonProject/SpamEmailClassifier/emails/ham/'


# Parse every file under each folder into one dataframe per class.
dfSpam = createDf(spamPath)
dfHam = createDf(hamPath)

## Dataset Size

In [None]:

# Bar chart comparing how many emails were loaded for each class.
datasetSizes = (len(dfHam), len(dfSpam))
barPositions = np.arange(2)
sizeLabels = (
    'Ham \n{}'.format(datasetSizes[0]),
    'Spam \n{}'.format(datasetSizes[1]),
)

plt.bar(barPositions, datasetSizes, align='center')
plt.xticks(barPositions, sizeLabels)

plt.show()

In [None]:
# Preview the first rows of the parsed spam emails.
dfSpam.head()

### Spam dataset column analysis

In [None]:
# Summarise the text columns and list those filled in for more than 75% of
# the spam emails.
n = len(dfSpam) * 0.75
dfSpamDesc = dfSpam.describe(include=['object']).transpose()
dfSpamDesc.loc[dfSpamDesc['count'] > n].head(300)

### Ham dataset column analysis

In [None]:
# Summarise the text columns and list those filled in for more than 75% of
# the ham emails. The threshold follows the dataset size (n) rather than a
# hard-coded 15000, matching the spam analysis.
dfHamDesc = dfHam.describe(include=['object']).T
n = len(dfHam) * 0.75
dfHamDesc[dfHamDesc['count'] > n].head(300)

### Removing unnecessary columns

In [None]:
# Columns kept in both datasets; every other header column is discarded.
keptColumns = ['content-type', 'date', 'from', 'messageStr', 'messageType', 'subject', 'to', 'parseError', 'file']
dfSpam = dfSpam.loc[:, keptColumns]
dfHam = dfHam.loc[:, keptColumns]


### Emails with Parser Error 

In [None]:
# Percentage of emails of each class that could not be parsed.
dfHamP = dfHam[dfHam['parseError'] == True]
dfSpamP = dfSpam[dfSpam['parseError'] == True]
hamFreq = (len(dfHamP) * 100 / len(dfHam))
spamFreq = (len(dfSpamP) * 100 / len(dfSpam))
plt.bar(np.arange(2), (hamFreq, spamFreq), align='center')
# '.2f' prints two decimals. The previous '.2' meant two significant digits,
# which renders any value of 10 or more in scientific notation (1.2e+01).
plt.xticks(np.arange(2), ('Ham \n{0:.2f}%'.format(hamFreq), 'Spam \n{0:.2f}%'.format(spamFreq)))

plt.show()

### Removing emails with parser problems 

In [None]:
# Keep only the emails that were parsed without errors.
spamParsedOk = dfSpam['parseError'] == False
hamParsedOk = dfHam['parseError'] == False
dfSpam = dfSpam.loc[spamParsedOk]
dfHam = dfHam.loc[hamParsedOk]

# The flag carries no information once the failing rows are gone.
del dfHam['parseError']
del dfSpam['parseError']

### Join datasets

In [None]:
# Label each dataset. assign() returns a new frame, so the label is not
# written into a filtered view (which raises SettingWithCopyWarning).
dfSpam = dfSpam.assign(isSpam=True)
dfHam = dfHam.assign(isSpam=False)

print(dfHam.columns.values)
print(dfSpam.columns.values)

# ignore_index gives the combined frame unique row labels; otherwise the spam
# and ham rows would share the same label values.
dfMaster = pd.concat([dfSpam, dfHam], ignore_index=True)
print(len(dfMaster))
dfMaster.head()

### Check email message and email type 

In [None]:
# Number of emails per message type: all emails, emails with an extracted
# body and emails without one.
fig = plt.figure(figsize=(16, 12))
grid = plt.GridSpec(2, 4, wspace=0.3, hspace=0.5)
barAll = fig.add_subplot(grid[0, 0:3])
barNonNull = fig.add_subplot(grid[1, 0:3])
barNull = fig.add_subplot(grid[0:2, 3])

hasMessage = dfMaster['messageStr'].notnull()
panels = (
    (barAll, dfMaster['messageType'], 'All emails'),
    (barNonNull, dfMaster.loc[hasMessage, 'messageType'], 'Non null meassages'),
    (barNull, dfMaster.loc[~hasMessage, 'messageType'], 'Null meassages'),
)
for axis, messageTypes, title in panels:
    # Labels and heights both come from value_counts(). unique() lists types
    # in order of appearance while value_counts() sorts by frequency, so
    # pairing the two attached counts to the wrong message types.
    typeCounts = messageTypes.value_counts()
    axis.bar(typeCounts.index, typeCounts.values, align='center')
    axis.set_title(title)

fig.text(0.5, 0.04, 'Email Type', ha='center', fontsize=15)
fig.text(0.04, 0.5, 'Emails', va='center', rotation='vertical', fontsize=15)

plt.show()

### Null messages proportions

In [None]:
# Share of emails with and without an extracted message body.
messagePresent = dfMaster['messageStr'].notnull()
nonNullTotal = len(dfMaster[messagePresent])
nullTotal = len(dfMaster[~messagePresent])
plt.pie(
    [nonNullTotal, nullTotal],
    labels=['Non Null', 'Null'],
    autopct='%1.0f%%',
    pctdistance=0.5,
    labeldistance=1.2,
)


### Remove null messages 

In [None]:
# Keep only the emails whose body text was extracted. The explicit copy makes
# dfMaster an independent frame, so adding columns to it afterwards does not
# raise pandas' SettingWithCopyWarning.
dfMaster = dfMaster[dfMaster['messageStr'].notnull()].copy()
# Sanity check: no null message is left, so this prints 0.
print(len(dfMaster[dfMaster['messageStr'].isnull()]))

### Remove HTML tags from the messages 

In [None]:
# Store the cleaned version of every message body (see the emailTextCleanner
# docstring for the list of things it removes).
dfMaster['treatedMessage'] = dfMaster['messageStr'].apply(emailTextCleanner)

In [None]:
# Summary of the text (object) columns of the combined dataset.
dfMaster.describe(include=['object']).T.head(100)

In [None]:
# Emails whose message type contains 'multipart', and how many there are.
dfTemp = dfMaster[dfMaster['messageType'].str.contains('multipart')]
len(dfTemp)


In [None]:
# Preview the first 30 rows of dfTemp.
dfTemp.head(30)

## Preparing the dataset

In [None]:
#Remove Stop Words
stop = text.ENGLISH_STOP_WORDS
pat = r'\b(?:{})\b'.format('|'.join(stop))
# regex=True is required: without it recent pandas versions treat the pattern
# as a literal string and no stop word is removed.
dfMaster['treatedMessage'] = dfMaster['treatedMessage'].str.replace(pat, '', regex=True)

#Create a column with list of words
dfMaster['wordsList'] = (
    dfMaster['treatedMessage']
    .str.split()
    .apply(lemmatizeList)
    .apply(revomeWordsWithOneCharacter)
    .apply(removeDigits)
)

#Create the columns that measure the size of each email
dfMaster['totalTreatedWords'] = dfMaster['wordsList'].apply(len)
dfMaster['treatedTextLen'] = dfMaster['treatedMessage'].apply(len)
dfMaster['textLen'] = dfMaster['messageStr'].apply(lambda x: len(str(x)))
dfMaster['uniqueWordsLen'] = dfMaster['wordsList'].apply(lambda x: len(set(x)))

dfMaster.head()

## Visualisation

### Dataset distribution by spam type

In [None]:
df = dfMaster

print(df['isSpam'].value_counts().rename({False: 'Ham', True: 'Spam'}))

# Counts are computed per class in a fixed order, so each label always matches
# its value. value_counts() orders by frequency, which mislabels the pie
# whenever ham emails outnumber spam emails.
classLabels = ['Spam', 'Ham']
classCounts = [len(df[df['isSpam'] == True]), len(df[df['isSpam'] == False])]

fig = plt.figure(figsize=(18, 6))
grid = plt.GridSpec(1, 2, wspace=0.1)
barPlt = fig.add_subplot(grid[0, 0])
piePlt = fig.add_subplot(grid[0, 1])

# Passing the labels as x values already puts one named tick under each bar,
# so no explicit set_xticks call is needed.
barPlt.bar(classLabels, classCounts, align='center')

piePlt.pie(classCounts, labels=classLabels, autopct='%1.0f%%', pctdistance=0.5, labeldistance=1.2)

plt.show()

### Compare text length for ham and spam

There are 4 different attributes that describe the text length:

- textLen: the length of the raw text, without any treatment
- treatedTextLen: the length of the text after removing the stop words and punctuation
- totalTreatedWords: the number of words in the text after the treatment; basically this excludes spaces and line breaks
- uniqueWordsLen: the number of unique words in the treated text; basically it ignores repeated words

### Raw text length comparison

In [None]:
# Raw text length per class: histogram up to the 75% quantile, a box plot of
# all values and a box plot restricted to values below the 75% quantile.
spamTextLen = df.loc[df['isSpam'] == True, 'textLen']
hamTextLen = df.loc[df['isSpam'] == False, 'textLen']
textLenQ75 = df['textLen'].quantile(0.75)

fig = plt.figure(figsize=(18, 6))
grid = plt.GridSpec(1, 4, wspace=0.8)
histPlt = fig.add_subplot(grid[0, 0:2])
boxPlt1 = fig.add_subplot(grid[0, 2])
boxPlt2 = fig.add_subplot(grid[0, 3])

histPlt.hist(
    [spamTextLen, hamTextLen],
    np.linspace(0, textLenQ75, 30),
    density=True,
    label=['Spam', 'Ham'],
)
histPlt.legend(loc='upper right')

boxPlt1.boxplot((spamTextLen, hamTextLen), labels=('Spam', 'Ham'))

boxPlt2.boxplot(
    (spamTextLen[spamTextLen < textLenQ75], hamTextLen[hamTextLen < textLenQ75]),
    labels=('Spam', 'Ham'),
)

plt.show()

### Text size without stop words and punctuation 

In [None]:
# Treated text length per class: histogram up to the 75% quantile, a box plot
# of all values and a box plot restricted to values below the 75% quantile.
spamTreatedLen = df.loc[df['isSpam'] == True, 'treatedTextLen']
hamTreatedLen = df.loc[df['isSpam'] == False, 'treatedTextLen']
treatedLenQ75 = df['treatedTextLen'].quantile(0.75)

fig = plt.figure(figsize=(18, 6))
grid = plt.GridSpec(1, 4, wspace=0.8)
histPlt = fig.add_subplot(grid[0, 0:2])
boxPlt1 = fig.add_subplot(grid[0, 2])
boxPlt2 = fig.add_subplot(grid[0, 3])

histPlt.hist(
    [spamTreatedLen, hamTreatedLen],
    np.linspace(0, treatedLenQ75, 30),
    density=True,
    label=['Spam', 'Ham'],
)
histPlt.legend(loc='upper right')

boxPlt1.boxplot((spamTreatedLen, hamTreatedLen), labels=('Spam', 'Ham'))

boxPlt2.boxplot(
    (
        spamTreatedLen[spamTreatedLen < treatedLenQ75],
        hamTreatedLen[hamTreatedLen < treatedLenQ75],
    ),
    labels=('Spam', 'Ham'),
)

plt.show()



### Total words used

In [None]:
# Word count per class: histogram up to the 75% quantile, a box plot of all
# values and a box plot restricted to values below the 75% quantile.
spamWordTotals = df.loc[df['isSpam'] == True, 'totalTreatedWords']
hamWordTotals = df.loc[df['isSpam'] == False, 'totalTreatedWords']
wordTotalsQ75 = df['totalTreatedWords'].quantile(0.75)

fig = plt.figure(figsize=(18, 6))
grid = plt.GridSpec(1, 4, wspace=0.8)
histPlt = fig.add_subplot(grid[0, 0:2])
boxPlt1 = fig.add_subplot(grid[0, 2])
boxPlt2 = fig.add_subplot(grid[0, 3])

histPlt.hist(
    [spamWordTotals, hamWordTotals],
    np.linspace(0, wordTotalsQ75, 30),
    density=True,
    label=['Spam', 'Ham'],
)
histPlt.legend(loc='upper right')

boxPlt1.boxplot((spamWordTotals, hamWordTotals), labels=('Spam', 'Ham'))

boxPlt2.boxplot(
    (
        spamWordTotals[spamWordTotals < wordTotalsQ75],
        hamWordTotals[hamWordTotals < wordTotalsQ75],
    ),
    labels=('Spam', 'Ham'),
)

plt.show()


### Total unique words used

In [None]:
# Unique word count per class: histogram up to the 75% quantile, a box plot of
# all values and a box plot restricted to values below the 75% quantile.
spamUniqueWords = df.loc[df['isSpam'] == True, 'uniqueWordsLen']
hamUniqueWords = df.loc[df['isSpam'] == False, 'uniqueWordsLen']
uniqueWordsQ75 = df['uniqueWordsLen'].quantile(0.75)

fig = plt.figure(figsize=(18, 6))
grid = plt.GridSpec(1, 4, wspace=0.8)
histPlt = fig.add_subplot(grid[0, 0:2])
boxPlt1 = fig.add_subplot(grid[0, 2])
boxPlt2 = fig.add_subplot(grid[0, 3])

histPlt.hist(
    [spamUniqueWords, hamUniqueWords],
    np.linspace(0, uniqueWordsQ75, 30),
    density=True,
    label=['Spam', 'Ham'],
)
histPlt.legend(loc='upper right')

boxPlt1.boxplot((spamUniqueWords, hamUniqueWords), labels=('Spam', 'Ham'))

boxPlt2.boxplot(
    (
        spamUniqueWords[spamUniqueWords < uniqueWordsQ75],
        hamUniqueWords[hamUniqueWords < uniqueWordsQ75],
    ),
    labels=('Spam', 'Ham'),
)

plt.show()


### Conclusion

# To do

It is not yet clear whether the text size influences the classification of an email as spam or ham.



## Words Visualization

### Total uniques words

In [None]:
# Word occurrence counters: whole corpus, spam emails only, ham emails only.
spamRows = df['isSpam'] == True
hamRows = df['isSpam'] == False
count = Counter(chain.from_iterable(df["wordsList"].values))
countSpam = Counter(chain.from_iterable(df.loc[spamRows, "wordsList"].values))
countHam = Counter(chain.from_iterable(df.loc[hamRows, "wordsList"].values))

# The number of keys of each counter is its number of distinct words.
uniqueTotals = [len(count), len(countSpam), len(countHam)]

fig = plt.figure(figsize=(18, 6))
grid = plt.GridSpec(1, 2, wspace=0.2)
pltBar = fig.add_subplot(grid[0, 0])
pltText = fig.add_subplot(grid[0, 1])

pltBar.bar(
    ['Total unique words', 'Spam unique words', 'Ham unique words'],
    uniqueTotals,
    align='center',
)

textTemplates = (
    'Total unique words:              {}',
    'Total spam unique words:    {}',
    'Total ham unique words:      {}',
)
textWords = [
    template.format(total)
    for template, total in zip(textTemplates, uniqueTotals)
]

pltText.text(x=0, y=0.5, s='\n'.join(textWords), fontsize=18)
pltText.axis('off')
plt.show()

In [None]:
# Display the spam word counter.
countSpam

### 20 Most common words

In [None]:
# Frequency table of every word, most frequent first, with its cumulative sum.
# Naming the columns in the constructor also works for an empty counter.
dfWords = pd.DataFrame(list(count.items()), columns=['word', 'occur'])
totalOcurr = dfWords['occur'].sum()
dfWords['freq'] = dfWords['occur'] / totalOcurr
dfWords = dfWords.sort_values(by='freq', ascending=False)
dfWords = dfWords.reset_index(drop=True)
dfWords['freqAcum'] = dfWords['freq'].cumsum()


fig = plt.figure(figsize=(18, 12))
grid = plt.GridSpec(4, 2, wspace=0.2, hspace=0.5)
pltLine = fig.add_subplot(grid[0:2, 0])
pltBar = fig.add_subplot(grid[2:4, 0:2])
pltTable = fig.add_subplot(grid[0:2, 1])

pltLine.plot(dfWords['freqAcum'], range(len(dfWords)))
pltLine.set_ylabel('Number of Words')
pltLine.set_xlabel('Acumulative Frequence')
pltLine.set_title('Line Graph of acumulative frequence')
pltLine.grid(True)


# Cumulative frequency at a few fixed ranks. Ranks beyond the vocabulary size
# are skipped because .loc raises KeyError for labels that do not exist.
sampleRanks = [rank for rank in [0, 20, 50, 100, 500, 1000, 5000, 10000, 15000] if rank < len(dfWords)]
dfTWord = dfWords.loc[sampleRanks, ['freqAcum']]

if not dfTWord.empty:
    pltTable.table(cellText=dfTWord.values, rowLabels=dfTWord.index, colLabels=dfTWord.columns, loc='best')
pltTable.axis('off')
pltTable.set_title('Table of acumulative frequence')


topWords = dfWords.head(20)
pltBar.bar(topWords['word'], topWords['freq'])
pltBar.set_title('20 Most Used word')

# Tick positions are fixed before their labels are set, and there is one tick
# per plotted word even when fewer than 20 words exist.
yPos = np.arange(len(topWords))
pltBar.set_xticks(yPos)
pltBar.set_xticklabels(topWords["word"], rotation=60)


plt.show()



### 20 Most common words on Spam emails

In [None]:
# Frequency table of the spam words, most frequent first, with its cumulative
# sum. Naming the columns in the constructor also works for an empty counter.
dfSpamWords = pd.DataFrame(list(countSpam.items()), columns=['word', 'occur'])
totalSpamOcurr = dfSpamWords['occur'].sum()
dfSpamWords['freq'] = dfSpamWords['occur'] / totalSpamOcurr
dfSpamWords = dfSpamWords.sort_values(by='freq', ascending=False)
dfSpamWords = dfSpamWords.reset_index(drop=True)
dfSpamWords['freqAcum'] = dfSpamWords['freq'].cumsum()


fig = plt.figure(figsize=(18, 12))
grid = plt.GridSpec(4, 2, wspace=0.2, hspace=0.5)
pltLine = fig.add_subplot(grid[0:2, 0])
pltBar = fig.add_subplot(grid[2:4, 0:2])
pltTable = fig.add_subplot(grid[0:2, 1])

pltLine.plot(dfSpamWords['freqAcum'], range(len(dfSpamWords)))
pltLine.set_ylabel('Number of Words')
pltLine.set_xlabel('Acumulative Frequence')
pltLine.set_title('Line Graph of acumulative frequence')
pltLine.grid(True)


# Cumulative frequency at a few fixed ranks. Ranks beyond the vocabulary size
# are skipped because .loc raises KeyError for labels that do not exist.
sampleRanks = [rank for rank in [0, 20, 50, 100, 500, 1000, 5000, 10000, 15000] if rank < len(dfSpamWords)]
dfSWord = dfSpamWords.loc[sampleRanks, ['freqAcum']]

if not dfSWord.empty:
    pltTable.table(cellText=dfSWord.values, rowLabels=dfSWord.index, colLabels=dfSWord.columns, loc='best')
pltTable.axis('off')
pltTable.set_title('Table of acumulative frequence')


topSpamWords = dfSpamWords.head(20)
pltBar.bar(topSpamWords['word'], topSpamWords['freq'], align='center')
pltBar.set_title('20 Most Used word')

# Tick positions are fixed before their labels are set, and there is one tick
# per plotted word even when fewer than 20 words exist.
yPos = np.arange(len(topSpamWords))
pltBar.set_xticks(yPos)
pltBar.set_xticklabels(topSpamWords["word"], rotation=60)

plt.show()



### 20 Most common words on Ham emails

In [None]:
# Frequency table of the ham words, most frequent first, with its cumulative
# sum. Naming the columns in the constructor also works for an empty counter.
dfHamWords = pd.DataFrame(list(countHam.items()), columns=['word', 'occur'])
totalHamOcurr = dfHamWords['occur'].sum()
dfHamWords['freq'] = dfHamWords['occur'] / totalHamOcurr
dfHamWords = dfHamWords.sort_values(by='freq', ascending=False)
dfHamWords = dfHamWords.reset_index(drop=True)
dfHamWords['freqAcum'] = dfHamWords['freq'].cumsum()


fig = plt.figure(figsize=(18, 12))
grid = plt.GridSpec(4, 2, wspace=0.2, hspace=0.5)
pltLine = fig.add_subplot(grid[0:2, 0])
pltBar = fig.add_subplot(grid[2:4, 0:2])
pltTable = fig.add_subplot(grid[0:2, 1])

pltLine.plot(dfHamWords['freqAcum'], range(len(dfHamWords)))
pltLine.set_ylabel('Number of Words')
pltLine.set_xlabel('Acumulative Frequence')
pltLine.set_title('Line Graph of acumulative frequence')
pltLine.grid(True)


# Cumulative frequency at a few fixed ranks. Ranks beyond the vocabulary size
# are skipped because .loc raises KeyError for labels that do not exist.
sampleRanks = [rank for rank in [0, 20, 50, 100, 500, 1000, 5000, 10000, 15000] if rank < len(dfHamWords)]
dfHWord = dfHamWords.loc[sampleRanks, ['freqAcum']]

if not dfHWord.empty:
    pltTable.table(cellText=dfHWord.values, rowLabels=dfHWord.index, colLabels=dfHWord.columns, loc='best')
pltTable.axis('off')
pltTable.set_title('Table of acumulative frequence')


topHamWords = dfHamWords.head(20)
pltBar.bar(topHamWords['word'], topHamWords['freq'], align='center')
pltBar.set_title('20 Most Used word')

# Tick positions are fixed before their labels are set, and there is one tick
# per plotted word even when fewer than 20 words exist.
yPos = np.arange(len(topHamWords))
pltBar.set_xticks(yPos)
pltBar.set_xticklabels(topHamWords["word"], rotation=60)

plt.show()



#### Word List

In [None]:
# Alphabetically sorted list of the distinct words used in spam emails.
spamWordLists = df.loc[df['isSpam'] == True, "wordsList"].values
tempSet = sorted(set(chain.from_iterable(spamWordLists)))

print(tempSet)

In [None]:
# Third-party spell checker, imported here because only this cell uses it.
from spellchecker import SpellChecker

spell = SpellChecker()

# find those words that may be misspelled
misspelled = spell.unknown(tempSet)

print(misspelled)

In [None]:
# Spot check: lemmatize a single word, without passing a part-of-speech
# argument, and display the result.
lem = WordNetLemmatizer()
words = lem.lemmatize('accommodations')

words

## Modeling

### Splitting and preparing the dataset for modeling

In [None]:
# Distinct one-character words still present in the word lists.
l = {word for word in chain.from_iterable(df["wordsList"].values) if len(word) == 1}

l

In [None]:
def _wordFrequencyFrame(counter, total):
    """
    Build a frequency table from a word counter

    Parameters
    ----------
    counter: Counter
        Occurrences of each word
    total: number
        Value every occurrence count is divided by

    Returns
    ----------
    dataframe
        Columns word, occur, freq and freqAcum (cumulative sum of freq),
        sorted from the most to the least frequent word
    """
    frame = pd.DataFrame(list(counter.items()), columns=['word', 'occur'])
    frame['freq'] = frame['occur'] / total
    frame = frame.sort_values(by='freq', ascending=False).reset_index(drop=True)
    frame['freqAcum'] = frame['freq'].cumsum()
    return frame


def _sampleCumulativeFrequency(frame, ranks):
    """Return freqAcum at the given ranks, skipping ranks the frame lacks."""
    # .loc raises KeyError for labels that do not exist in the index.
    return frame.loc[[rank for rank in ranks if rank < len(frame)], ['freqAcum']]


totalOcurrG = sum(count.values())
totalSpamOcurrG = sum(countSpam.values())
totalHamOcurrG = sum(countHam.values())

# NOTE(review): all three tables are divided by the overall total, so the spam
# and ham frequencies are shares of the whole corpus and their cumulative sums
# stay below 1. This matches the original computation; confirm it is intended.
dfWordsG = _wordFrequencyFrame(count, totalOcurrG)
dfSpamWordsG = _wordFrequencyFrame(countSpam, totalOcurrG)
dfHamWordsG = _wordFrequencyFrame(countHam, totalOcurrG)

fig = plt.figure(figsize=(18, 6))
grid = plt.GridSpec(1, 3, wspace=0.2)
pltTotal = fig.add_subplot(grid[0, 0])
pltSpam = fig.add_subplot(grid[0, 1])
pltHam = fig.add_subplot(grid[0, 2])

# Each curve is drawn from the frame built in this cell, so the plots agree
# with the tables below and do not depend on frames from previous cells.
pltTotal.plot(dfWordsG['freqAcum'], range(len(dfWordsG)))
pltTotal.set_ylabel('Total Words')
pltTotal.set_xlabel('Acumulative Frequence')
pltTotal.set_title('All Words')
pltTotal.grid(True)

pltSpam.plot(dfSpamWordsG['freqAcum'], range(len(dfSpamWordsG)))
pltSpam.set_ylabel('Total Spam Words')
pltSpam.set_xlabel('Acumulative Frequence')
pltSpam.set_title('Spam Words')
pltSpam.grid(True)

pltHam.plot(dfHamWordsG['freqAcum'], range(len(dfHamWordsG)))
pltHam.set_ylabel('Total Ham Words')
pltHam.set_xlabel('Acumulative Frequence')
pltHam.set_title('Ham Words')
pltHam.grid(True)


fig2 = plt.figure(figsize=(18, 6))
grid2 = plt.GridSpec(1, 6, wspace=0.4)

pltTotalTableG = fig2.add_subplot(grid2[0, 0:2])
pltSpamTableG = fig2.add_subplot(grid2[0, 2:4])
pltHamTableG = fig2.add_subplot(grid2[0, 4:6])

sampleRanksG = [0, 50, 100, 500, 1000, 5000, 10000, 15000]

dfTWordG = _sampleCumulativeFrequency(dfWordsG, sampleRanksG)
if not dfTWordG.empty:
    pltTotalTableG.table(cellText=dfTWordG.values, rowLabels=dfTWordG.index, colLabels=dfTWordG.columns, loc='best')
pltTotalTableG.axis('off')

dfSWordG = _sampleCumulativeFrequency(dfSpamWordsG, sampleRanksG)
if not dfSWordG.empty:
    pltSpamTableG.table(cellText=dfSWordG.values, rowLabels=dfSWordG.index, colLabels=dfSWordG.columns, loc='best')
pltSpamTableG.axis('off')

dfHWordG = _sampleCumulativeFrequency(dfHamWordsG, sampleRanksG)
if not dfHWordG.empty:
    pltHamTableG.table(cellText=dfHWordG.values, rowLabels=dfHWordG.index, colLabels=dfHWordG.columns, loc='best')
pltHamTableG.axis('off')


plt.show()
