In [55]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re
## for Bigram generator
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
## for Trigram generator
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

In [39]:
df = pd.read_csv('emails.csv')

In [40]:
df_bigram = df[:5]
df_bigram

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [41]:
df_bigram.drop(columns = 'spam', inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [42]:
def clean_email(text):
    text = text.lower()
    text = text.replace("subject:", '')
    text = re.sub('[^\sa-zA-Z]', '', text)
    return text

In [43]:
df_bigram['text1'] = df_bigram['text'].apply(clean_email)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [44]:
df_bigram

Unnamed: 0,text,text1
0,Subject: naturally irresistible your corporate...,naturally irresistible your corporate identit...
1,Subject: the stock trading gunslinger fanny i...,the stock trading gunslinger fanny is merril...
2,Subject: unbelievable new homes made easy im ...,unbelievable new homes made easy im wanting ...
3,Subject: 4 color printing special request add...,color printing special request additional i...
4,"Subject: do not have money , get software cds ...",do not have money get software cds from here...


In [45]:
def tokenize_email(text):
    token = word_tokenize(text)
    stop = stopwords.words('english')
    words = [word for word in token if word not in stop]
    punctuations = list(string.punctuation)
    words = [word for word in words if word not in punctuations]
    return words

In [46]:
df_bigram['tokenized'] = df_bigram['text1'].apply(tokenize_email)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [47]:
df_bigram

Unnamed: 0,text,text1,tokenized
0,Subject: naturally irresistible your corporate...,naturally irresistible your corporate identit...,"[naturally, irresistible, corporate, identity,..."
1,Subject: the stock trading gunslinger fanny i...,the stock trading gunslinger fanny is merril...,"[stock, trading, gunslinger, fanny, merrill, m..."
2,Subject: unbelievable new homes made easy im ...,unbelievable new homes made easy im wanting ...,"[unbelievable, new, homes, made, easy, im, wan..."
3,Subject: 4 color printing special request add...,color printing special request additional i...,"[color, printing, special, request, additional..."
4,"Subject: do not have money , get software cds ...",do not have money get software cds from here...,"[money, get, software, cds, software, compatib..."


In [53]:
##words1 = ['naturally', 'irresistible', 'cute', 'chocolate', 'tasty', 'food']
def bigram_generator(text):
    bigram_collocation = BigramCollocationFinder.from_words(text)
    bigramlist = bigram_collocation.nbest(BigramAssocMeasures.likelihood_ratio, 10)
    return bigramlist    

In [54]:
df_bigram['bigram'] = df_bigram['tokenized'].apply(bigram_generator)
df_bigram

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,text,text1,tokenized,bigram
0,Subject: naturally irresistible your corporate...,naturally irresistible your corporate identit...,"[naturally, irresistible, corporate, identity,...","[(aim, hotat), (amount, changes), (benefits, c..."
1,Subject: the stock trading gunslinger fanny i...,the stock trading gunslinger fanny is merril...,"[stock, trading, gunslinger, fanny, merrill, m...","[(attainder, penultimate), (bedtime, nameable)..."
2,Subject: unbelievable new homes made easy im ...,unbelievable new homes made easy im wanting ...,"[unbelievable, new, homes, made, easy, im, wan...","[(advantage, limited), (approval, form), (appr..."
3,Subject: 4 color printing special request add...,color printing special request additional i...,"[color, printing, special, request, additional...","[(additional, information), (e, mail), (form, ..."
4,"Subject: do not have money , get software cds ...",do not have money get software cds from here...,"[money, get, software, cds, software, compatib...","[(along, best), (best, yet), (comedies, ended)..."


In [56]:
def trigram_generator(text):
    trigram_collocation = TrigramCollocationFinder.from_words(text)
    # Top 10 most occurring collocations
    trigramlist = trigram_collocation.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
    return trigramlist

In [57]:
df_bigram['trigram'] = df_bigram['tokenized'].apply(trigram_generator)
df_bigram

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,text,text1,tokenized,bigram,trigram
0,Subject: naturally irresistible your corporate...,naturally irresistible your corporate identit...,"[naturally, irresistible, corporate, identity,...","[(aim, hotat), (amount, changes), (benefits, c...","[(aim, hotat, nowadays), (amount, changes, ext..."
1,Subject: the stock trading gunslinger fanny i...,the stock trading gunslinger fanny is merril...,"[stock, trading, gunslinger, fanny, merrill, m...","[(attainder, penultimate), (bedtime, nameable)...","[(bedtime, nameable, attire), (boar, duane, pl..."
2,Subject: unbelievable new homes made easy im ...,unbelievable new homes made easy im wanting ...,"[unbelievable, new, homes, made, easy, im, wan...","[(advantage, limited), (approval, form), (appr...","[(advantage, limited, time), (approval, form, ..."
3,Subject: 4 color printing special request add...,color printing special request additional i...,"[color, printing, special, request, additional...","[(additional, information), (e, mail), (form, ...","[(form, pdf, format), (order, form, pdf), (pri..."
4,"Subject: do not have money , get software cds ...",do not have money get software cds from here...,"[money, get, software, cds, software, compatib...","[(along, best), (best, yet), (comedies, ended)...","[(along, best, yet), (best, yet, tradgedies), ..."


In [58]:
## copy to excel
df_bigram.to_excel('FinaloutputNLP.xlsx')