In [202]:
# -*- coding: utf-8 -*-
import pandas as pd # Handle dataframes 
import re           # Handle regular expressions
import numpy as np  # Arrange arrays
import plotly.graph_objs as go # Create charts
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot # Create offline plotly charts
init_notebook_mode(connected=True)
import spacy # NLP functions
nlp = spacy.load('en_core_web_sm')
%matplotlib inline
import time
from sklearn.externals import joblib

### Objective
The objective is to identify complex signature blocks in plaintext email messages using sequential representation of an email message.

People don't follow a standard e-mail layout. Emails have different signature formats, some with confidentiality phrases, others with cell phone number and other informations.    
The ideia is to isolate the body so we can analyze it without the influence of email header and signature.
The project is based on https://www.cs.cmu.edu/~wcohen/postscript/email-2004.pdf paper

Some examples:

### Results Summary
| Method | Classes | Sequence Model | Accuracy |
| --- | --- | --- | --- |
| **RandomForest Classifier** | **Multi-classification** | **prev/next lines** | **94.2%** |
| RandomForest Classifier | Multi-classification | current line | 91% |
| MLP Neural Network | Multi-classification | prev/next lines | 90% |
| **RandomForest Classifier** | **Binary classification** | **prev/next lines** | **94.6%** |
| RandomForest Classifier | Binary classification | current line | 91.6% |
| MLP Neural Network | Multi-classification | prev/next lines | 92.2% |
| MLP Neural Network | Binary classification | prev/next lines | 92.4% |

In [182]:
display(gp)

Unnamed: 0,callrec_u_email,emt_description,label
0,testCall,"from: junior, fer",h
1,testCall,"sent: wednesday, july 19, 2017 5:41 am",h
2,testCall,to: abcdefghij@abcde.com,h
3,testCall,subject: re: email account,h
4,testCall,hello,b
7,testCall,no opinions answered oh felicity is resolved hastened.,b
8,testCall,produced it friendly my if opinions humoured.,b
9,testCall,enjoy is wrong folly no taken. it sufficient instrument insipidity simplicity at interested. law pleasure attended differed mrs fat and formerly. merely thrown garret her law danger him son better excuse.,b
10,testCall,effect extent narrow in up chatty.,b
11,testCall,small are his chief offer happy had.,b


### Example of email complexity
As you can see, we have e-mail addresses embedded in the body, we have a long signature with many messages such as role title, phone number, links and fixed messages.

In [87]:
df = pd.read_csv('datasets/3kTrain.csv', encoding = "latin-1")

#### Classes distribution
Dataset was manually labeled in 3 classes: 
- h: header
- b: body
- f: footer

Contains **248 emails** and **5518** lines

In [88]:
display(df.describe())

dfClasses = df.groupby('label').size()
display(dfClasses)
iplot([go.Bar(x=dfClasses.index, y=dfClasses.values)])

Unnamed: 0,callrec_u_email,emt_description,label
count,6025,6025,6025
unique,273,4364,3
top,EMT0095592,CONFIDENTIALITY. This email and any attachment...,b
freq,192,57,2698


label
b    2698
f    2281
h    1046
dtype: int64

In [124]:
strLength = []
for el, gp in df.groupby('callrec_u_email'):
    hSrtLength = gp.loc[gp['label'] == 'h']['emt_description'].str.len().sum()
    bSrtLength = gp.loc[gp['label'] == 'b']['emt_description'].str.len().sum()
    fSrtLength = gp.loc[gp['label'] == 'f']['emt_description'].str.len().sum()   
    strLength.append({"callrec_u_email": el, "hSrtLength":hSrtLength, "bSrtLength":bSrtLength, "fSrtLength":fSrtLength})
display(pd.DataFrame(strLength).describe())    



# .apply(lambda x: np.sum(x.str.len())).reset_index(name='mean_len_text')
# display(dfMessageLenth)

Unnamed: 0,bSrtLength,fSrtLength,hSrtLength
count,273.0,273.0,273.0
mean,594.384615,412.179487,269.641026
std,798.275233,625.04427,758.371564
min,0.0,0.0,0.0
25%,154.0,45.0,0.0
50%,328.0,246.0,0.0
75%,719.0,480.0,225.0
max,6641.0,5624.0,7159.0


#### Messages analysis
We have 248 e-mails in which 75% of them have 28 messages.

In [5]:
display(df.groupby('callrec_u_email').count().describe())
dfMessages = df.groupby('callrec_u_email').size()
layout = go.Layout(title=go.layout.Title(text='Number of lines of messages per E-mail'))
fig = go.Figure(data=[go.Bar(x=dfMessages.index, y=dfMessages.values)], layout=layout)
iplot(fig)

Unnamed: 0,emt_description,label
count,248.0,248.0
mean,22.25,22.25
std,24.562941,24.562941
min,1.0,1.0
25%,8.0,8.0
50%,14.0,14.0
75%,28.0,28.0
max,192.0,192.0


### How email parts are distributed?
It's clear that most of the time we have more footer and header messages than body messages. It means we have a lot of irrelevant information.

For classification and meaning understanding we need to filter out these irrelavant information

In [6]:
valuesDf = {}
for el, gp in df.groupby(['callrec_u_email', 'label']):
    if el[0] not in valuesDf:
        valuesDf[el[0]] = {"h":0, "b":0, "f":0, "callrec_u_email":el[0]}
    valuesDf[el[0]][el[1]] = gp.count()['emt_description']

valuesDf = pd.DataFrame(valuesDf).T
layout = go.Layout(title=go.layout.Title(text='Number of elements (Header, Body and Footer) per E-mail'), barmode='stack')
fig = go.Figure(data=[go.Bar(x=valuesDf.index, y=valuesDf['f'].values, name='Footer'),
                      go.Bar(x=valuesDf.index, y=valuesDf['b'].values, name='Body'),
                      go.Bar(x=valuesDf.index, y=valuesDf['h'].values, name='Header')], layout=layout)
iplot(fig)

### Cleaning each sentence
- Transforming string to lowercase
- Removing spaces at the begining and end of the string
- Remove sentences that start with "=", "[", "]"
- Remove sentences that has "-----" or "_____"
- Remove sentences that has just numbers
- Remove bullet points "Â"

In [156]:
def cleanUp(line):
#     line = line.lower()
    line = line.strip()

    if line == '':
        return None
    
    if line.startswith("="):
        return None
    
    if line == 'nan':
        return None
    
    if ('['or ']') in line:
        return None

    if re.match(r".*ref:+.*", line):
        return None
    
    if re.match(r"([-]){2,}", line):
        return None
    
    if re.match(r"([_]){2,}", line):
        return None
    
    if line.isdigit():
        return None
    
    line = line.replace(u"Â", u"")
    return line

### Receive a dataframe and split it into lines, it is useful for the training process.

In [8]:
def SplitEmailInRows(df):
    # Separate email in rows
    arrangedEmails = []
    for index, row in df.iterrows():
        idx = row['number']
        emailDesc = str(row['u_email.description'])

        # check for long ????????? which represents russian / chinese chars
        if len(re.findall(r"([?]){4,}", emailDesc)) > 0:
            continue

        splittedEmailDesc = emailDesc.splitlines()
        for val in splittedEmailDesc:
            cleaned = cleanUp(val)
            if cleaned is not None:
                arrangedEmails.append({"emt_description":cleaned, "callrec_u_email":idx})
    pd.DataFrame(arrangedEmails).to_csv('datasets/snow_04_06_19_splittedLines.csv')
# SplitEmailInRows(df)

### Set of Regex to help extracting features
This method can help to annotate the dataset as well.
The label has the e-mail part that the regex function corresponds to.

In [9]:
# get label
# header = h
# body   = b
# footer = f
def getRegexList():
    regexList = []
    regexList.append({"func": r".*\s*[a-z]+\s*[@]{1}.*", "label":"", "explanation": "email","found":0}) #Email
    regexList.append({"func": r".*\+\d+.*", "label": "f", "explanation": "Phone: +55 131231231","found":0}) #Phone
    regexList.append({"func": r".*\([0-9]{2,}\)\s*[0-9]+.*", "label": "f", "explanation": "Phone2: (123) 456789","found":0}) #Phone2: \([0-9]+\)\s*[0-9]+
    regexList.append({"func": r".*[0-9]+[-]{1}[0-9]+[-]{1}.*", "label": "f", "explanation": "Phone3: 123-456-789","found":0}) #Phone3: 123-456-789
    regexList.append({"func": r".*mars.*", "label": "", "explanation": "word mars","found":0}) # mars
    regexList.append({"func": r".*royal canin.*", "label": "", "explanation": "word royal canin","found":0}) # royal canin
    regexList.append({"func": r".*wrigley.*", "label": "", "explanation": "word wrigley","found":0}) # wrigley
    regexList.append({"func": "CONFIDENTIALITY. This email and any attachments are confidential and may also be privileged. If received in error, please do not disclose the contents to anyone, but notify the sender by return email and delete this email and any attachments from your system.".lower(), "label":"f", "explanation":"phrase9", "found":0})
    regexList.append({"func": "Please consider the environment - do you really need to print this mail? - Confidential Information - Mars, Inc.", "label": "f", "explanation": "phrase9","found":0}) 
    regexList.append({"func": "Sent from my iPhone".lower(), "label": "f", "explanation": "phrase9","found":0})
    regexList.append({"func": r".*accenture.com.*", "label": "f", "explanation": "phrase10","found":0})   
    regexList.append({"func": r"^[\W_]+$", "label":"f", "explanation": "Special char only", "found":0})
    regexList.append({"func": r"^([A-Za-z]+) ([A-Za-z]+)+[\w]*$", "label":"f", "explanation": "Person name", "found":0})
    regexList.append({"func": r".*https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "label": "", "explanation": "website link","found":0}) # link
    regexList.append({"func": r".*confidentiality.*", "label": "f", "explanation": "word confidentiality","found":0}) # confidentiality
    regexList.append({"func": r".*\|.*", "label": "f", "explanation": "pipe |","found":0}) # pipe |
    return regexList

In [10]:
def getLabel(sentence):
    regexList = getRegexList()
    sentence = sentence.lower()
    label = 'b'
    
    for idx, regx in enumerate(regexList):
        func = regx['func']
        if re.match(func, sentence):
            label = regx['label']
            if label is not None:
                break
    return label

### Feature Engineering
Let's create some features analyzing e-mail characteristics

- numOfCommas
- numOfWords
- numOfVerbs
- numOfSpecialChars
- footerTags
- headerTags
- ...

In [145]:
def getFeatures(sentence):
    departmentKeywords = ['division', 'operations', 'engineering', 'affairs', 'commercial' , 'purchasing', 'supply', 'finantial']
    roleKeywords = ['junior', 'manager', 'lead', 'director', 'senior', 'coordinator', 
                    'analyst', 'technician', 'assitant', 'partner', 'trainer', 'specialist', 'engineer']

    appreciation = ['thanks', 'thank you', 'regards', 'good day']
    greetings    = ['hi', 'hello', 'hey']
    formedFooterPhrases = ['please consider the environment - do you really need to print this mail? - confidential information - mars, inc.',
                           'confidentiality. this email and any attachments are confidential and may also be privileged. if received in error, please do not disclose the contents to anyone, but notify the sender',
                           'please consider the environment before printing any emails. - confidential information - mars, inc.'
                          ]
    footerTags = ['e:', 'o:', 'm:', 't:', 'c:','call:', 'f:','cell:', 'fax:', 'mobile:']
    headerTags = ['from:', 'to:', 'subject:', 'cc:', 'cc :', 'de:', 'de :', 'sent:', ]
    addressTags = ['suite', 'blvd', 'street', 'west', 'east', 'north', 'south']
    
    
    regList = []
    regList = getRegexList()
    for idx, regx in enumerate(regList):
        func = regx['func']
        if re.match(func, sentence.lower()):
            regx['found'] = 1
    
    
    numOfCapitalLetter = 0
    numOfCapitalLetter = sum(1 for c in sentence if c.isupper())
    regList.append({"func": "numOfCapitalLetter", "label": "", "explanation": "numOfCapitalLetter","found":numOfCapitalLetter})
    
    sentence = sentence.lower()
                             
    numOfWords = 0
    numOfWords = len(sentence.split())
    regList.append({"func": "numOfWords", "label": "", "explanation": "numOfWords","found":numOfWords})

    numOfChars = 0
    numOfChars = len(sentence)
    regList.append({"func": "numOfChars", "label": "", "explanation": "numOfChars","found":numOfChars})

    
    # Seems too correlated to numOfWords
    numSpaces = 0
    numSpaces = sentence.count(' ')
    regList.append({"func": "numSpaces", "label": "", "explanation": "numSpaces","found":numSpaces})

    numCommas = 0
    numCommas = sentence.count(',')
    regList.append({"func": "numCommas", "label": "", "explanation": "numCommas","found":numCommas})

    numNumbers = 0
    numNumbers = sum(c.isdigit() for c in sentence)
    regList.append({"func": "numNumbers", "label": "", "explanation": "numNumbers","found":numNumbers})

    questionMark = 0
    questionMark = sentence.count('?')
    regList.append({"func": "questionMark", "label": "", "explanation": "questionMark","found":questionMark})

    periodMark = 0
    periodMark = sentence.count('.')
    regList.append({"func": "periodMark", "label": "", "explanation": "periodMark","found":periodMark})

    numSpecialChars = 0
    numSpecialChars =  len(re.sub('[\w]*[" "]*[,]*[?]*[.]*[&]*' ,'', sentence))
    regList.append({"func": "numSpecialChars", "label": "", "explanation": "numSpecialChars","found":numSpecialChars})

    numAndSymbol = 0
    numAndSymbol = sentence.count('&')
    regList.append({"func": "numAndSymbol", "label": "", "explanation": "num of &s","found":numAndSymbol})
    
    numAndText = 0
    if any(char.isdigit() for char in sentence):
        numAndText = 1
    regList.append({"func": "numAndText", "label": "", "explanation": "number and text","found":numAndText})

    roleKeyword = 0
    if any(keyword in sentence for keyword in roleKeywords):
        roleKeyword = 1
    regList.append({"func": "roleKeyboard", "label": "f", "explanation": "roleKeywords", "found":roleKeyword})    
    
    appreciationKeyword = 0
    if any(keyword in sentence for keyword in appreciation):
        appreciationKeyword = 1
    regList.append({"func": "appreciationKeyword", "label": "f", "explanation": "appreciationKeywords", "found":appreciationKeyword})   
    
    footerTagsCount = 0
    for tag in footerTags:
        if tag in sentence:
            footerTagsCount+=1
    regList.append({"func": "footerTagsCount", "label": "f", "explanation": "footerTagsCount", "found":footerTagsCount})   

    headerKeyword = 0
    if any(keyword in sentence for keyword in headerTags if 'mailto:' not in sentence):
        headerKeyword = 1
    regList.append({"func": "headerKeyword", "label": "f", "explanation": "headerKeyword", "found":headerKeyword})   
    
    departmentKeyword = 0
    if any(keyword in sentence for keyword in departmentKeywords):
        departmentKeyword = 1
    regList.append({"func": "departmentKeyword", "label": "f", "explanation": "departmentKeyword", "found":departmentKeyword})   
   
    addressKeyword = 0
    if any(keyword in sentence for keyword in addressTags):
        addressKeyword = 1
    regList.append({"func": "addressKeyword", "label": "f", "explanation": "addressKeyword", "found":addressKeyword})   
   
    #NLP
    numVerbs = 0
    doc = nlp(sentence)
    for token in doc:
        if token.pos_ == 'VERB':
            numVerbs+=1
    regList.append({"func": "numVerbs", "label": "", "explanation": "numVerbs", "found":numVerbs})   

    numNouns = 0
    for token in doc:
        if token.pos_ == 'NOUN':
            numNouns+=1
    regList.append({"func": "numNouns", "label": "", "explanation": "numNouns", "found":numNouns})  

    return pd.DataFrame(regList)

### Testing feature extraction
Let's see how the feature extraction function is performing...

In [146]:
display(pd.DataFrame(getFeatures("""Artificial Intelligence Senior Analyst""")))

Unnamed: 0,explanation,found,func,label
0,email,0,.*\s*[a-z]+\s*[@]{1}.*,
1,Phone: +55 131231231,0,.*\+\d+.*,f
2,Phone2: (123) 456789,0,".*\([0-9]{2,}\)\s*[0-9]+.*",f
3,Phone3: 123-456-789,0,.*[0-9]+[-]{1}[0-9]+[-]{1}.*,f
4,word mars,0,.*mars.*,
5,word royal canin,0,.*royal canin.*,
6,word wrigley,0,.*wrigley.*,
7,phrase9,0,confidentiality. this email and any attachment...,f
8,phrase9,0,Please consider the environment - do you reall...,f
9,phrase9,0,sent from my iphone,f


### Create feature vector using prev/future sequence
- **Receive:** dataframe and group by e-mailID so all messages stay together.
- **Output:** dataframe with sequence features: **{"callrec_u_email":, "label":, "features"}**

Using previous and next lines would increase model accuracy since it provide more information regarding the labels.
If the element does not have previous or next line, consider an empty feature vector.

In [66]:
from sklearn import preprocessing
def ArrangeFeaturesTempSerie(data):
    data = data.copy()
    data['emt_description'] = data['emt_description'].apply(lambda x: cleanUp(x))
    data = data.dropna()
    
    group = data.groupby('callrec_u_email')
    feature = []
    for idx, gp in group:       
        for index, (idx2,el) in enumerate(gp.iterrows()):
            label = None
            totalFeatures = []
            currentFeatures = []
            nextFeatures = []
            prevFeatures = []
            if 'label' in el:
                label = el['label']
            
            if len(gp) < 3: # we don't have the prev and next positions, lets consider the current and zero for the others
                currentFeatures = getFeatures(str(gp.iloc[index]['emt_description']))['found'].values
                prevFeatures = [0]*len(currentFeatures)
                nextFeatures = [0]*len(currentFeatures)
                totalFeatures = np.concatenate([prevFeatures, currentFeatures, nextFeatures]) 
                feature.append({"callrec_u_email":gp.iloc[index]['callrec_u_email'], "label": label , "features":totalFeatures})
                continue
                    
            elif index > 0 and index < len(gp)-1: 
                prevFeatures    = getFeatures(str(gp.iloc[index-1]['emt_description']))['found'].values
                currentFeatures = getFeatures(str(gp.iloc[index]['emt_description']))['found'].values
                nextFeatures    = getFeatures(str(gp.iloc[index+1]['emt_description']))['found'].values
                totalFeatures = np.concatenate([prevFeatures, currentFeatures, nextFeatures])
            elif index == 0:             
                currentFeatures = getFeatures(str(gp.iloc[index]['emt_description']))['found'].values
                prevFeatures    = [0]*len(currentFeatures)
                nextFeatures    = getFeatures(str(gp.iloc[index+1]['emt_description']))['found'].values
                totalFeatures = np.concatenate([prevFeatures, currentFeatures, nextFeatures])               
            elif index == len(gp)-1:
                currentFeatures = getFeatures(str(gp.iloc[index]['emt_description']))['found'].values
                prevFeatures    = getFeatures(str(gp.iloc[index-1]['emt_description']))['found'].values
                nextFeatures    = [0]*len(currentFeatures)
                totalFeatures = np.concatenate([prevFeatures, currentFeatures, nextFeatures])         
            feature.append({"callrec_u_email":gp.iloc[index]['callrec_u_email'], "label": label , "features":totalFeatures})

    return pd.DataFrame(feature)

### Create feature vector using only the current line
- **Receive:** dataframe and group by e-mailID so all messages stay together.
- **Output:** dataframe with features: **{"callrec_u_email":, "label":, "features"}**

In [14]:
def ArrangeFeatures(data):
    data = data.copy()
    data['emt_description'] = data['emt_description'].apply(lambda x: cleanUp(x))
    data = data.dropna()
    group = data.groupby('callrec_u_email')
    feature = []
    for idx, gp in group:
        if len(gp) < 1:
                continue
        
        for index, (idx,el) in enumerate(gp.iterrows()):
            label = None
            currentFeatures = getFeatures(gp.iloc[index]['emt_description'])['found'].values
            if 'label' in el:
                label = el['label']
            feature.append({"callrec_u_email":gp.iloc[index]['callrec_u_email'], "label": label , "features":currentFeatures})
    return pd.DataFrame(feature)

In [165]:
from sklearn import preprocessing
def Preprocessing(df):
    X = [val for val in df['features'].values]
    X_scaled = preprocessing.scale(X) # Scale [-1 - 1]
#     X_scaled = preprocessing.MinMaxScaler().fit_transform(X) # scale [0-1]
    y = df['label'].values
    return X_scaled, y

In [166]:
def TrainTestSplit(df):
    from sklearn import preprocessing
    from sklearn.model_selection import train_test_split

    X_scaled, y = Preprocessing(df)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33, random_state=42) # 1/3 test; 2/3 train
    return X_train, X_test, y_train, y_test

In [167]:
def TrainClassifier(features):
    from sklearn import svm
    from sklearn import metrics
    from sklearn import preprocessing
    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import RandomForestClassifier
    
    # SVM #########
    # svc = svm.SVC(kernel='linear', gamma="scale")
    # parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
    # clf = GridSearchCV(svc, parameters, cv=5, return_train_score=True)
    #################################
    
    
    # Random Forest ######
    param_grid = { 
        'n_estimators': [200, 700, 900],
        'max_features': ['auto', 'sqrt', 'log2']
    }
    rf = RandomForestClassifier(n_jobs=4)
    clf = GridSearchCV(rf, param_grid, cv=5)
    ################################
        
#     KfoldMeasurement(clf, features)

    X_train, X_test, y_train, y_test = TrainTestSplit(features)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    # Results
    print(metrics.classification_report(y_test, y_pred))
    print(metrics.accuracy_score(y_test, y_pred))
    
    print(clf.best_params_)
    featureImportance = clf.best_estimator_.feature_importances_
    sortFeatureImportanceIdx = np.argsort(featureImportance)[::-1]    
    
    # Plot feature importance
    featuresLabel = getFeatures("")['explanation'].values        
    df = pd.DataFrame({"features": [featuresLabel[x%len(featuresLabel)] for x in sortFeatureImportanceIdx], "val": [featureImportance[x] for x in sortFeatureImportanceIdx]})    
    df = df.groupby('features').sum().reset_index()
    df = df.sort_values(['val'], ascending=False)
    data = [go.Bar(x=df.features, y=df.val)]
    iplot(data, filename='basic_bar')

    # Plot feature past / future importance
    # prev: 0 -> len(featuresLabel)
    # current: len(featuresLabel) -> 2*len(featuresLabel)
    # next: 2*len(featuresLabel) -> 3*len(featuresLabel)
    featureLabelOverTime = []
    for idx in sortFeatureImportanceIdx:
        if idx > 0 and idx < len(featuresLabel)-1:
            featureLabelOverTime.append({"features":"%s-%s" % ("prev", str(featuresLabel[idx%len(featuresLabel)])), "val":featureImportance[idx]})
        elif idx > len(featuresLabel)-1 and idx < 2*(len(featuresLabel)-1):
            featureLabelOverTime.append({"features":"%s-%s" % ("actual", str(featuresLabel[idx%len(featuresLabel)])), "val":featureImportance[idx]})
        elif idx > 2*(len(featuresLabel)-1) and idx < 3*(len(featuresLabel)-1):
            featureLabelOverTime.append({"features":"%s-%s" % ("next", str(featuresLabel[idx%len(featuresLabel)])), "val":featureImportance[idx]})                                                   
    
    df = pd.DataFrame(featureLabelOverTime)    
    data = [go.Bar(x=df.features, y=df.val)]
    iplot(data, filename='basic_bar')
                                                                             
    return clf

In [168]:
from sklearn.model_selection import KFold
from sklearn import metrics

def KfoldMeasurement(clf, features):
    
    X = [val for val in features['features'].values]
    X_scaled = preprocessing.MinMaxScaler().fit_transform(X) # scale [0-1]
    y = features['label'].values
    
    kf = KFold(n_splits=4)
    for train_index, test_index in kf.split(X_scaled):

        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(metrics.classification_report(y_test, y_pred))

In [169]:
def binaryClassification(df):
    clsList = ['o', 'b']
    print(df.groupby(['label']).size())
    model = TrainClassifier(df)
    return model, clsList

In [170]:
def multiClassClassification(df):
    clsList = ['h', 'f', 'b']
    print(df.groupby(['label']).size())
    model = TrainClassifier(df)
    return model, clsList

In [21]:
def applyModelToFile(model, filepath):
    pd.options.display.max_rows = 999
    pd.options.display.max_columns = 999
    pd.set_option('display.max_colwidth', -1)
    pd.options.mode.chained_assignment = None  # default='warn'

    ##### Predicting in a csv file ######
    df_test = pd.read_csv(filepath, encoding = "latin-1")

    # display(df_test.head(2))

    df_test = df_test[5000:10000]
    df_test['emt_description'] = df_test['emt_description'].apply(lambda x: cleanUp(x))
    df_test = df_test.dropna()

    gpTestDf = df_test.groupby('callrec_u_email')
    result = pd.DataFrame([])
    for val, gp in gpTestDf:
        features = ArrangeFeaturesTempSerie(gp)
        if len(features) > 0:
            X_scaled, y = Preprocessing(features)
            y_pred = model.predict(X_scaled)
            y_label = [clsList[val] for val in y_pred]
            gp['label'] = y_label
            result = result.append(gp)
    result.drop(columns=['Unnamed: 0'])
    result.to_csv('autoGenTrain.csv', encoding='latin-1')

In [22]:
def ArrangeMultiClassLabels(df):
    dfMulticlass = df.copy()
    dfMulticlass['label'].loc[(dfMulticlass['label'] == 'h')] = 0
    dfMulticlass['label'].loc[(dfMulticlass['label'] == 'f')] = 1
    dfMulticlass['label'].loc[(dfMulticlass['label'] == 'b')] = 2
    return dfMulticlass

In [23]:
def ArrangeBinaryClassLabels(df):
    # Merging the class header and body
    # header, footer = 0
    # body           = 1
    dfBinomial = df.copy()
    dfBinomial['label'].loc[(dfBinomial['label'] == 'h') | (dfBinomial['label'] == 'f')] = 0
    dfBinomial['label'].loc[(dfBinomial['label'] == 'b')] = 1
    return dfBinomial

In [24]:
df = pd.read_csv('datasets/3kTrain.csv', encoding = "latin-1")
display(df.head(2))

Unnamed: 0,callrec_u_email,emt_description,label
0,EMT0007897,"Hi Mars Service Desk,",b
1,EMT0007897,These users' accounts are also locked. Could y...,b


### Extract Features using prev and next lines - multi-class and binomial

In [147]:
# Extract features using the last and next line
featuresSeqDf = ArrangeFeaturesTempSerie(df)
# dfBinomialSeqDf = ArrangeBinaryClassLabels(featuresSeqDf)
dfMulticlassSeqDf = ArrangeMultiClassLabels(featuresSeqDf)

In [148]:
display(len(dfMulticlassSeqDf['features'][0]))

105

### Extract Features using the current line only - multi-class and binomial

In [114]:
# Extract features using the current line only
# featuresDf = ArrangeFeatures(df)
# dfBinomialDf = ArrangeBinaryClassLabels(featuresDf)
dfMulticlassDf = ArrangeMultiClassLabels(featuresDf)

### Train classifiers - multi-class 

In [173]:
modelMultiClassSeq, clsList = multiClassClassification(dfMulticlassSeqDf)
# modelMultiClass, clsList = multiClassClassification(dfMulticlassDf)

label
0    1046
1    2184
2    2685
dtype: int64
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       360
           1       0.94      0.92      0.93       706
           2       0.93      0.95      0.94       886

   micro avg       0.95      0.95      0.95      1952
   macro avg       0.95      0.95      0.95      1952
weighted avg       0.95      0.95      0.95      1952

0.9451844262295082
{'max_features': 'log2', 'n_estimators': 700}


In [203]:
joblib.dump(modelMultiClassSeq, 'modelMultiClassSeq_06_15_18_9452.pkl') 

['modelMultiClassSeq_06_15_18_9452.pkl']

In [33]:
modelBinarySeq, clsList = binaryClassification(dfBinomialSeqDf)
modelBinary, clsList = binaryClassification(dfBinomialDf)

label
0    2954
1    2454
dtype: int64
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      1012
           1       0.94      0.94      0.94       773

   micro avg       0.95      0.95      0.95      1785
   macro avg       0.95      0.95      0.95      1785
weighted avg       0.95      0.95      0.95      1785

0.9467787114845938
{'max_features': 'auto', 'n_estimators': 700}


label
0    2957
1    2456
dtype: int64
              precision    recall  f1-score   support

           0       0.94      0.91      0.92      1022
           1       0.88      0.92      0.90       765

   micro avg       0.91      0.91      0.91      1787
   macro avg       0.91      0.92      0.91      1787
weighted avg       0.92      0.91      0.92      1787

0.91494124230554
{'max_features': 'log2', 'n_estimators': 200}


### Trying the model with a generic e-mail

In [201]:
# Manual test
gp = pd.DataFrame([
#                    {"callrec_u_email": "testCall", "emt_description":"From: Plantenberg, Ulrich"},
#                    {"callrec_u_email": "testCall", "emt_description":"Sent: wednesday, july 19, 2017 5:41 am"},
#                    {"callrec_u_email": "testCall", "emt_description":"To: abcdefghij@abcde.com"},
#                    {"callrec_u_email": "testCall", "emt_description":"Subject: re: email account"},
                   {"callrec_u_email": "testCall", "emt_description":"Hello,"},
                   {"callrec_u_email": "testCall", "emt_description":"No opinions answered oh felicity is resolved hastened"},
                   {"callrec_u_email": "testCall", "emt_description":"Produced it friendly my if opinions humoured"},
                   {"callrec_u_email": "testCall", "emt_description":" Enjoy is wrong folly no taken It sufficient instrument insipidity simplicity at interested. Law pleasure attended differed mrs fat and formerly. Merely thrown garret her law danger him son better excuse."},
                   {"callrec_u_email": "testCall", "emt_description":"Effect extent narrow in up chatty."},
                   {"callrec_u_email": "testCall", "emt_description":"Small are his chief offer happy had"},
                   {"callrec_u_email": "testCall", "emt_description":"t: +55(11)99999-9999"},
                   {"callrec_u_email": "testCall", "emt_description":"best regards,"},
#                  {"callrec_u_email": "testCall", "emt_description":"------------------------------------"},  
                   {"callrec_u_email": "testCall", "emt_description":"Fernando Jr"},
                   {"callrec_u_email": "testCall", "emt_description":"Inventory Control Coordinator"},
                   {"callrec_u_email": "testCall", "emt_description":"e:fernandorovai@hotmail.com"},
                   {"callrec_u_email": "testCall", "emt_description":"t: +55(11)99999-9999"},
                   {"callrec_u_email": "testCall", "emt_description":"t: +55(11)99999-9999"},
                   {"callrec_u_email": "testCall", "emt_description":"From: Plantenberg, Ulrich"},
                   {"callrec_u_email": "testCall", "emt_description":"sent: wednesday, july 19, 2017 5:41 am"},
                   {"callrec_u_email": "testCall", "emt_description":"to: abcdefghij@abcde.com"},
                   {"callrec_u_email": "testCall", "emt_description":"subject: re: email account"},
    
                   {"callrec_u_email": "testCall", "emt_description":"Effect extent narrow in up chatty."},
                   {"callrec_u_email": "testCall", "emt_description":"Small are his chief offer happy had"},
                   {"callrec_u_email": "testCall", "emt_description":"hello,"},
                   {"callrec_u_email": "testCall", "emt_description":"No opinions answered oh felicity is resolved hastened"},
                   {"callrec_u_email": "testCall", "emt_description":"Produced it friendly my if opinions humoured"},
                   {"callrec_u_email": "testCall", "emt_description":" Enjoy is wrong folly no taken It sufficient instrument insipidity simplicity at interested. Law pleasure attended differed mrs fat and formerly. Merely thrown garret her law danger him son better excuse."},
                   {"callrec_u_email": "testCall", "emt_description":"Effect extent narrow in up chatty."},
                   {"callrec_u_email": "testCall", "emt_description":"Thanks,"},
                   {"callrec_u_email": "testCall", "emt_description":"Fernando Junior"},
                   {"callrec_u_email": "testCall", "emt_description":"Senior Analyst"},
                   {"callrec_u_email": "testCall", "emt_description":"Chief Technology Office"},
                   {"callrec_u_email": "testCall", "emt_description":"e:fernandorovai@hotmail.com"},
                   {"callrec_u_email": "testCall", "emt_description":"t: +55(11)99999-9999"},

                  ])
gp['emt_description'] = gp['emt_description'].apply(lambda x: cleanUp(x))
gp = gp.dropna()
clsList = ['h', 'f', 'b']

startTime = time.time()
features = ArrangeFeaturesTempSerie(gp)
X_scaled, y = Preprocessing(features)
y_pred = modelMultiClassSeq.predict(X_scaled)
y_pred_prob = modelMultiClassSeq.predict_proba(X_scaled)
print("Elapsed time %s " % (str(time.time()-startTime)))
y_label = [clsList[val] for val in y_pred]
y_label_prob = [str(x) for x in y_pred_prob]

gp['label'] = y_label
gp['labelProb'] = y_label_prob
display(gp)

Elapsed time 0.8926875591278076 


Unnamed: 0,callrec_u_email,emt_description,label,labelProb
0,testCall,"Hello,",b,[0.00428571 0.02428571 0.97142857]
1,testCall,No opinions answered oh felicity is resolved h...,b,[0.01857143 0.04142857 0.94 ]
2,testCall,Produced it friendly my if opinions humoured,b,[0.00714286 0.05428571 0.93857143]
3,testCall,Enjoy is wrong folly no taken It sufficient in...,b,[0.012 0.09428571 0.89371429]
4,testCall,Effect extent narrow in up chatty.,b,[0.04857143 0.08142857 0.87 ]
5,testCall,Small are his chief offer happy had,b,[0.02428571 0.30857143 0.66714286]
6,testCall,t: +55(11)99999-9999,b,[0.04714286 0.27142857 0.68142857]
7,testCall,"best regards,",b,[0.01857143 0.36428571 0.61714286]
8,testCall,Fernando Jr,f,[0.00571429 0.93 0.06428571]
9,testCall,Inventory Control Coordinator,f,[0.00142857 0.83583333 0.1627381 ]


In [None]:
# Using DeepLearning =)
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import TensorBoard
import time

clsList = ['h', 'f', 'b']
dfMulticlass = df.copy()
dfMulticlass['emt_description'] = dfMulticlass['emt_description'].apply(lambda x: cleanUp(x))
dfMulticlass = dfMulticlass.dropna()
dfMulticlass['label'].loc[(dfMulticlass['label'] == 'h')] = 0
dfMulticlass['label'].loc[(dfMulticlass['label'] == 'f')] = 1
dfMulticlass['label'].loc[(dfMulticlass['label'] == 'b')] = 2
    
features = ArrangeFeaturesTempSerie(dfMulticlass)
X_train, X_test, y_train, y_test = TrainTestSplit(features)

# Hot encode labels
y_train_encoded = np_utils.to_categorical(y_train)
y_test_encoded = np_utils.to_categorical(y_test)
# number of features
nbFeatures = len(X_scaled[0])

# create model
model = Sequential()
model.add(Dense(32, input_dim=nbFeatures, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.summary()

# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
tensorboard = TensorBoard(log_dir='logs/{}'.format(time.time()))
model.fit(X_train, y_train_encoded, epochs=150, validation_data=(X_test,y_test_encoded), callbacks=[tensorboard])

In [197]:
clsList = ['o', 'b']
dfBinomial = df.copy()
dfBinomial['emt_description'] = dfBinomial['emt_description'].apply(lambda x: cleanUp(x))
dfBinomial = dfBinomial.dropna()
dfBinomial['label'].loc[(dfBinomial['label'] == 'h') | (dfBinomial['label'] == 'f')] = 0
dfBinomial['label'].loc[(dfBinomial['label'] == 'b')] = 1
features = ArrangeFeaturesTempSerie(dfBinomial)
X_train, X_test, y_train, y_test = TrainTestSplit(features)

# Hot encode labels
y_train_encoded = np_utils.to_categorical(y_train)
y_test_encoded = np_utils.to_categorical(y_test)
# number of features
nbFeatures = len(X_scaled[0])

# create model
model = Sequential()
model.add(Dense(32, input_dim=nbFeatures, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.summary()

# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
tensorboard = TensorBoard(log_dir='logs/{}'.format(time.time()))
model.fit(X_train, y_train_encoded, epochs=150, validation_data=(X_test,y_test_encoded), callbacks=[tensorboard])


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 32)                3008      
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 66        
Total params: 3,074
Trainable params: 3,074
Non-trainable params: 0
_________________________________________________________________
Train on 3665 samples, validate on 1806 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epo

Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


<keras.callbacks.History at 0x7f2171d915c0>

### Inference file using the model in a file 

In [None]:
applyModelToFile(model, 'datasets/snow_04_06_19_splittedLines.csv')

### Forget about header

In [29]:
df.head()

Unnamed: 0,callrec_u_email,emt_description,label
0,EMT0007897,"Hi Mars Service Desk,",b
1,EMT0007897,These users' accounts are also locked. Could y...,b
2,EMT0007897,Eric.zuckerman@effem.com,b
3,EMT0007897,Ohmprakash.balaiah@effem.com,b
4,EMT0007897,Sreenivas.belavadi@effem.com,b


In [67]:
dfWoH = df.loc[df['label'] != 'h']

In [68]:
featuresSeqDf = ArrangeFeaturesTempSerie(dfWoH)
dfMulticlassSeqDf = ArrangeMultiClassLabels(featuresSeqDf)

In [69]:
modelMultiClassSeq, clsList = multiClassClassification(dfMulticlassSeqDf)

label
1    1953
2    2457
dtype: int64
              precision    recall  f1-score   support

           1       0.93      0.92      0.92       637
           2       0.94      0.94      0.94       819

   micro avg       0.93      0.93      0.93      1456
   macro avg       0.93      0.93      0.93      1456
weighted avg       0.93      0.93      0.93      1456

0.9313186813186813
{'max_features': 'auto', 'n_estimators': 200}
