In [1]:
import collections
import re

import numpy as np
import pandas as pd
import spacy
import xgboost as xgb
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23;
# prefer the standalone joblib package and fall back only on very old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib




In [2]:
nlp = spacy.load('en_core_web_md')
stemmer = PorterStemmer()
vocabularySize = 500


In [3]:
df_train = pd.read_csv('./action-items-data-chris/train_balanced_4.csv')
df_dev = pd.read_csv('./action-items-data-chris/dev_balanced_4.csv')
df_test = pd.read_csv('./action-items-data-chris/test_balanced_labelled_4.csv')


In [4]:
print(len(df_train))
print(len(df_dev))
print(len(df_test))


1698
300
184
3


In [5]:
df_train_and_dev = pd.concat([df_train, df_dev]) #let's combine the utterances in the train and dev set
#Why? Because we want to create a vocabulary of the most common words that occur in those utterances

print(len(df_train_and_dev))


1998


In [6]:
def getRidContractions(phrase):
    """Expand common English contractions in ``phrase``.

    The (pattern, replacement) pairs are applied one after another with
    ``re.sub``, so the order matters: the two irregular forms are handled
    first, then the generic suffixes. The final rule rewrites every
    occurrence of "could" to "can".

    Parameters:
        phrase: the text to rewrite (callers in this notebook pass it lower-cased).

    Returns:
        The text with all substitutions applied.
    """
    substitutions = (
        # irregular contractions that the generic rules would mangle
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # generic suffix rules
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
        # normalise the modal so "could" and "can" share one vocabulary entry
        ("could", "can"),
    )
    for pattern, replacement in substitutions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase


In [7]:
def countWords(utterances, nlp):
    """Count how often each stemmed token occurs across ``utterances``.

    Each utterance goes through ``preProcessUtterance`` (lower-casing,
    contraction expansion, spaCy tokenisation, punctuation removal, stemming),
    i.e. exactly the pipeline later used to build feature vectors, so the
    vocabulary and the vectors are always derived from the same tokens.

    Parameters:
        utterances: iterable of utterance strings.
        nlp: the spaCy pipeline used for tokenisation.

    Returns:
        A ``collections.Counter`` mapping stemmed token -> number of occurrences,
        e.g. Counter({'is': 487, 'have': 219, 'are': 199, 'be': 148, ...}).
    """
    wordsAndFrequencies = collections.Counter()

    for utterance in utterances:
        # Tokens are counted in encounter order, so ties in most_common()
        # are broken the same way as when counting one flat list.
        wordsAndFrequencies.update(preProcessUtterance(utterance, nlp))

    return wordsAndFrequencies


In [8]:
def preProcessUtterance(utterance, nlp):
    """Turn one utterance into a list of stemmed, non-punctuation tokens.

    Steps: lower-case the text, expand contractions with
    ``getRidContractions`` (e.g. won't -> will not), tokenise with the spaCy
    pipeline ``nlp``, drop punctuation tokens, and stem each remaining token
    with the module-level nltk ``stemmer``.

    Returns:
        The stemmed tokens in their original order (repeats are kept).
    """
    expanded = getRidContractions(utterance.lower())
    return [
        stemmer.stem(token.text)  # stemming increases generalisation
        for token in nlp(expanded)
        if not token.is_punct
    ]


### Let's find out what the 500 most common words are in the training and dev set (combined)
We'll call this list of 500 words the vocabulary.
For each utterance, we'll create a 500-dimensional binary vector (a multi-hot bag-of-words vector; the code calls it a "one-hot vector"). A 1 in the kth position means the utterance contains the kth vocabulary word; a 0 means that vocabulary word does not occur in the utterance.


In [9]:
trainAndDevUtterances = df_train_and_dev['utterance'].tolist()
#justActionUtterances = df_train_and_dev['utterance'][df_train_and_dev.is_action != 0].tolist()


In [10]:
wordsFreqs = countWords(trainAndDevUtterances, nlp)


In [11]:
# Split the (word, frequency) pairs of the top `vocabularySize` entries into two parallel lists.
topEntries = wordsFreqs.most_common(vocabularySize)
mostCommonWords = [entry[0] for entry in topEntries]
theirFrequencies = [entry[1] for entry in topEntries]

print(mostCommonWords)
print(theirFrequencies)


['the', 'to', 'i', 'and', 'that', 'is', 'we', 'you', 'a', 'it', 'of', 'so', 'have', 'in', 'will', 'on', 'can', 'are', 'do', 'be', 'for', 'just', 'with', 'not', 'if', 'get', 'what', 'go', 'like', 'uh', 'thi', 'then', 'they', 'know', 'up', 'am', 'there', 'or', 'but', 'um', 'work', 'think', 'yeah', 'at', 'as', 'would', 'some', 'out', 'one', 'need', 'want', 'meet', 'wa', 'look', 'thing', 'about', 'them', 'gon', 'na', 'our', 'from', 'well', 'me', 'those', 'now', 'becaus', 'all', 'send', 'see', 'an', 'right', 'how', 'got', 'your', 'when', 'make', 'time', 'realli', 'call', 'he', 'my', 'week', 'over', 'tri', 'okay', 'more', 'back', 'kind', 'talk', 'here', 'someth', 'let', 'actual', 'next', 'say', 'probabl', 'come', ' ', 'put', 'peopl', 'use', 'start', 'where', 'take', 'should', 'were', 'also', 'mayb', 'into', 'by', 'stuff', 'sure', 'email', 'good', 'other', 'guy', 'these', 'us', 'done', 'through', 'their', 'give', 'first', 'littl', 'build', 'him', 'did', 'had', 'ani', 'today', 'two', 'help', '

#### OK, so we've created a vocabulary of the 500 most frequently occurring words in the action and non-action utterances in the training and dev sets. Now let's create a 500-dimensional one-hot vector for each utterance in the training set. <br/>

In [12]:
def createOneHotVector(utterance, nlp, mostCommonWords):
    """Encode ``utterance`` as a binary vocabulary-presence vector.

    The utterance is cleaned with ``preProcessUtterance`` (lower-cased,
    contractions expanded, punctuation dropped, tokens stemmed). Position k of
    the result is 1 if the k-th word of ``mostCommonWords`` occurs in the
    cleaned utterance and 0 otherwise. Despite the name, several positions can
    be 1 at once (a multi-hot vector).

    Parameters:
        utterance: the raw utterance string.
        nlp: the spaCy pipeline used for tokenisation.
        mostCommonWords: the vocabulary, as a list of stemmed words.

    Returns:
        A list of ``vocabularySize`` ints (0 or 1).
    """
    utteranceCleanList = preProcessUtterance(utterance, nlp)
    #above turns to lowercase, gets rid of contractions and punctuation and then it stems words

    # Map each vocabulary word to its position once, instead of scanning the
    # list twice per token. setdefault keeps the first position of a repeated
    # word, which is what list.index would return.
    vocabularyIndex = {}
    for position, vocabularyWord in enumerate(mostCommonWords):
        vocabularyIndex.setdefault(vocabularyWord, position)

    utteranceVectorList = [0] * vocabularySize

    for word in utteranceCleanList:
        indexPosition = vocabularyIndex.get(word)
        if indexPosition is not None:  # out-of-vocabulary words are ignored
            utteranceVectorList[indexPosition] = 1

    return utteranceVectorList

    

In [13]:
#I don't use this function anymore. It counted the number of named entities in an utterance and these
#frequencies were additional features the algorithms used to decide whether an utterance is an 
#action item or not. However using this information degraded the classifiers' performance and so I 
#stopped using named entity frequencies as a feature.

def countEntities(utterance,nlp):
    """Count time entities, person entities and uses of "i" in ``utterance``.

    Entities come from running the spaCy pipeline ``nlp`` on the raw
    utterance; the "i" count comes from the cleaned token list produced by
    ``preProcessUtterance``.

    Returns:
        A tuple ``(time_ents, person_ents, I)`` where ``time_ents`` counts
        entities labelled DATE or TIME, ``person_ents`` counts entities
        labelled PERSON, and ``I`` is the number of 'i' tokens.
    """
    entityLabels = [entity.label_ for entity in nlp(utterance).ents]

    time_ents = sum(1 for label in entityLabels if label in ("DATE", "TIME"))
    person_ents = sum(1 for label in entityLabels if label == "PERSON")

    I = preProcessUtterance(utterance, nlp).count('i')

    return (time_ents, person_ents, I)
            

In [14]:
trainUtterances = df_train['utterance'].tolist()
trainLabels = df_train['is_action'].tolist()
Ynumpy = np.asarray(trainLabels)
print(Ynumpy.shape)
print(Ynumpy)


(1698,)
[1 0 0 ... 1 0 0]


In [15]:
# One vocabulary-presence vector per training utterance.
# (Named-entity counts from countEntities are no longer added as features.)
X = [createOneHotVector(utt, nlp, mostCommonWords) for utt in trainUtterances]


In [16]:
Xnumpy = np.asarray(X)
print(Xnumpy.shape)


(1698, 500)


In [18]:
from sklearn.model_selection import cross_val_score
# NOTE(review): data_dmatrix_new is not referenced again in the cells shown here;
# the scikit-learn wrapper below is given the numpy arrays directly.
data_dmatrix_new = xgb.DMatrix(data=Xnumpy, label=Ynumpy)
# NOTE(review): the labels printed above are 0/1, yet the model is configured as a
# two-class 'multi:softmax' problem; 'binary:logistic' is presumably the conventional
# choice here, but switching may change the reported scores, so confirm first.
xgb_cv_new = xgb.XGBClassifier(
 learning_rate=0.1,
 num_class=2,
 n_estimators=200,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'multi:softmax',
 nthread=4,
 scale_pos_weight=1,
 seed=27
)
# 3-fold cross-validated accuracy on the training vectors only.
scores = cross_val_score(xgb_cv_new, Xnumpy,Ynumpy, cv=3, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())


Scores: [0.81657848 0.7614841  0.80884956]
Mean: 0.7956373799024011
Standard Deviation: 0.02435527353687716


### Let's find out which words the XGBoost model (an ensemble of boosted decision trees) finds most useful for distinguishing between action and non-action items

In [37]:
# NOTE(review): plot_importance and pyplot are imported but not used in the cells shown here.
from xgboost import plot_importance
from matplotlib import pyplot
# Fit on the full training matrix before reading feature_importances_.
xgb_cv_new.fit(Xnumpy,Ynumpy)
# argsort is ascending; reversing it orders feature indices from most to least important.
sorted_idx = np.argsort(xgb_cv_new.feature_importances_)[::-1]
# Print the 20 top-ranked feature indices (positions in the vocabulary vector).
for index in sorted_idx[0:20]:
    print(index)
    

14
58
16
91
183
49
93
67
52
173
57
296
97
59
2
215
81
166
212
104


In [38]:
print("The words most useful in distinguishing between action and not action items are:")
most_important_distinguishers = sorted_idx[0:20]
for i in most_important_distinguishers:
    print(mostCommonWords[i])

The words most useful in distinguishing between action and not action items are:
will
na
can
let
differ
need
next
send
wa
finish
gon
import
 
our
i
than
week
interest
everyth
should


In [20]:
joblib.dump(xgb_cv_new, "xgb_chris_spam_model.joblib.dat")


['xgb_chris_spam_model.joblib.dat']

## Let's see how the xgboost classifier performs on the dev set

In [21]:
model = joblib.load("xgb_chris_spam_model.joblib.dat")


In [22]:
devUtterances = df_dev['utterance'].tolist()
devLabels = df_dev['is_action'].tolist()
Ydev = np.asarray(devLabels)
print(len(devUtterances))
print(len(Ydev))


300
300


In [23]:
# One vocabulary-presence vector per dev utterance.
# (Named-entity counts from countEntities are no longer added as features.)
Xdev = [createOneHotVector(utt, nlp, mostCommonWords) for utt in devUtterances]


In [24]:
Xdev = np.asarray(Xdev)
#print(Xnumpy)
print(Xdev.shape)


(300, 500)


In [25]:
Ypred = model.predict(Xdev)
print(len(Ypred))
print(Ypred)


300
[0 0 0 0 1 0 0 0 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 0 1 1 1 1 0 1 1 0 1 0 0 1 1 1 0 1 0 0 0 1 0 0 1 0 0 1 1 1 0 1 0 0 0
 1 1 0 1 0 0 0 1 1 0 1 0 1 1 1 1 0 0 1 0 1 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1
 1 1 1 1 0 1 1 0 0 0 1 1 1 1 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 1 0
 1 1 1 1 0 0 0 0 1 0 0 0 0 1 1 0 1 1 0 0 1 1 0 0 0 0 1 0 1 1 1 1 0 1 1 1 1
 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 1 0 1 1 0 0 0 0 1 1 0 0
 1 1 0 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 0 1 0 1 0 1 1 1 0 0 1 1 0 1 1 1 1
 1 0 0 1 0 1 0 1 0 1 0 0 0 1 1 1 0 1 1 0 1 1 1 1 0 0 1 0 0 1 0 1 0 1 0 1 1
 0 1 1 1]


In [26]:
print(len(devLabels))
print(devLabels)


300
[0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1]


In [27]:
def evaluateResults(classifierLabels, humanLabels):
    """Compare predicted labels with human labels and report basic metrics.

    Labels are compared with ``== 1`` / ``== 0``, so ints and floats (e.g.
    ``1.0``) are both accepted; any pair containing another value counts as
    neither correct nor wrong, but still contributes to the accuracy
    denominator.

    Prints the true-positive, false-negative and false-positive counts.

    Parameters:
        classifierLabels: sequence of predicted labels (0 or 1).
        humanLabels: sequence of gold labels (0 or 1), same length.

    Returns:
        A tuple ``(accuracy, precision, recall)``. A metric whose denominator
        is zero (no predictions, no predicted positives, or no actual
        positives) is reported as 0.0 instead of raising ZeroDivisionError.

    Raises:
        ValueError: if the two label sequences differ in length.
    """
    numPredictions = len(classifierLabels)
    if numPredictions != len(humanLabels):
        raise ValueError(
            "classifierLabels and humanLabels must have the same length "
            "(got %d and %d)" % (numPredictions, len(humanLabels))
        )

    truePositives = 0
    trueNegatives = 0
    falsePositives = 0
    falseNegatives = 0

    for predicted, actual in zip(classifierLabels, humanLabels):
        if actual == 1:
            if predicted == 1:
                truePositives += 1
            elif predicted == 0:
                falseNegatives += 1
        elif actual == 0:
            if predicted == 0:
                trueNegatives += 1
            elif predicted == 1:
                falsePositives += 1

    numCorrect = truePositives + trueNegatives

    print("true positives:", truePositives)
    print("false negatives:", falseNegatives)
    print("false positives:", falsePositives)
    print()

    predictedPositives = truePositives + falsePositives
    actualPositives = truePositives + falseNegatives

    # Guard every division: a classifier that never predicts 1, a label set
    # without positives, or an empty input would otherwise crash here.
    accuracy = numCorrect / numPredictions if numPredictions else 0.0
    precision = truePositives / predictedPositives if predictedPositives else 0.0
    recall = truePositives / actualPositives if actualPositives else 0.0

    return (accuracy, precision, recall)


In [28]:
acc,prec,recall = evaluateResults(Ypred, devLabels)
print("accuracy:", acc)
print("precision:", prec)
print("recall:", recall)


true positives: 128
false negatives: 22
false positives: 33

accuracy: 0.8166666666666667
precision: 0.7950310559006211
recall: 0.8533333333333334


## Let's see how the xgboost classifier performs on the test set

In [29]:
testUtterances = df_test['utterance'].tolist()
testLabels = df_test['label'].tolist()
Ytest = np.asarray(testLabels)
print(len(testUtterances))
print(len(Ytest))


184
184


In [30]:
# One vocabulary-presence vector per test utterance.
# (Named-entity counts from countEntities are no longer added as features.)
Xtest = [createOneHotVector(utt, nlp, mostCommonWords) for utt in testUtterances]


In [31]:
Xtest = np.asarray(Xtest)
#print(Xnumpy)
print(Xtest.shape)


(184, 500)


In [32]:
# NOTE(review): Ytest was previously set to the ground-truth test labels
# (np.asarray(testLabels)); from here on it holds the model's predictions instead.
# The gold labels remain available in testLabels.
Ytest = model.predict(Xtest)
print(len(Ytest))
print(Ytest)


184
[1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 1 0 1 0 1 0 0 1 1 1 1 1 0
 1 1 0 1 1 1 1 0 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 1 1 0 0
 0 1 1 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 1 1 1 0 1 1
 1 1 1 0 0 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 0 0 0 1 1 1 0
 1 1 1 0 1 1 1 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 0 1 1]


In [33]:
print(len(testLabels))
print(testLabels)


184
[1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1]


In [34]:
acc,prec,recall = evaluateResults(Ytest, testLabels)
print("accuracy:", acc)
print("precision:", prec)
print("recall:", recall)


true positives: 89
false negatives: 19
false positives: 14

accuracy: 0.8206521739130435
precision: 0.8640776699029126
recall: 0.8240740740740741


## Let's train and test a catboost model on the same feature vectors

In [41]:
from catboost import CatBoostClassifier, Pool
#test_data = catboost_pool = Pool(X_new, y_new)
model2 = CatBoostClassifier(iterations=100, 
                           depth=5, 
                           learning_rate=0.5, 
                           loss_function='Logloss', 
                           logging_level='Verbose')

#train the model
model2.fit(Xnumpy,Ynumpy)


0:	learn: 0.6393364	total: 75.6ms	remaining: 7.48s
1:	learn: 0.6153570	total: 88.2ms	remaining: 4.32s
2:	learn: 0.5871181	total: 98.4ms	remaining: 3.18s
3:	learn: 0.5644229	total: 109ms	remaining: 2.61s
4:	learn: 0.5400576	total: 119ms	remaining: 2.26s
5:	learn: 0.5284592	total: 129ms	remaining: 2.03s
6:	learn: 0.5184721	total: 140ms	remaining: 1.86s
7:	learn: 0.5075910	total: 150ms	remaining: 1.72s
8:	learn: 0.4980112	total: 160ms	remaining: 1.61s
9:	learn: 0.4888311	total: 169ms	remaining: 1.52s
10:	learn: 0.4782434	total: 179ms	remaining: 1.45s
11:	learn: 0.4695108	total: 188ms	remaining: 1.38s
12:	learn: 0.4613087	total: 198ms	remaining: 1.32s
13:	learn: 0.4562630	total: 207ms	remaining: 1.27s
14:	learn: 0.4422027	total: 217ms	remaining: 1.23s
15:	learn: 0.4335709	total: 226ms	remaining: 1.19s
16:	learn: 0.4241614	total: 236ms	remaining: 1.15s
17:	learn: 0.4168694	total: 247ms	remaining: 1.12s
18:	learn: 0.4101756	total: 257ms	remaining: 1.09s
19:	learn: 0.4040308	total: 267ms	rema

<catboost.core.CatBoostClassifier at 0x10a4da828>

### Let's see how catboost does on the dev set

In [42]:
Ypred = model2.predict(Xdev)
print(len(Ypred))
print(Ypred)
    

300
[1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0.
 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1.
 0. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0.
 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1.
 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1.
 0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1.
 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1. 0. 0.
 1. 1. 0. 0. 0. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 1.
 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 0.
 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0.
 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0.
 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0.
 1. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1.]


In [43]:
print(len(devLabels))
print(devLabels)


300
[0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1]


In [44]:
acc,prec,recall = evaluateResults(Ypred, devLabels)
print("accuracy:", acc)
print("precision:", prec)
print("recall:", recall)


true positives: 118
false negatives: 32
false positives: 34

accuracy: 0.78
precision: 0.7763157894736842
recall: 0.7866666666666666


### Let's see how catboost does on the test set

In [45]:
Ytest = model2.predict(Xtest)
print(len(Ytest))
print(Ytest)


184
[1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0.
 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 1. 0.
 0. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0.
 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0.
 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1.
 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0.
 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1.]


In [46]:
print(len(testLabels))
print(testLabels)


184
[1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1]


In [47]:
acc,prec,recall = evaluateResults(Ytest, testLabels)
print("accuracy:", acc)
print("precision:", prec)
print("recall:", recall)


true positives: 83
false negatives: 25
false positives: 19

accuracy: 0.7608695652173914
precision: 0.8137254901960784
recall: 0.7685185185185185


## Let's train and test a Support Vector Machine (SVM) on the same data

In [48]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [49]:
from sklearn import svm

clf = svm.SVC(gamma='scale')
clf.fit(Xnumpy,Ynumpy)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [50]:
Ypred = clf.predict(Xdev)
print(len(Ypred))
print(Ypred)


300
[1 0 0 0 1 0 0 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0 1 1 1 0 0 0 1 1
 1 1 0 0 1 1 1 1 0 1 1 0 1 0 0 0 1 1 1 1 0 0 0 1 1 1 1 0 0 1 1 1 0 1 0 0 0
 1 1 0 1 0 0 0 1 1 0 1 0 1 1 1 0 0 0 1 0 0 1 1 0 1 1 0 0 0 0 0 0 0 1 0 0 1
 1 1 1 1 1 1 1 0 0 0 1 1 1 1 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 1 0
 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 1 1 0 0 0 0 1 0 1 0 1 1 1 1 1 0 1
 0 1 1 0 0 1 1 1 1 0 0 1 0 0 1 0 0 1 0 0 0 1 1 1 1 1 0 0 1 1 0 0 0 0 1 0 0
 1 0 0 1 1 1 1 1 0 0 0 1 1 0 1 0 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 1 0 1 1 1 0
 1 0 1 1 1 0 0 1 1 0 0 0 0 1 1 0 0 1 0 0 1 1 1 1 0 0 1 1 0 1 0 1 0 1 0 0 1
 1 1 1 1]


In [51]:
print(len(devLabels))
print(devLabels)


300
[0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1]


In [52]:
acc,prec,recall = evaluateResults(Ypred, devLabels)
print("accuracy:", acc)
print("precision:", prec)
print("recall:", recall)


true positives: 122
false negatives: 28
false positives: 34

accuracy: 0.7933333333333333
precision: 0.782051282051282
recall: 0.8133333333333334


### Let's see how SVM does on the test set

In [53]:
Ypred = clf.predict(Xtest)
print(len(Ypred))
print(Ypred)


184
[1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 1 0 1 0 1 0 0 1 1 1 1 1 0
 1 1 0 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 0 0 0 1 0 0 1 1 0 1 1 1 0 0
 0 0 1 0 0 1 0 1 1 1 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0 1 1 1 0 1 1
 0 1 1 0 0 1 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 0 1 0 0 0 1 1 1 0 0 1 1 1 0
 1 1 1 0 1 1 1 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 0 0 1 0 1 1]


In [54]:
print(len(testLabels))
print(testLabels)


184
[1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1]


In [55]:
acc,prec,recall = evaluateResults(Ypred, testLabels)
print("accuracy:", acc)
print("precision:", prec)
print("recall:", recall)


true positives: 90
false negatives: 18
false positives: 11

accuracy: 0.842391304347826
precision: 0.8910891089108911
recall: 0.8333333333333334


### On the test set, the SVM classifier does the best of all the bag-of-words classifiers (on the dev set, XGBoost is slightly ahead)

In [54]:
# NOTE(review): precision and recall are hard-coded percentages rather than values
# returned by evaluateResults, and 87.6 / 91.7 do not match any result printed in
# this notebook. This cell also overwrites the `recall` variable assigned by the
# evaluation cells. Confirm where these numbers come from.
precision=87.6
recall=91.7

# Other hard-coded "precision,recall" pairs (percent), presumably from earlier runs:
#84.9,83.3
#86.4,82.4
#86.5,83.3

# F1 is the harmonic mean of precision and recall, on the same percent scale as the inputs.
f1 = (2 * precision * recall)/(precision + recall)
print(f1)

89.60312325711098
