# Preprocessing Data

In [2]:
import pandas as pd
import numpy as np
from bert_embedding import BertEmbedding
from bert_serving.client import BertClient
import pickle
from gensim.models import Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [6]:
# load data to a csv file
def readFile(filename,label):
    filename = 'resource/'+filename+'.txt'
    with open(filename) as f:
        context = f.readlines()
    df = pd.DataFrame(columns=['text','label'])
    for sms in context:
        df=df.append(pd.DataFrame([[sms,label]], columns=['text','label']), ignore_index=True)
    return df

def convertToCSV():
    df1 = readFile('ham',0)
    df2 = readFile('spam',1)
    newDf = df1.append(df2,ignore_index=True)
    newDf.to_pickle('resource/raw_data')
    
    
# Simplest bag of words----------------------------------------------------------------------------------
def bagOfWords(train, test):
    train_data = train.tolist()
    test_data = test.tolist()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(train_data)

    train_matrix = vectorizer.transform(train_data)
    test_matrix = vectorizer.transform(test_data)

    return train_matrix.toarray(), test_matrix.toarray()

# Stupid Doc2Vec-----------------------------------------------------------------------------------------
# convert sentences to vectors
# https://arxiv.org/pdf/1405.4053v2.pdf
def doc2Vec(data,train = True):
    if train:
        tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]
        max_epochs = 100
        vec_size = 200
        alpha = 0.025
        model = Doc2Vec(size=vec_size,
                        alpha=alpha, 
                        min_alpha=0.00025,
                        min_count=1,
                        dm =1)

        model.build_vocab(tagged_data)

        for epoch in range(max_epochs):
        #     print('iteration {0}'.format(epoch))
            model.train(tagged_data,
                        total_examples=model.corpus_count,
                        epochs=model.iter)
            # decrease the learning rate
            model.alpha -= 0.0002
            # fix the learning rate, no decay
            model.min_alpha = model.alpha

        model.save("resource/d2v.model")
        result =[]
        for sent in data:
            test_data = word_tokenize(sent.lower())
            v1 = model.infer_vector(test_data)
            result.append(v1)
        return np.array(result)
    else:
        model= Doc2Vec.load("resource/d2v.model")
        result =[]
        for sent in data:
            test_data = word_tokenize(sent.lower())
            v1 = model.infer_vector(test_data)
            result.append(v1)
        return np.array(result)


# Genius BERT embeddings--------------------------------------------------------------------------------------------

#convert words to vectors
def word2Vec(filename):
    filename ='resource/'+filename+ '.txt'
    with open(filename) as f:
        sentences = f.readlines()
    bert_embedding = BertEmbedding(model='bert_12_768_12', dataset_name='book_corpus_wiki_en_cased')
    result = bert_embedding(sentences)
    return result

#convert sentences to vectors
# https://github.com/hanxiao/bert-as-service
# run bert-serving-start -model_dir /Users/chang/Downloads/cased_L-24_H-1024_A-16 -num_worker=1  in terminal first
def sen2Vec(filename):
    filename ='resource/'+filename+ '.txt'
    with open(filename) as f:
        sentences = f.readlines()
    
    bc = BertClient(check_length=False)
    result = bc.encode(sentences)
    return result

# load and save data-----------------------------------------------------------------------------------------------
def saveAsPickle(data,name):
    with open('resource/'+name+'.pickle','wb') as f:
        pickle.dump(data, f)
def loadAsPickle(name):
    with open('resource/'+name+'.pickle','rb') as f:
        result = pickle.load(f)
    return result

# return array of text for data when bert = 0 else bert matrix
def loadData(bert = 1):
    if bert:
        data = loadAsPickle('sent_vec')
    else:
        with open('resource/'+'all.txt') as f:
            data = f.readlines() 
        data = np.array(data)
    label = pd.read_pickle('resource/raw_data')['label'].to_numpy().astype('int')
    return data,label

In [9]:
# convertToCSV()
matrix = word2Vec('all')
# matrix = sen2Vec('all')
# saveAsPickle(matrix,'sent_vec')

# Run it!!

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [4]:
clfs = [LogisticRegression(), LinearSVC(),KNeighborsClassifier()]

In [8]:
# BERT
data,label = loadData()
X = data
y = label
skf = StratifiedKFold(n_splits=5)
for clf in clfs:
    print(clf)
    for  test_index,train_index in skf.split(data,label):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train,y_train)
        y_pred = clf.predict(X_test)
        print (accuracy_score(y_test,y_pred))
        print (confusion_matrix(y_test,y_pred))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.975
[[20  0]
 [ 1 19]]
0.975
[[19  1]
 [ 0 20]]
0.95
[[20  0]
 [ 2 18]]
0.925
[[19  1]
 [ 2 18]]
0.975
[[20  0]
 [ 1 19]]
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
0.975
[[20  0]
 [ 1 19]]
0.95
[[19  1]
 [ 1 19]]
0.95
[[20  0]
 [ 2 18]]
0.975
[[19  1]
 [ 0 20]]
0.975
[[20  0]
 [ 1 19]]
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
0.925
[[18

In [63]:
#doc2vec
data,label = loadData(0)
X = data
y = label
skf = StratifiedKFold(n_splits = 4)
for clf in clfs:
    print(clf)
    for  train_index,test_index in skf.split(data,label):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        X_train = doc2Vec(X_train,True)
        X_test = doc2Vec(X_test,False)
        clf.fit(X_train,y_train)
        y_pred = clf.predict(X_test)
        print (accuracy_score(y_test,y_pred))
        print (confusion_matrix(y_test,y_pred))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.5384615384615384
[[7 0]
 [6 0]]
0.3076923076923077
[[4 2]
 [7 0]]
0.5
[[6 0]
 [6 0]]
0.5
[[6 0]
 [6 0]]
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
0.5384615384615384
[[7 0]
 [6 0]]
0.46153846153846156
[[0 6]
 [1 6]]
0.5
[[6 0]
 [6 0]]
0.5
[[6 0]
 [6 0]]
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
0.5384615384615384
[[7 0]
 [6 0]]
0.46153846

In [60]:
#BoW TfidfVectorizer
data,label = loadData(0)
X = data
y = label
skf = StratifiedKFold(n_splits = 4)
for clf in clfs:
    print(clf)
    for  test_index,train_index in skf.split(data,label):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        X_train,X_test = bagOfWords(X_train,X_test)
        clf.fit(X_train,y_train)
        y_pred = clf.predict(X_test)
        print (accuracy_score(y_test,y_pred))
        print (confusion_matrix(y_test,y_pred))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.7837837837837838
[[18  0]
 [ 8 11]]
0.7567567567567568
[[10  9]
 [ 0 18]]
0.7631578947368421
[[18  1]
 [ 8 11]]
0.8421052631578947
[[16  3]
 [ 3 16]]
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
0.8918918918918919
[[16  2]
 [ 2 17]]
0.918918918918919
[[17  2]
 [ 1 17]]
0.7631578947368421
[[18  1]
 [ 8 11]]
0.8421052631578947
[[16  3]
 [ 3 16]]
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5,