In [25]:
# imports
import warnings
warnings.filterwarnings('ignore')

# reading data
import sys
import os
import pandas as pd
import numpy as np

# preprocessing
sys.path.append(os.path.abspath('../scripts'))
import preprocess


# vectorizing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import word2vec
from nltk import word_tokenize
from scipy import sparse
import gensim

# baseline models
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# models
from sklearn.metrics import accuracy_score
import fasttext

from flair.hyperparameter.param_selection import TextClassifierParamSelector, OptimizationValue
from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import StackedEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.embeddings import BertEmbeddings
# from flair.embeddings import ELMoEmbeddings
from flair.embeddings import FlairEmbeddings



ModuleNotFoundError: No module named 'gensim'

In [2]:
# Location of the request dataset (a tab-separated file, read with sep='\t' further down).
request_path = "../data/Request_data.tsv"

In [3]:
# Location of the email-intent CSV that read_data() loads below.
path = "../data/full_email_intent.csv"

In [4]:
# read in data

def read_data(path, **read_csv_kwargs):
    """Load a delimited text file into a DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the file to read.
    **read_csv_kwargs
        Optional keyword arguments forwarded unchanged to ``pandas.read_csv``
        (for example ``sep`` for tab-separated files). When none are given
        the call is identical to ``pd.read_csv(path)``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    return pd.read_csv(path, **read_csv_kwargs)

In [9]:
# Load the email-intent CSV. Note: `data` is reassigned to the request dataset further down.
data = read_data(path)

In [10]:
# Preview the first rows of the email-intent frame.
data.head()

Unnamed: 0,label,text
0,1,Let's share some research and see if there is ...
1,1,"Did anything get set up? I'm booked at 200, bu..."
2,0,Permitting and Air Quality Issues Developers o...
3,0,Jus call me Nessa ♌ @finesse__ah
4,0,Also how fast does this machine get to.


In [7]:
# Load the request dataset; the file is tab-separated, hence the explicit separator.
request_data = pd.read_csv(request_path, sep='\t')

In [8]:
# Preview the request dataset as loaded (columns "Sentence" and "Request").
request_data.head()

Unnamed: 0,Sentence,Request
0,I just had to share this with you.,0
1,I hope you haven't seen it already.,0
2,Thought you'd enjoy this.,0
3,This is a handy guide that should be as common...,0
4,Look at you,0


In [11]:
# Rename the two columns positionally to the "text"/"label" names the rest of the notebook uses.
# The assignment relies on the column order (sentence first, label second).
request_data.columns = ["text", "label"]

In [12]:
# Confirm the rename: the frame should now show "text" and "label" columns.
request_data.head()

Unnamed: 0,text,label
0,I just had to share this with you.,0
1,I hope you haven't seen it already.,0
2,Thought you'd enjoy this.,0
3,This is a handy guide that should be as common...,0
4,Look at you,0


In [13]:
# Continue with the request dataset from here on. Work on a copy so the later
# in-place edits of `data` (text cleaning, label prefixing) do not silently
# change `request_data` as well.
data = request_data.copy()

In [18]:
# preprocessing

def preprocess_text(text):
    """Clean every entry of an iterable of sentences.

    Each ``str`` entry is passed through ``preprocess.clean`` and then
    ``preprocess.clean_info``. Any entry that is not exactly of type ``str``
    (for example the NaN seen in the text column) is replaced by an empty
    string, so the output always has one item per input item.

    Parameters
    ----------
    text : iterable
        The sentences to clean.

    Returns
    -------
    list of str
        Cleaned sentences, in input order.
    """
    def _clean_entry(entry):
        # Anything that is not a plain string becomes an empty placeholder.
        if type(entry) != str:
            return ""
        # First the general text clean-up, then the info clean-up on its result.
        return preprocess.clean_info(preprocess.clean(entry))

    return [_clean_entry(entry) for entry in text]

In [19]:
# Inspect the raw text column; the displayed output shows a NaN in the last row.
data["text"]

0                      I just had to share this with you.
1                     I hope you haven't seen it already.
2                               Thought you'd enjoy this.
3       This is a handy guide that should be as common...
4                                             Look at you
                              ...                        
8628    If your request is urgent please send me an em...
8629    Could you please make every effort to get with...
8630    We will try and answer these tomorrow, so plea...
8631    Kara/Heidi, please give this to Brian and Meli...
8632                                                  NaN
Name: text, Length: 8633, dtype: object

In [20]:
# Overwrite the text column with its cleaned version; non-string entries become "".
# This mutates `data` in place.
data["text"] = preprocess_text(data["text"])

In [21]:
# Preview the frame after text cleaning.
data.head()

Unnamed: 0,text,label
0,I just had to share this with you.,0
1,I hope you haven't seen it already.,0
2,Thought you'd enjoy this.,0
3,This is a handy guide that should be as common...,0
4,Look at you,0


In [26]:
def vectorizer(text_data, word2vec_path='models/GoogleNews-vectors-negative300.bin.gz', random_state=None):
    """Shuffle, split and vectorize a text dataset in three different ways.

    The rows are shuffled and split 80% / 10% / 10% into train, dev and test.
    Each split is then encoded as bag-of-words counts, TF-IDF (uni- and
    bigrams) and averaged word2vec vectors. Both sklearn vectorizers are
    fitted on the training split only.

    Parameters
    ----------
    text_data : pandas.DataFrame
        Frame with a string column named ``"text"``.
    word2vec_path : str, optional
        Path of the binary word2vec file to load. Defaults to the path that
        was previously hard-coded.
    random_state : int or None, optional
        Seed for the shuffle and for the random vectors assigned to words
        missing from the word2vec vocabulary. ``None`` (the default) keeps
        the previous behaviour of drawing from numpy's global random state.

    Returns
    -------
    tuple of (dict, dict)
        ``features[kind][split]`` with ``kind`` in ``"bow"``, ``"tf"``,
        ``"w2v"`` and ``split`` in ``"train"``, ``"dev"``, ``"test"``, each a
        sparse matrix; and ``{"df": {split: DataFrame}}`` holding the split
        frames (which gain a ``"tokens"`` column during word2vec encoding).
    """
    # shuffle df and split into train, dev, and test
    shuffled_data = text_data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    # Size the splits from the frame that was passed in, not from a notebook global.
    n_rows = len(shuffled_data)
    train_end = int(n_rows * 0.8)
    dev_end = int(n_rows * 0.9)
    # Copy each slice: a "tokens" column is added below, and writing to a
    # slice view of `shuffled_data` would be a chained assignment.
    train = shuffled_data.iloc[:train_end].copy()
    dev = shuffled_data.iloc[train_end:dev_end].copy()
    test = shuffled_data.iloc[dev_end:].copy()

    # BOW
    print("Getting BOW vectorization.")
    bow_vectorizer = CountVectorizer(stop_words="english")
    train_bow_features = bow_vectorizer.fit_transform(train["text"])
    dev_bow_features = bow_vectorizer.transform(dev["text"])
    test_bow_features = bow_vectorizer.transform(test["text"])

    #TFIDF
    print("Getting TFIDF vectorization.")
    tf_vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
    train_tf_features = tf_vectorizer.fit_transform(train["text"])
    dev_tf_features = tf_vectorizer.transform(dev["text"])
    test_tf_features = tf_vectorizer.transform(test["text"])

    # word2vec
    print("Getting word2vec vectorization.")
    # Named `w2v_model` so it does not shadow the imported `word2vec` module.
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    # Source of random vectors for out-of-vocabulary words; the global numpy
    # state is used unless an explicit seed was requested.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
        """Average the k-dimensional vectors of the tokens; zeros for no tokens."""
        if len(tokens_list) < 1:
            return np.zeros(k)
        if generate_missing:
            # Unknown words get a random vector.
            vectorized = [vector[word] if word in vector else rng.rand(k) for word in tokens_list]
        else:
            # Unknown words contribute a zero vector.
            vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
        length = len(vectorized)
        summed = np.sum(vectorized, axis=0)
        averaged = np.divide(summed, length)
        return averaged

    def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
        """Tokenize the "text" column (stored as "tokens") and embed each row."""
        clean_comments["tokens"] = clean_comments["text"].apply(lambda y: word_tokenize(y))
        embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, generate_missing=generate_missing))
        return list(embeddings)

    train_w2v_features = sparse.csr_matrix(get_word2vec_embeddings(w2v_model, train, generate_missing=True))
    dev_w2v_features = sparse.csr_matrix(get_word2vec_embeddings(w2v_model, dev, generate_missing=True))
    test_w2v_features = sparse.csr_matrix(get_word2vec_embeddings(w2v_model, test, generate_missing=True))

    # return train, dev, and test df with all 3 vectorizations
    return ({"bow": {"train": train_bow_features, "dev": dev_bow_features, "test": test_bow_features},
             "tf": {"train": train_tf_features, "dev": dev_tf_features, "test": test_tf_features},
             "w2v": {"train": train_w2v_features, "dev": dev_w2v_features, "test": test_w2v_features}},
            {"df": {"train": train, "dev": dev, "test": test}})

In [27]:
# Build the train/dev/test splits and their BOW, TF-IDF and word2vec feature matrices.
vectors, df = vectorizer(data)

Getting BOW vectorization.
Getting TFIDF vectorization.
Getting word2vec vectorization.


In [44]:
def test_baseline_model(model_fn, model_name, vectors_dict, df_dict, params={}):
    print("Testing model " + model_name + ".")
    print()
    # build model
    model = model_fn
    
    # find best hyperparameters for model using dev set
    for vect in ["bow", "tf", "w2v"]:
        print("Vectorization: " + vect)
        train = vectors_dict[vect]["train"].toarray()
        train_labels = df_dict["df"]["train"]["label"]
        dev = vectors_dict[vect]["dev"].toarray()
        dev_labels = df_dict["df"]["dev"]["label"]
        if params != {}:
            # try different hyperparameters
            grid = GridSearchCV(model, params, cv=5)
            grid.fit(train, train_labels)
            print("Best cross-validation score: {:.2f}".format(grid.best_score_))
            print(grid.best_estimator_)
#             print("Params: " + grid.best_params_)
        else:
            model.fit(train, train_labels)
            acc = model.score(dev, dev_labels)
            print("Model accuracy: " + str(acc))
    
    # return model_name, best hyperparameters, dev score, and test score
    # try best set of hyperparams on test set
    return

In [45]:
# Gaussian Naive Bayes baseline with its default settings (no parameter grid, so no grid search).
test_baseline_model(GaussianNB(), "nb", vectors, df)

Testing model nb.

Vectorization: bow
Model accuracy: 0.6129779837775203
Vectorization: tf
Model accuracy: 0.7381228273464658
Vectorization: w2v
Model accuracy: 0.6500579374275782


In [46]:
# Logistic regression baseline: grid search over the regularisation strength C
# and the stopping tolerance tol.
test_baseline_model(LogisticRegression(), "logreg", vectors, df, 
                    params = {"C": [0.001, 0.01, 0.1, 1, 10, 100], 
                                  "tol": [0.01, 0.001, 0.0001]})

Testing model logreg.

Vectorization: bow
Best cross-validation score: 0.87
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.01, verbose=0,
                   warm_start=False)
Vectorization: tf
Best cross-validation score: 0.87
LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.01, verbose=0,
                   warm_start=False)
Vectorization: w2v
Best cross-validation score: 0.91
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_clas

In [47]:
# Linear SVM baseline: same C / tol grid as the logistic regression run.
test_baseline_model(LinearSVC(), "linearSVC", vectors, df,
                   params = {"C": [0.001, 0.01, 0.1, 1, 10, 100], 
                                  "tol": [0.01, 0.001, 0.0001]})

Testing model linearSVC.

Vectorization: bow
Best cross-validation score: 0.86
LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.01,
          verbose=0)
Vectorization: tf
Best cross-validation score: 0.87
LinearSVC(C=100, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
          verbose=0)
Vectorization: w2v
Best cross-validation score: 0.91
LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.01,
          verbose=0)


In [None]:
# test each model for best hyperparameters

    # flair

    # fasttext

    # cnn

# return df with model_name, best hyperparameters, dev score, and test score

In [49]:
# Preview `data` before the labels are rewritten for the file export below.
data.head()

Unnamed: 0,text,label
0,I just had to share this with you.,0
1,I hope you haven't seen it already.,0
2,Thought you'd enjoy this.,0
3,This is a handy guide that should be as common...,0
4,Look at you,0


In [52]:
# Same preview as the previous cell (duplicate display of `data`).
data.head()

Unnamed: 0,text,label
0,I just had to share this with you.,0
1,I hope you haven't seen it already.,0
2,Thought you'd enjoy this.,0
3,This is a handy guide that should be as common...,0
4,Look at you,0


In [54]:
# Prefix every label with "__label__" for the files written below.
# Labels that already carry the prefix are left alone, so re-running this cell
# does not produce "__label____label__0".
data["label"] = data["label"].apply(
    lambda x: str(x) if str(x).startswith("__label__") else "__label__" + str(x)
)

In [60]:
# Shuffle the rows (unseeded, so the split differs on every run) and write three
# header-less, tab-separated files with the columns in frame order: text, then label.
# NOTE: in this cell rows 80-90% go to test.csv and the final 10% to dev.csv.
data = data.sample(frac=1).reset_index(drop=True)
data.iloc[0:int(len(data)*0.8)].to_csv('../data/practice_model/train.csv', sep='\t', index = False, header = None)
data.iloc[int(len(data)*0.8):int(len(data)*0.9)].to_csv('../data/practice_model/test.csv', sep='\t', index = False, header = False)
data.iloc[int(len(data)*0.9):].to_csv('../data/practice_model/dev.csv', sep='\t', index = False, header = False);

In [61]:
# Preview the shuffled frame with the "__label__"-prefixed labels.
data.head()

Unnamed: 0,text,label
0,"But most of the alternative energy producers, ...",__label__0
1,Please review the network access specified abo...,__label__1
2,Can you change this and let me know.,__label__1
3,3.,__label__0
4,If you would like to go through the LOPSA line...,__label__1


In [91]:
# fastText (automatic hyperparameter optimization with autotuneValidationFile - 5 min)

# Train on train.csv, tuning against dev.csv, then load test.csv for evaluation.
ft_model = fasttext.train_supervised("../data/practice_model/train.csv", epoch=10, loss='hs', autotuneValidationFile="../data/practice_model/dev.csv")
test_data = pd.read_csv("../data/practice_model/test.csv", sep='\t', header=None)
test_data.columns = ["text", "label"]

# Evaluation
# Rows whose text is not a string get "" as the prediction, which can never
# match a label and therefore counts as an error.
ft_y_pred = test_data["text"].apply(lambda x: ft_model.predict(x)[0][0] if type(x) == str else "")
ft_acc = accuracy_score(test_data["label"], ft_y_pred)

print("Accuracy on the test dataset with FastText: {:.2f}".format(ft_acc))

Accuracy on the test dataset with FastText: 0.97


In [96]:
# save fastText model (written to the notebook's working directory)
ft_model.save_model("best_ft_model.bin")

# load fastText model
# ft_model = fasttext.load_model("best_ft_model.bin")

In [248]:
# flair
# helpful doc - https://towardsdatascience.com/how-to-beat-automl-hyperparameter-optimisation-with-flair-3b2f5092d9f5

# this is the folder in which train, test and dev files reside
data_folder = '../data/practice_model'

# column format indicating which columns hold the text and label(s)
# The split files written earlier in this notebook have the text in column 0
# and the "__label__"-prefixed label in column 1, so the map follows that order.
column_name_map = {0: "text", 1: "label_topic"}

# load corpus containing training, test and dev data
# The files have no header row and are tab-separated.
corpus: Corpus = CSVClassificationCorpus(data_folder,
                                         column_name_map,
                                         skip_header=False,
                                         delimiter='\t',
)

2020-06-06 13:34:57,795 Reading data from ../data/practice_model
2020-06-06 13:34:57,796 Train: ../data/practice_model/train.csv
2020-06-06 13:34:57,797 Dev: ../data/practice_model/dev.csv
2020-06-06 13:34:57,798 Test: ../data/practice_model/test.csv


In [255]:
# stacked embeddings 

# Instantiate each embedding once; these constructors presumably load or download
# pretrained weights, so this cell can be slow -- TODO confirm.
glove_embedding = WordEmbeddings('glove')
character_embeddings = CharacterEmbeddings()
flair_forward  = FlairEmbeddings('news-forward-fast')
flair_backward = FlairEmbeddings('news-backward-fast')
bert_embedding = BertEmbeddings()
# elmo_embedding = ELMoEmbeddings()

# A list holding one candidate stack: the search space below receives this as its
# `options`, so there is a single embeddings choice made of all five embeddings.
word_embeddings = [[glove_embedding, character_embeddings, flair_forward, flair_backward, bert_embedding]]

In [None]:
# search space

# Hyperparameters explored by hyperopt: discrete choices for embeddings, hidden
# size, RNN layers, learning rate and batch size; dropout is sampled uniformly
# from [0.0, 0.5].
search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=word_embeddings)
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128, 256, 512])
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32, 64])
# Single-label text classifier with an LSTM document embedding; each trial trains
# once for up to 10 epochs and is ranked by its dev score.
param_selector = TextClassifierParamSelector(
    corpus=corpus, 
    multi_label=False, 
    base_path='resources/results', 
    document_embedding_type='lstm',
    max_epochs=10, 
    training_runs=1,
    optimization_value=OptimizationValue.DEV_SCORE
)
# Up to 100 trials, each a full training run -- expect this cell to be very long-running.
param_selector.optimize(search_space, max_evals=100)

2020-06-06 13:40:45,815 Computing label dictionary. Progress:


100%|██████████| 4184/4184 [00:00<00:00, 4829.12it/s]

2020-06-06 13:40:46,791 [b'__label__No', b'__label__Yes']
  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]2020-06-06 13:40:46,812 ----------------------------------------------------------------------------------------------------
2020-06-06 13:40:46,812 Evaluation run: 1
2020-06-06 13:40:46,813 Evaluating parameter combination:
2020-06-06 13:40:46,814 	dropout: 0.33431104526998534
2020-06-06 13:40:46,817 	embeddings: /Users/nataliewang/.flair/embeddings/glove.gensim,Char,/Users/nataliewang/.flair/embeddings/lm-news-english-forward-1024-v0.2rc.pt,/Users/nataliewang/.flair/embeddings/lm-news-english-backward-1024-v0.2rc.pt,BertEmbeddings(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (

2020-06-06 13:40:46,820 	hidden_size: 64
2020-06-06 13:40:46,821 	learning_rate: 0.2
2020-06-06 13:40:46,822 	mini_batch_size: 32
2020-06-06 13:40:46,822 	rnn_layers: 1
2020-06-06 13:40:46,823 ----------------------------------------------------------------------------------------------------





2020-06-06 13:40:47,801 ----------------------------------------------------------------------------------------------------
2020-06-06 13:40:47,801 Training run: 1
2020-06-06 13:40:48,131 ----------------------------------------------------------------------------------------------------
2020-06-06 13:40:48,134 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): CharacterEmbeddings(
        (char_embedding): Embedding(275, 25)
        (char_rnn): LSTM(25, 25, bidirectional=True)
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_3): FlairEmbeddings(
        (lm): LanguageModel(
          (dro

2020-06-06 13:40:48,136 ----------------------------------------------------------------------------------------------------
2020-06-06 13:40:48,136 Corpus: "Corpus: 3719 train + 465 dev + 465 test sentences"
2020-06-06 13:40:48,138 ----------------------------------------------------------------------------------------------------
2020-06-06 13:40:48,139 Parameters:
2020-06-06 13:40:48,140  - learning_rate: "0.2"
2020-06-06 13:40:48,141  - mini_batch_size: "32"
2020-06-06 13:40:48,142  - patience: "3"
2020-06-06 13:40:48,143  - anneal_factor: "0.5"
2020-06-06 13:40:48,144  - max_epochs: "10"
2020-06-06 13:40:48,145  - shuffle: "True"
2020-06-06 13:40:48,146  - train_with_dev: "False"
2020-06-06 13:40:48,147  - batch_growth_annealing: "False"
2020-06-06 13:40:48,147 ----------------------------------------------------------------------------------------------------
2020-06-06 13:40:48,148 Model training base path: "resources/results"
2020-06-06 13:40:48,149 ----------------------------

In [None]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    """Build, compile and summarise a multi-width 1-D convolutional classifier.

    The network embeds an integer token sequence with a frozen embedding
    matrix, runs five parallel convolutions (kernel widths 2 to 6, 200 filters
    each) with global max pooling, concatenates the pooled features and feeds
    them through dropout, a 128-unit ReLU layer, dropout again and a sigmoid
    output layer.

    NOTE(review): Embedding, Input, Conv1D, GlobalMaxPooling1D, concatenate,
    Dropout, Dense and Model are not imported in the visible import cell --
    presumably Keras; confirm and add the import before running.

    Parameters
    ----------
    embeddings : array-like
        Initial weights for the embedding layer -- presumably shaped
        (num_words, embedding_dim); verify against the caller.
    max_sequence_length : int
        Length of every input sequence.
    num_words : int
        Vocabulary size of the embedding layer.
    embedding_dim : int
        Width of each embedding vector.
    labels_index : int
        Number of output units.

    Returns
    -------
    The compiled model (its summary is printed as a side effect).
    """
    frozen_embedding = Embedding(
        num_words,
        embedding_dim,
        weights=[embeddings],
        input_length=max_sequence_length,
        trainable=False,
    )

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = frozen_embedding(sequence_input)

    def _pooled_conv(kernel_width):
        # One convolution branch: convolve, then keep each filter's maximum.
        feature_map = Conv1D(filters=200, kernel_size=kernel_width, activation='relu')(embedded_sequences)
        return GlobalMaxPooling1D()(feature_map)

    pooled_branches = [_pooled_conv(kernel_width) for kernel_width in (2, 3, 4, 5, 6)]
    merged = concatenate(pooled_branches, axis=1)

    hidden = Dropout(0.1)(merged)
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.2)(hidden)
    preds = Dense(labels_index, activation='sigmoid')(hidden)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()
    return model

In [None]:
# cnn
# NOTE(review): this cell is still a draft. The defects that could be fixed
# unambiguously are fixed below; the remaining open points are marked inline.

# relabel label cols: encode the binary label as two indicator columns
for df_name in df["df"].keys():
    # Iterate over the label values (iterating the frame itself yields column names).
    split_labels = df["df"][df_name]["label"]
    # Compare on the string form so both the integer 0 and the string "0" match.
    df["df"][df_name]["pos"] = [0 if str(label) == "0" else 1 for label in split_labels]
    df["df"][df_name]["neg"] = [1 if str(label) == "0" else 0 for label in split_labels]

# define the grid search parameters
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
# max_seq_lens = [25, 50, 75, 100]
# embed_dims = [100, 200, 300, 400, 500]


for vect in ["bow", "tf", "w2v"]:
    print("Vectorization: " + vect)

    MAX_SEQUENCE_LENGTH = 50
    EMBEDDING_DIM = 300
    # Number of feature columns; sparse matrices do not support len() on a row.
    train_word_index = vectors[vect]["train"].shape[1]

    # pad sequences
    # Sparse matrices have no .tolist(); densify first.
    training_sequences = vectors[vect]["train"].toarray().tolist()
    dev_sequences = vectors[vect]["dev"].toarray().tolist()

    # NOTE(review): `pad_sequences` is not imported in the visible import cell.
    # NOTE(review): padding/truncating feature vectors to 50 entries treats them
    # as token-id sequences, which they are not -- confirm the intended CNN input.
    train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    dev_cnn_data = pad_sequences(dev_sequences, maxlen=MAX_SEQUENCE_LENGTH)

    train_labels = df["df"]["train"][["pos", "neg"]].values
    dev_labels = df["df"]["dev"][["pos", "neg"]].values

    # Every grid entry must be a list of candidate values, including the fixed one.
    param_grid = dict(batch_size=batch_size, epochs=epochs, embedding_dim=[EMBEDDING_DIM])

    # NOTE(review): `model` is not defined anywhere in the visible notebook; it
    # presumably should be a scikit-learn compatible wrapper around ConvNet.
    grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5)
    grid_result = grid.fit(train_cnn_data, train_labels)

    # TODO: collect model_name, best hyperparameters, dev score, and test score
    # TODO: try best set of hyperparams on test set

In [None]:
def test_all():
    # preprocess
    
    # different vectorizations
    
    # test models
    
        # naive bayes

        # linear SVM

        # logistic regression

        # flair

        # ulmfit

        # fasttext

        # cnn
        
    # return df of overview of models, hyperparameters, and accuracies
    