In [61]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from collections import defaultdict
from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm import tqdm
from sklearn import utils

import re
import spacy
import string
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from gensim.models.word2vec import Word2Vec
from spacy.lang.en import English
import spacy
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
encoding="utf-8"
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/battogtokhb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [60]:
new_train_set = pd.read_csv("new_data/train_all.csv")
old_train_set = pd.read_csv("data/train_all.csv")

new_val_set = pd.read_csv("new_data/val_all.csv")
old_val_set = pd.read_csv("data/val_all.csv")

In [3]:
def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    return text

In [4]:
nlp = spacy.load('en')

punctuations = string.punctuation


stop_words = spacy.lang.en.stop_words.STOP_WORDS

parser = English()

def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [80]:
def get_datasets(type='new'):
    if type == 'new':
        train_data = new_train_set['request_text'].apply(spacy_tokenizer)
        train_target = new_train_set['requester_received_pizza']
        val_data = new_val_set['request_text'].apply(cleanText)
        val_target = new_val_set['requester_received_pizza']
    else:
        train_data = old_train_set['request_text_edit_aware'].apply(cleanText)
        train_target = old_train_set['requester_received_pizza']
        val_data = old_val_set['request_text_edit_aware'].apply(cleanText)
        val_target = old_val_set['requester_received_pizza']
        
    return train_data, train_target, val_data, val_target

In [81]:
train_data, train_target, val_data, val_target = get_datasets()

In [82]:
train_data[0]

['went',
 'clinic',
 'doctor',
 'sick',
 '10',
 'days',
 'pizza',
 'spent',
 '27.77',
 'counter',
 'meds',
 'home',
 'pick',
 'prescription',
 'sudafed',
 'supposed',
 'food',
 'honestly',
 'feel',
 'cooking']

## Doc2Vec

In [62]:
def evaluate_doc2vec(train_data, train_target, val_data, val_target):
    val_corpus = [TaggedDocument(words=spacy_tokenizer(text), tags=[val_target[i]]) for i, text in enumerate(val_data)]    
    train_corpus = [TaggedDocument(words=spacy_tokenizer(text), tags=[train_target[i]]) for i, text in enumerate(train_data)]
    
    model = Doc2Vec(dm=0, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
    model.build_vocab(train_corpus)
    
    model.train(train_corpus,total_examples=len(train_corpus), epochs=30)
    
    
    def vec_for_learning(model, tagged_docs):
        sents = tagged_docs
        targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
        return targets, regressors
    
    
    y_train, X_train = vec_for_learning(model, train_corpus)
    y_val, X_val = vec_for_learning(model, val_corpus)
    
    clf = LogisticRegression(solver='lbfgs', max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    return accuracy_score(y_val, y_pred), roc_auc_score(y_val, y_pred) 
    
    

In [63]:
train_data, train_target, val_data, val_target = get_datasets(type='new')
evaluate_doc2vec(train_data, train_target, val_data, val_target)

  ' that document to Beautiful Soup.' % decoded_markup


(0.5480662983425414, 0.5312531328320802)

In [64]:
train_data, train_target, val_data, val_target = get_datasets(type='old')
evaluate_doc2vec(train_data, train_target, val_data, val_target)

  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


(0.6863100634632819, 0.5275559947299078)

In [19]:
old_train_data, old_train_target, old_val_data, old_val_target = get_datasets(type='old')
new_train_data, new_train_target, new_val_data, new_val_target = get_datasets(type='new')

evaluate_doc2vec(old_train_data, old_train_target, new_val_data, new_val_target)

  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


0.4453038674033149

In [20]:
evaluate_doc2vec(new_train_data, new_train_target, old_val_data, old_val_target)

0.4551223934723481

In [21]:
X = []
documents = []

for entry in train_data:
    tokenized = spacy_tokenizer(entry)
    X.append(word_tokenize(entry))
    
for entry in val_data:
    tokenized = spacy_tokenizer(entry)
    X.append(word_tokenize(entry))


X = np.array(X)

In [47]:
import struct 

glove_small = {}
all_words = set(w for words in X for w in words)
with open("/Users/battogtokhb/Downloads/glove.6B.100d.txt", "rb") as infile:
    for line in infile:
        parts = line.split()
        word = parts[0].decode(encoding)
        if (word in all_words):
            nums=np.array(parts[1:], dtype=np.float32)
            glove_small[word] = nums

In [23]:

print(len(all_words))

14121


In [24]:
model = Word2Vec(X, size=100, window=5, min_count=5, workers=2)
w2v = {w: vec for w, vec in zip(model.wv.index2word, model.wv.syn0)}

  


In [65]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(glove_small))])
        else:
            self.dim=0
            
    def fit(self, X, y):
        return self 

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec] 
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

    
# and a tf-idf version of the same
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(glove_small))])
        else:
            self.dim=0
        
    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf, 
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    
        return self
    
    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [66]:

from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier

In [67]:
lg_glove_small = Pipeline([("glove vectorizer", MeanEmbeddingVectorizer(glove_small)), 
                        ("classifier", LogisticRegression(solver='lbfgs', max_iter=1000))])
lg_glove_small_tfidf = Pipeline([("glove vectorizer", TfidfEmbeddingVectorizer(glove_small)), 
                        ("classifer", LogisticRegression(solver='lbfgs', max_iter=1000))])


lg_w2v = Pipeline([("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)), 
                        ("extra trees", LogisticRegression(solver='lbfgs', max_iter=1000))])
lg_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)), 
                        ("classifier", LogisticRegression(solver='lbfgs', max_iter=1000))])

In [71]:
def evaluate_w2v(train_data, train_target, val_data, val_target):
    lg_w2v.fit(train_data, train_target)
    predicted = lg_w2v.predict(val_data)
    print ("lg_w2v", accuracy_score(val_target,predicted) ,roc_auc_score(val_target,predicted))
    
    
    
    lg_w2v_tfidf.fit(train_data, train_target)
    predicted = lg_w2v_tfidf.predict(val_data)
    print ("lg_w2v_tfidf", accuracy_score(val_target,predicted), roc_auc_score(val_target,predicted))

In [72]:
evaluate_w2v(new_train_data, new_train_target,new_val_data, new_val_target)

lg_w2v 0.5823204419889503 0.5029949874686717
lg_w2v_tfidf 0.5878453038674033 0.509937343358396


In [73]:
evaluate_w2v(old_train_data, old_train_target,old_val_data, old_val_target)

lg_w2v 0.7506799637352675 0.5
lg_w2v_tfidf 0.7506799637352675 0.5012143170838823


In [74]:
def evaluate_glove(train_data, train_target, val_data, val_target):
    lg_glove_small.fit(train_data, train_target)
    predicted = lg_glove_small.predict(val_data)
    print ("lg_glove_small", accuracy_score(val_target,predicted),  roc_auc_score(val_target,predicted))
    
    
    
    lg_glove_small_tfidf.fit(train_data, train_target)
    predicted = lg_glove_small_tfidf.predict(val_data)
    print ("lg_glove_small_tfidf", accuracy_score(val_target,predicted),  roc_auc_score(val_target,predicted))
    

In [75]:
evaluate_glove(new_train_data, new_train_target,new_val_data, new_val_target)

lg_glove_small 0.5779005524861879 0.5002756892230577
lg_glove_small_tfidf 0.5823204419889503 0.5048120300751879


In [76]:
evaluate_glove(old_train_data, old_train_target,old_val_data, old_val_target)

lg_glove_small 0.7515865820489573 0.5018181818181818
lg_glove_small_tfidf 0.7497733454215775 0.5006104523495828
