In [39]:
import logging
import random
import requests
import urllib
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from gensim.models import doc2vec
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_recall_curve

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

airtable_list_url = 'https://api.airtable.com/v0/appasJt3vfOA4OjKU/Posts'
airtable_api_key = 'keyvHKcZudc4sSbpZ'
airtable_headers = {'Authorization': 'Bearer ' + airtable_api_key, 'Content-type': 'application/json'}

In [37]:
# TODO convert this to work with our data set
def read_dataset():
    posts_raw = []
    labels_raw = []
    
    offset = False

    while(True):
        formula = urllib.parse.urlencode({'filterByFormula': '{On Topic}!=""'})
    
        if (offset):
            airtable_list_url = 'https://api.airtable.com/v0/appasJt3vfOA4OjKU/Posts?offset=' + offset + '&' + formula
        else:
            airtable_list_url = 'https://api.airtable.com/v0/appasJt3vfOA4OjKU/Posts?' + formula
    
        response = requests.get(airtable_list_url, headers = {'Authorization': 'Bearer ' + airtable_api_key})
        payload = json.loads(response.content)
        
        if not 'records' in payload:
            print(payload)
            raise RuntimeError()
    
        for post in payload['records']:
            if 'Text' in post['fields'] and len(post['fields']['Text']) > 50:
                posts_raw.append(post['fields']['Text'])
                labels_raw.append(post['fields']['On Topic'])
                
        if 'offset' in payload:
            offset = payload['offset']
        else:
            break
    
    posts_series = pd.Series(posts_raw)
    labels_series = pd.Series(labels_raw)
    
    # TODO consider getting more negative data, since we're non representative
    x_train, x_test, y_train, y_test = train_test_split(posts_series, labels_series, random_state=0, test_size=0.2)
    
    # do the all all data set for initial doc2vec construction
    offset = False
    all_posts_raw = []
    while(True):
        formula = urllib.parse.urlencode({'filterByFormula': '{Text}!=""'})
    
        if (offset):
            airtable_list_url = 'https://api.airtable.com/v0/appasJt3vfOA4OjKU/Posts?offset=' + offset + '&' + formula
        else:
            airtable_list_url = 'https://api.airtable.com/v0/appasJt3vfOA4OjKU/Posts?' + formula
    
        response = requests.get(airtable_list_url, headers = {'Authorization': 'Bearer ' + airtable_api_key})
        payload = json.loads(response.content)
        
        if not 'records' in payload:
            print(payload)
            raise RuntimeError()
    
        for post in payload['records']:
            if 'Text' in post['fields'] and len(post['fields']['Text']) > 50:
                all_posts_raw.append(post['fields']['Text'])
        if 'offset' in payload:
            offset = payload['offset']
        else:
            break
    
    x_train = label_sentences(x_train, 'Train')
    x_test = label_sentences(x_test, 'Test')
    all_data = label_sentences(pd.Series(all_posts_raw), 'All') + x_train + x_test
    
    return x_train, x_test, y_train, y_test, all_data


def get_vectors(doc2vec_model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = doc2vec_model.docvecs[prefix]
    return vectors


def train_doc2vec(corpus):
    logging.info("Building Doc2Vec vocabulary")
    # TODO consider tweaking min_count to be higher
    d2v = doc2vec.Doc2Vec(min_count=1,  # Ignores all words with total frequency lower than this
                          window=10,  # The maximum distance between the current and predicted word within a sentence
                          vector_size=300,  # Dimensionality of the generated feature vectors
                          workers=5,  # Number of worker threads to train the model
                          alpha=0.025,  # The initial learning rate
                          min_alpha=0.00025,  # Learning rate will linearly drop to min_alpha as training progresses
                          dm=1)  # dm defines the training algorithm. If dm=1 means ‘distributed memory’ (PV-DM)
                                 # and dm =0 means ‘distributed bag of words’ (PV-DBOW)
    d2v.build_vocab(corpus)

    logging.info("Training Doc2Vec model")
    # 10 epochs take around 10 minutes on my machine (i7), if you have more time/computational power make it 20
    for epoch in range(10):
        logging.info('Training iteration #{0}'.format(epoch))
        d2v.train(corpus, total_examples=d2v.corpus_count, epochs=d2v.iter)
        # shuffle the corpus
        random.shuffle(corpus)
        # decrease the learning rate
        d2v.alpha -= 0.0002
        # fix the learning rate, no decay
        d2v.min_alpha = d2v.alpha

    logging.info("Saving trained Doc2Vec model")
    d2v.save("d2v.model")
    return d2v


def train_classifier(d2v, training_vectors, training_labels):
    logging.info("Classifier training")
    train_vectors = get_vectors(d2v, len(training_vectors), 300, 'Train')
    # TODO consider trying this with decision tree
    # TODO consider running this sample set through MonkeyLearn
    #model = LogisticRegression()
    model = GradientBoostingClassifier()
    model.fit(train_vectors, np.array(training_labels))
    training_predictions = model.predict(train_vectors)
    logging.info('Training predicted classes: {}'.format(np.unique(training_predictions)))
    logging.info('Training accuracy: {}'.format(accuracy_score(training_labels, training_predictions)))
    print(classification_report(training_labels, training_predictions))
    logging.info('Training F1 score: {}'.format(f1_score(training_labels, training_predictions, average='weighted')))
    return model


def test_classifier(d2v, classifier, testing_vectors, testing_labels):
    logging.info("Classifier testing")
    test_vectors = get_vectors(d2v, len(testing_vectors), 300, 'Test')
    testing_predictions = classifier.predict(test_vectors)
    logging.info('Testing predicted classes: {}'.format(np.unique(testing_predictions)))
    print(classification_report(testing_labels, testing_predictions))
    logging.info('Testing accuracy: {}'.format(accuracy_score(testing_labels, testing_predictions)))
    logging.info('Testing F1 score: {}'.format(f1_score(testing_labels, testing_predictions, average='weighted')))

In [24]:
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the LabeledSentence method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the review.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.TaggedDocument(v.split(), [label]))
    return labeled

In [25]:
x_train, x_test, y_train, y_test, all_data = read_dataset()

In [26]:
d2v_model = train_doc2vec(all_data)

2018-09-04 23:34:45,792 : INFO : Building Doc2Vec vocabulary
2018-09-04 23:34:45,793 : INFO : collecting all words and their counts
2018-09-04 23:34:45,795 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-09-04 23:34:47,591 : INFO : collected 215976 word types and 8586 unique tags from a corpus of 8586 examples and 7020856 words
2018-09-04 23:34:47,591 : INFO : Loading a fresh vocabulary
2018-09-04 23:34:48,392 : INFO : effective_min_count=1 retains 215976 unique words (100% of original 215976, drops 0)
2018-09-04 23:34:48,392 : INFO : effective_min_count=1 leaves 7020856 word corpus (100% of original 7020856, drops 0)
2018-09-04 23:34:49,013 : INFO : deleting the raw counts dictionary of 215976 items
2018-09-04 23:34:49,018 : INFO : sample=0.001 downsamples 37 most-common words
2018-09-04 23:34:49,020 : INFO : downsampling leaves estimated 5718415 word corpus (81.4% of prior 7020856)
2018-09-04 23:34:49,661 : INFO : estimated required memory for 215

2018-09-04 23:35:40,694 : INFO : EPOCH 4 - PROGRESS: at 47.46% examples, 381536 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:35:41,709 : INFO : EPOCH 4 - PROGRESS: at 54.55% examples, 385420 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:35:42,715 : INFO : EPOCH 4 - PROGRESS: at 60.89% examples, 383265 words/s, in_qsize 10, out_qsize 0
2018-09-04 23:35:43,721 : INFO : EPOCH 4 - PROGRESS: at 68.34% examples, 386812 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:35:44,774 : INFO : EPOCH 4 - PROGRESS: at 75.20% examples, 390805 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:35:45,791 : INFO : EPOCH 4 - PROGRESS: at 82.54% examples, 390314 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:35:46,805 : INFO : EPOCH 4 - PROGRESS: at 89.39% examples, 389543 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:35:47,828 : INFO : EPOCH 4 - PROGRESS: at 96.31% examples, 386748 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:35:48,360 : INFO : worker thread finished; awaiting finish of 4 more threads
2018

2018-09-04 23:36:32,230 : INFO : EPOCH - 2 : training on 7020856 raw words (5727434 effective words) took 14.0s, 410102 effective words/s
2018-09-04 23:36:33,239 : INFO : EPOCH 3 - PROGRESS: at 7.37% examples, 435776 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:36:34,251 : INFO : EPOCH 3 - PROGRESS: at 15.36% examples, 436638 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:36:35,266 : INFO : EPOCH 3 - PROGRESS: at 22.86% examples, 433796 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:36:36,278 : INFO : EPOCH 3 - PROGRESS: at 30.62% examples, 438717 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:36:37,290 : INFO : EPOCH 3 - PROGRESS: at 38.82% examples, 437747 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:36:38,322 : INFO : EPOCH 3 - PROGRESS: at 45.11% examples, 423858 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:36:39,326 : INFO : EPOCH 3 - PROGRESS: at 53.55% examples, 430260 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:36:40,333 : INFO : EPOCH 3 - PROGRESS: at 61.53% examples,

2018-09-04 23:37:24,884 : INFO : EPOCH - 1 : training on 7020856 raw words (5726176 effective words) took 13.1s, 436890 effective words/s
2018-09-04 23:37:25,917 : INFO : EPOCH 2 - PROGRESS: at 7.29% examples, 428726 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:37:26,940 : INFO : EPOCH 2 - PROGRESS: at 15.23% examples, 441449 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:37:27,970 : INFO : EPOCH 2 - PROGRESS: at 24.11% examples, 447031 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:37:28,975 : INFO : EPOCH 2 - PROGRESS: at 32.65% examples, 449017 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:37:29,990 : INFO : EPOCH 2 - PROGRESS: at 40.50% examples, 447959 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:37:31,010 : INFO : EPOCH 2 - PROGRESS: at 48.58% examples, 450697 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:37:32,018 : INFO : EPOCH 2 - PROGRESS: at 56.98% examples, 452237 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:37:33,023 : INFO : EPOCH 2 - PROGRESS: at 65.00% examples,

2018-09-04 23:38:16,248 : INFO : training model with 5 workers on 215976 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2018-09-04 23:38:17,280 : INFO : EPOCH 1 - PROGRESS: at 8.08% examples, 458589 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:38:18,292 : INFO : EPOCH 1 - PROGRESS: at 16.21% examples, 460382 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:38:19,320 : INFO : EPOCH 1 - PROGRESS: at 24.67% examples, 462914 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:38:20,346 : INFO : EPOCH 1 - PROGRESS: at 32.68% examples, 465684 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:38:21,355 : INFO : EPOCH 1 - PROGRESS: at 41.26% examples, 468145 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:38:22,360 : INFO : EPOCH 1 - PROGRESS: at 49.35% examples, 465670 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:38:23,362 : INFO : EPOCH 1 - PROGRESS: at 57.86% examples, 464205 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:38:24,362 : INFO : EPOCH 1 - PROGRESS: at 65

2018-09-04 23:39:10,535 : INFO : EPOCH 5 - PROGRESS: at 31.92% examples, 458273 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:39:11,546 : INFO : EPOCH 5 - PROGRESS: at 40.11% examples, 459007 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:39:12,548 : INFO : EPOCH 5 - PROGRESS: at 48.45% examples, 461191 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:39:13,556 : INFO : EPOCH 5 - PROGRESS: at 57.21% examples, 461678 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:39:14,559 : INFO : EPOCH 5 - PROGRESS: at 65.30% examples, 463551 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:39:15,566 : INFO : EPOCH 5 - PROGRESS: at 73.46% examples, 463596 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:39:16,566 : INFO : EPOCH 5 - PROGRESS: at 81.85% examples, 462703 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:39:17,586 : INFO : EPOCH 5 - PROGRESS: at 89.79% examples, 463216 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:39:18,591 : INFO : EPOCH 5 - PROGRESS: at 98.07% examples, 464477 words/s, in_qsiz

2018-09-04 23:40:01,401 : INFO : EPOCH 4 - PROGRESS: at 55.19% examples, 514226 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:40:02,433 : INFO : EPOCH 4 - PROGRESS: at 64.08% examples, 511476 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:40:03,444 : INFO : EPOCH 4 - PROGRESS: at 72.35% examples, 505830 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:40:04,463 : INFO : EPOCH 4 - PROGRESS: at 81.50% examples, 506946 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:40:05,501 : INFO : EPOCH 4 - PROGRESS: at 90.41% examples, 508033 words/s, in_qsize 10, out_qsize 0
2018-09-04 23:40:06,458 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-09-04 23:40:06,468 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-04 23:40:06,473 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-04 23:40:06,499 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-04 23:40:06,500 : INFO : worker thread finished; awaiting fini

2018-09-04 23:40:59,561 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-04 23:40:59,562 : INFO : EPOCH - 2 : training on 7020856 raw words (5727141 effective words) took 14.4s, 399078 effective words/s
2018-09-04 23:41:00,593 : INFO : EPOCH 3 - PROGRESS: at 6.44% examples, 342257 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:41:01,644 : INFO : EPOCH 3 - PROGRESS: at 14.28% examples, 380634 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:41:02,645 : INFO : EPOCH 3 - PROGRESS: at 21.13% examples, 396102 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:41:03,652 : INFO : EPOCH 3 - PROGRESS: at 28.57% examples, 402075 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:41:04,672 : INFO : EPOCH 3 - PROGRESS: at 36.19% examples, 407584 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:41:05,672 : INFO : EPOCH 3 - PROGRESS: at 44.16% examples, 413017 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:41:06,674 : INFO : EPOCH 3 - PROGRESS: at 51.35% examples, 417653 words/s, in_qs

2018-09-04 23:41:52,907 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-04 23:41:52,915 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-04 23:41:52,948 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-04 23:41:52,950 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-04 23:41:52,953 : INFO : EPOCH - 1 : training on 7020856 raw words (5727675 effective words) took 12.9s, 443768 effective words/s
2018-09-04 23:41:53,972 : INFO : EPOCH 2 - PROGRESS: at 7.65% examples, 437109 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:41:54,987 : INFO : EPOCH 2 - PROGRESS: at 16.07% examples, 450138 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:41:56,014 : INFO : EPOCH 2 - PROGRESS: at 23.71% examples, 448809 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:41:57,023 : INFO : EPOCH 2 - PROGRESS: at 32.12% examples, 451154 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:41:58,039 : INFO : EPOCH 2 - PROG

2018-09-04 23:42:43,248 : INFO : training on a 35104280 raw words (28638368 effective words) took 63.2s, 453099 effective words/s
2018-09-04 23:42:43,269 : INFO : Training iteration #7
2018-09-04 23:42:43,270 : INFO : training model with 5 workers on 215976 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2018-09-04 23:42:44,307 : INFO : EPOCH 1 - PROGRESS: at 8.02% examples, 443480 words/s, in_qsize 10, out_qsize 0
2018-09-04 23:42:45,314 : INFO : EPOCH 1 - PROGRESS: at 16.40% examples, 446300 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:42:46,326 : INFO : EPOCH 1 - PROGRESS: at 24.67% examples, 453986 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:42:47,329 : INFO : EPOCH 1 - PROGRESS: at 33.22% examples, 460591 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:42:48,357 : INFO : EPOCH 1 - PROGRESS: at 41.29% examples, 460375 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:42:49,384 : INFO : EPOCH 1 - PROGRESS: at 49.53% examples, 459192 words/s, in_qsize 

2018-09-04 23:43:35,510 : INFO : EPOCH 5 - PROGRESS: at 25.04% examples, 457356 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:43:36,513 : INFO : EPOCH 5 - PROGRESS: at 33.25% examples, 458587 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:43:37,516 : INFO : EPOCH 5 - PROGRESS: at 41.46% examples, 461802 words/s, in_qsize 10, out_qsize 1
2018-09-04 23:43:38,525 : INFO : EPOCH 5 - PROGRESS: at 49.78% examples, 462954 words/s, in_qsize 9, out_qsize 1
2018-09-04 23:43:39,526 : INFO : EPOCH 5 - PROGRESS: at 57.82% examples, 464600 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:43:40,534 : INFO : EPOCH 5 - PROGRESS: at 65.18% examples, 456753 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:43:41,573 : INFO : EPOCH 5 - PROGRESS: at 72.43% examples, 453158 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:43:42,575 : INFO : EPOCH 5 - PROGRESS: at 80.35% examples, 452634 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:43:43,587 : INFO : EPOCH 5 - PROGRESS: at 88.20% examples, 454519 words/s, in_qsi

2018-09-04 23:44:26,859 : INFO : EPOCH 4 - PROGRESS: at 42.02% examples, 474072 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:44:27,869 : INFO : EPOCH 4 - PROGRESS: at 50.50% examples, 475748 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:44:28,870 : INFO : EPOCH 4 - PROGRESS: at 58.47% examples, 476286 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:44:29,885 : INFO : EPOCH 4 - PROGRESS: at 66.92% examples, 477715 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:44:30,886 : INFO : EPOCH 4 - PROGRESS: at 74.94% examples, 480111 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:44:31,894 : INFO : EPOCH 4 - PROGRESS: at 83.62% examples, 479533 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:44:32,923 : INFO : EPOCH 4 - PROGRESS: at 92.42% examples, 479117 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:44:33,690 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-09-04 23:44:33,697 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-04 23:44:33,720 : I

2018-09-04 23:45:18,595 : INFO : EPOCH 3 - PROGRESS: at 42.58% examples, 478892 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:45:19,618 : INFO : EPOCH 3 - PROGRESS: at 50.29% examples, 472731 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:45:20,623 : INFO : EPOCH 3 - PROGRESS: at 58.16% examples, 466949 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:45:21,644 : INFO : EPOCH 3 - PROGRESS: at 66.46% examples, 466986 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:45:22,660 : INFO : EPOCH 3 - PROGRESS: at 75.17% examples, 470751 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:45:23,677 : INFO : EPOCH 3 - PROGRESS: at 84.07% examples, 472639 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:45:24,688 : INFO : EPOCH 3 - PROGRESS: at 92.77% examples, 473279 words/s, in_qsize 9, out_qsize 0
2018-09-04 23:45:25,524 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-09-04 23:45:25,529 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-04 23:45:25,557 : I

In [27]:
# from above:
#2018-09-04 22:27:22,334 : INFO : Saving trained Doc2Vec model
#2018-09-04 22:27:22,336 : INFO : saving Doc2Vec object under d2v.model, separately None
#2018-09-04 22:27:22,337 : INFO : storing np array 'syn1neg' to d2v.model.trainables.syn1neg.npy
#2018-09-04 22:27:22,642 : INFO : storing np array 'vectors' to d2v.model.wv.vectors.npy
#2018-09-04 22:27:23,597 : INFO : saved d2v.model
# TODO figure out how to juad load d2v.model in future versions

classifier = train_classifier(d2v_model, x_train, y_train)

2018-09-04 23:47:35,563 : INFO : Classifier training
2018-09-04 23:47:40,393 : INFO : Training predicted classes: ['No' 'Yes']
2018-09-04 23:47:40,397 : INFO : Training accuracy: 0.9915062287655719
2018-09-04 23:47:40,437 : INFO : Training F1 score: 0.9915163728060931


             precision    recall  f1-score   support

         No       0.98      1.00      0.99       714
        Yes       1.00      0.99      0.99      1052

avg / total       0.99      0.99      0.99      1766



In [28]:
# TODO figure out how to optimize recall of certain label
test_classifier(d2v_model, classifier, x_test, y_test)

2018-09-04 23:47:40,448 : INFO : Classifier testing
2018-09-04 23:47:40,463 : INFO : Testing predicted classes: ['No' 'Yes']
2018-09-04 23:47:40,491 : INFO : Testing accuracy: 0.9027149321266968
2018-09-04 23:47:40,499 : INFO : Testing F1 score: 0.9021579851139084


             precision    recall  f1-score   support

         No       0.91      0.85      0.88       186
        Yes       0.90      0.94      0.92       256

avg / total       0.90      0.90      0.90       442



In [32]:
#output = classifier.predict('This movies really boring. I did not like that the acting was so dry.'.split())

#output

vector1 = d2v_model.infer_vector('We launched last week. We are proud to invest in this company. We are announcing are investment in this company. You can listen in the podcast.'.split())
vector2 = d2v_model.infer_vector("For early stage companies hiring is difficult. To raise a Series A, you need to talk to the right investors. Founders should have this.".split())

output = classifier.predict_proba([vector1, vector2])
#output = classifier.predict([vector1, vector2])
output

# no is higher in first vector, so we want Yes from second element as final score


array([[0.68576883, 0.31423117],
       [0.68649043, 0.31350957]])

In [40]:
formula = urllib.parse.urlencode({'filterByFormula': 'AND({On Topic}="",{Topic Score}="",{Text}!="")'})
airtable_list_url = 'https://api.airtable.com/v0/appasJt3vfOA4OjKU/Posts?' + formula

response = requests.get(airtable_list_url, headers = {'Authorization': 'Bearer ' + airtable_api_key})
payload = json.loads(response.content)

for post in payload['records']:
    proba_output = classifier.predict_proba([d2v_model.infer_vector(post['fields']['Text'])])
    predicted_label = classifier.predict([d2v_model.infer_vector(post['fields']['Text'])])
    
    data = {'fields': {
        'Topic Score': proba_output[0][1],
        'On Topic Prediction': predicted_label[0]
    }}
    
    if True or predicted_label[0] == 'No':
    
        response = requests.patch('https://api.airtable.com/v0/appasJt3vfOA4OjKU/Posts' + '/' + post['id'], headers = airtable_headers, data = json.dumps(data))
    
        if response.status_code != 200:
            print('hit error saving data')
            print(response.content)
            print(data)
            raise RuntimeError(response.content)