# Analyzing session descriptions
- Preprocess descriptions: convert to lowercase and remove punctuation, extra whitespace, and numbers
- Train model to vectorize original session descriptions
- Find similarities between my session notes and original descriptions
- Train neural network with similarity scores and notes
- Use neural network model to write summaries for sessions

In [62]:
# basic packages, data wrangling
import string
import re
import sys
import pandas as pd
import numpy as np

# sklearn, machine learning
from sklearn.cross_validation import train_test_split

# skipthoughts seq2seq vectorizer
sys.path.append('C:/Users/ffarmer/AppData/Local/Continuum/Anaconda2/Lib/skip-thoughts')
import skipthoughts

# gensim modules, word vectorization
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

# Keras, neural networks
from keras.models import Model
from keras.layers.recurrent import LSTM
from keras.layers.core import TimeDistributedDense

In [3]:
# load the Strata session data from its pickle file
# NOTE: unpickling can execute arbitrary code, so only load files you trust
sessions_pkl = 'data_train/strata_sessions.pkl'
sessions = pd.read_pickle(sessions_pkl)

# preview the first rows to confirm the load worked
sessions.head()

Unnamed: 0,link,topic,descriptions
0,/strata/hadoop-big-data-ny/public/schedule/det...,Parallel SQL and analytics with Solr,Analytics has increasingly become a major focu...
1,/strata/hadoop-big-data-ny/public/schedule/det...,JupyterLab: The evolution of the Jupyter Notebook,Project Jupyter provides building blocks for i...
2,/strata/hadoop-big-data-ny/public/schedule/det...,Designing a location intelligence platform for...,CartoDB has enabled hundreds of thousands of u...
3,/strata/hadoop-big-data-ny/public/schedule/det...,The future of column-oriented data processing ...,"In pursuit of speed and efficiency, big data p..."
4,/strata/hadoop-big-data-ny/public/schedule/det...,Beyond Hadoop at Yahoo: Interactive analytics ...,Yahoo initially built Hadoop as an answer to a...


## Pre-processing

In [14]:
# Clean every session description the same way the notes are cleaned before
# inference: strip punctuation, lowercase, drop digits, collapse whitespace.
# Collapsing on \s+ (not just spaces) also removes embedded newlines, which
# keeps each document on a single line for the line-per-document text files
# that the doc2vec reader consumes.

# the translation table is loop-invariant, so build it once
# (two-argument str.translate is the Python 2 API used throughout this notebook)
identity_table = string.maketrans("", "")

# initialize list
documents = []

# pre-processing
for doc in sessions['descriptions']:
    # remove punctuation
    no_punc = doc.translate(identity_table, string.punctuation)

    # convert to lower
    low = no_punc.lower()

    # remove numbers
    no_num = ''.join([ch for ch in low if not ch.isdigit()])

    # collapse runs of whitespace (spaces, tabs, newlines) and trim the ends
    clean = re.sub(r'\s+', ' ', no_num).strip()

    # add to list
    documents.append(clean)

# add to dataframe
sessions['documents'] = documents

# confirm results
sessions.head()

Unnamed: 0,link,topic,descriptions,documents
0,/strata/hadoop-big-data-ny/public/schedule/det...,Parallel SQL and analytics with Solr,Analytics has increasingly become a major focu...,analytics has increasingly become a major focu...
1,/strata/hadoop-big-data-ny/public/schedule/det...,JupyterLab: The evolution of the Jupyter Notebook,Project Jupyter provides building blocks for i...,project jupyter provides building blocks for i...
2,/strata/hadoop-big-data-ny/public/schedule/det...,Designing a location intelligence platform for...,CartoDB has enabled hundreds of thousands of u...,cartodb has enabled hundreds of thousands of u...
3,/strata/hadoop-big-data-ny/public/schedule/det...,The future of column-oriented data processing ...,"In pursuit of speed and efficiency, big data p...",in pursuit of speed and efficiency big data pr...
4,/strata/hadoop-big-data-ny/public/schedule/det...,Beyond Hadoop at Yahoo: Interactive analytics ...,Yahoo initially built Hadoop as an answer to a...,yahoo initially built hadoop as an answer to a...


In [32]:
# split into train and test sets (30% held out for test)
train, test = train_test_split(documents, test_size=0.3)

# Write one document per line. The files go under data_train/ because that is
# the location the doc2vec `sources` mapping reads them from; writing them to
# the working directory left the reader pointing at files that were never
# created by this notebook.
with open('data_train/docs_train.txt', 'w') as f:
    f.writelines(d + '\n' for d in train)
with open('data_train/docs_test.txt', 'w') as f:
    f.writelines(d + '\n' for d in test)

## Train doc2vec vectorizer

In [25]:
# custom label line sentence class to read in multiple document files
# append prefix to labels for document titles
# source: http://linanqiu.github.io/2015/10/07/word2vec-sentiment/
class LabeledLineSentence(object):
    """Read line-per-document text files as labeled sentences for doc2vec.

    ``sources`` maps a file path to a label prefix. Line ``n`` (0-based) of a
    file becomes one ``LabeledSentence`` whose words are the whitespace-split
    line and whose single label is ``'<prefix>_<n>'``.

    Raises:
        ValueError: if two sources share the same prefix, since their
            document labels would collide. (``ValueError`` is a subclass of
            ``Exception``, so existing ``except Exception`` handlers still
            catch it.)
    """

    def __init__(self, sources):
        self.sources = sources
        # filled in by to_array(); None means "not loaded yet"
        self.sentences = None

        # make sure that prefixes (the dict values) are unique
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one LabeledSentence per line of every source file."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        """Load all sentences into ``self.sentences`` and return that list."""
        # reuse __iter__ so the streaming and in-memory views cannot drift apart
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Shuffle ``self.sentences`` in place and return it.

        Loads the sentences first if ``to_array()`` has not been called yet.
        """
        # local import: the notebook never imports shuffle at the top level
        from random import shuffle

        if self.sentences is None:
            self.to_array()
        shuffle(self.sentences)
        return self.sentences

In [33]:
# map each line-per-document file to the prefix used in its document labels
sources = {
    'data_train/docs_train.txt': 'train',
    'data_train/docs_test.txt': 'test',
}

# wrap the files in the labeled-sentence reader
data = LabeledLineSentence(sources)

In [40]:
# doc2vec settings, collected in one place
d2v_params = dict(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=8)

# create the (untrained) model
model = Doc2Vec(**d2v_params)

# load every labeled document into memory and build the vocabulary from it
labeled_docs = data.to_array()
model.build_vocab(labeled_docs)

In [41]:
# train model
for epoch in range(20):
    model.train(data.sentences_perm())

print 'done training'

done training


In [42]:
# persist the trained model so later sessions can skip training
model_path = './strata.d2v'
model.save(model_path)

# reload it from disk
model = Doc2Vec.load(model_path)

In [53]:
# sanity check: list the words the model considers closest to a probe word
query_word = 'government'
model.most_similar(query_word)

[(u'sponsored', 0.5786522030830383),
 (u'telecommunication', 0.5548757314682007),
 (u'trusted', 0.5534477829933167),
 (u'law', 0.492511123418808),
 (u'hottest', 0.48926758766174316),
 (u'similarities', 0.4680299460887909),
 (u'units', 0.45595583319664),
 (u'comprehensive', 0.45538529753685),
 (u'refining', 0.4534202218055725),
 (u'reached', 0.4323597252368927)]

## Compare notes to original summaries

In [72]:
# read my session notes and flatten them onto a single line
notes_path = 'data_strata/fastforwardlabs_unstrtxtsummary.md'
with open(notes_path, 'r') as myfile:
    notes_text = myfile.read()
new_doc_raw = notes_text.replace('\n', ' ')

In [73]:
# strip punctuation and lowercase in one pass
# (two-argument str.translate is the Python 2 API)
identity_table = string.maketrans("", "")
stripped = new_doc_raw.translate(identity_table, string.punctuation).lower()

# drop every digit character
without_digits = ''.join(ch for ch in stripped if not ch.isdigit())

# squeeze runs of spaces down to a single space
new_doc = re.sub(' +', ' ', without_digits)

In [74]:
new_doc_vec = model.infer_vector(new_doc)

best = model.docvecs.most_similar([new_doc_vec])
print best

[('train_105', 0.3945472538471222), ('train_79', 0.26070916652679443), ('train_34', 0.253447026014328), ('train_29', 0.23216590285301208), ('train_134', 0.22700904309749603), ('test_35', 0.20641231536865234), ('train_120', 0.2033594399690628), ('train_113', 0.200046569108963), ('train_43', 0.19725462794303894), ('train_4', 0.1964893341064453)]


## Train neural network

In [None]:
# NOTE(review): this cell is an unexecuted sketch --
# function_that_loads_all_the_training_data and max_sentences are not defined
# anywhere in the visible notebook and must be supplied before it can run.

# Sequential provides the .add() layer-stacking API used below; the functional
# Model class imported at the top of the notebook does not
from keras.models import Sequential

(articles, scores), (articles_test, scores_test) = function_that_loads_all_the_training_data()

# NOTE(review): some skip-thoughts versions require a loaded model as the
# first argument to encode() -- confirm against the installed version
articles_vectors = skipthoughts.encode(articles)
articles_vectors_test = skipthoughts.encode(articles_test)

# NOTE: this rebinds `model`, replacing the doc2vec model used earlier
model = Sequential()
model.add(LSTM(512, input_shape=(max_sentences, 4800), dropout_W=0.3, dropout_U=0.3))
# NOTE(review): TimeDistributedDense expects a sequence input, but this LSTM
# does not set return_sequences=True -- either set it (one score per sentence)
# or use a plain Dense(1) (one score per article); confirm the shape of scores
model.add(TimeDistributedDense(1))
# a metric must be requested here, otherwise evaluate() returns only the loss
# and the two-value unpacking below fails
# NOTE(review): accuracy may not be meaningful for a regression target
model.compile(loss='mean_absolute_error', optimizer='rmsprop', metrics=['accuracy'])
model.fit(articles_vectors, scores, validation_split=0.10)

loss, acc = model.evaluate(articles_vectors_test, scores_test)
print('Test loss / test accuracy = {} / {}'.format(loss, acc))

## Text summarization