## Load Data

In [1]:
############## connect to the sql database ################
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

import os
from urllib.parse import quote_plus

# Connection settings for the local Postgres database used by this notebook.
dbname = 'stack_exchange_rnn_db'
q_tbname = 'question_table'
a_tbname = 'answer_table'
username = 'dan-laptop'
# Read the password from the environment so it never appears in the notebook;
# a missing PGRES_PASSWORD raises KeyError right here.
password = os.environ['PGRES_PASSWORD']

# URL-quote the credentials: characters such as '@', ':' or '/' in a password
# would otherwise be parsed as URL delimiters and corrupt the connection URL.
engine = create_engine(
    'postgresql://%s:%s@localhost:5432/%s'
    % (quote_plus(username), quote_plus(password), dbname)
)

## Now access sql db from python
# Pass the settings as keyword arguments instead of formatting them into a
# quoted connection string, so quotes/spaces in the password cannot break it.
con = psycopg2.connect(dbname=dbname, user=username, host='localhost', password=password)
cur = con.cursor() #create cursor for communicating with sql

In [2]:
import pandas as pd
################# make query ########################
# Select the word_vec text and score of answers whose parent question has
# more than 50 views, capped at 10,000 rows.
# NOTE(review): LIMIT is used without ORDER BY, so the database is free to
# return any 10,000 matching rows in any order; the positional slice taken
# later (`[1500:]`) may therefore cover different rows between runs -- confirm
# whether that matters.
sql_query = """
    SELECT answer_table.word_vec, answer_table.score 
    FROM answer_table
    INNER JOIN question_table
        on answer_table.q_id = question_table.q_id
        and question_table.view_count > 50
    LIMIT 10000;
"""
# NOTE(review): despite its name, this frame holds rows selected from
# answer_table (word_vec, score), not questions.
question_df = pd.read_sql_query(sql_query,con)
# Preview the first rows as the cell output.
question_df.head()

Unnamed: 0,word_vec,score
0,"the,error,object,may,read,from,the,network,net...",0
1,"this,is,what,i,did,worked,for,me,when,reading,...",0
2,"possible,duplicate,stackoverflow,com,questions...",0
3,"you,are,looking,for,idxmax,in,1332,x,out,1332,...",2
4,"x,max,x,max,x,max,axis,1,max,index,this,works,...",0


## Load Model

In [3]:
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import sent_tokenize

# Stemmer and word tokenizer used by the model_prep=True branch of
# fetch_and_clean.
p_stemmer = PorterStemmer()
# Raw string: '\w' in a plain string literal is an invalid escape sequence
# (a DeprecationWarning/SyntaxWarning on recent Pythons); the pattern itself
# is unchanged.
tokenizer = RegexpTokenizer(r'\w+')

In [4]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.layers import Dense, Input, GRU, Embedding
from keras.models import Model
import six.moves.cPickle as cPickle

# Architecture settings. NOTE(review): these presumably have to match the
# values used when '../Data_and_Models/stackex_gru.h5' was trained, otherwise
# load_weights below will not line up with the layers -- confirm.
MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 300
word_index_length = 374000

# Frozen embedding layer; its weights come from the saved weight file.
embedding_layer = Embedding(word_index_length + 1,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# Integer id sequence -> embeddings -> GRU -> 2-way softmax.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = GRU(128, dropout_W=0.2, dropout_U=0.2)(embedded_sequences)
preds = Dense(2, activation='softmax')(x)

mymodel = Model(sequence_input, preds)
mymodel.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
mymodel.load_weights('../Data_and_Models/stackex_gru.h5')
print('Gimme that overflow!')

# Open the tokenizer pickle in a context manager so the file handle is closed
# once loading finishes (the original left it open).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('../Data_and_Models/rnn_tokenizer.pkl', 'rb') as tokenizer_file:
    vectorizer = cPickle.load(tokenizer_file, encoding='latin1')

# Alias under which the model is handed to MyRNN below.
lg = mymodel

Using TensorFlow backend.


Gimme that overflow!


## Data Preprocessing

In [5]:
import numpy as np

def fetch_and_clean(text,model_prep=False):
    """Strip markup from ``text``, lowercase it and split it into tokens.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML; parsed with BeautifulSoup/lxml.
    model_prep : bool, default False
        If falsy, the lowercased text is split with Keras'
        ``text_to_word_sequence`` and returned as is.
        If truthy, the text is split with the module-level regex
        ``tokenizer``, purely numeric tokens and English stopwords are
        dropped, and the remaining tokens are Porter-stemmed.

    Returns
    -------
    list of str
        The resulting tokens.
    """
    raw = BeautifulSoup(text, "lxml").get_text().lower()

    # Plain truthiness instead of `== False` / `elif == True`: the original
    # left `tokens` unbound (UnboundLocalError) for any other value.
    if not model_prep:
        return text_to_word_sequence(raw)

    # Build the stopword set once per call; the original re-read the stopword
    # list for every single token.
    english_stopwords = set(stopwords.words('english'))
    return [
        p_stemmer.stem(word)
        for word in tokenizer.tokenize(raw)
        if not word.isdigit() and word not in english_stopwords
    ]

## Build Model Pipeline

In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

class MyRNN(BaseEstimator, TransformerMixin):
    """Adapter that lets the wrapped model be called on raw text strings."""

    def __init__(self, lg):
        # Object whose ``predict`` is called on the padded id sequences.
        self.lg = lg

    def predict(self, a_vect):
        """Tokenize, index and pad each text in ``a_vect``, then predict.

        Parameters
        ----------
        a_vect : iterable of str
            Raw texts to score.

        Returns
        -------
        Whatever ``self.lg.predict`` returns for the padded sequences.
        """
        sequences = []
        for text in a_vect:
            words = fetch_and_clean(text, model_prep=False)
            # The tokenizer receives a list of single words, so it yields one
            # id-list per word; keep the first id of every non-empty list.
            per_word_ids = vectorizer.texts_to_sequences(words)
            sequences.append([ids[0] for ids in per_word_ids if len(ids) > 0])

        padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
        return self.lg.predict(np.array(padded))

In [7]:
# Wrap the loaded model (`lg`) so that predict() accepts raw text strings.
RNN = MyRNN(lg)

In [8]:
# `.ix` is deprecated (and removed in later pandas); the frame has the default
# integer index, so label-based `.loc` selects the same cell.
question_df.loc[0, 'word_vec']

"the,error,object,may,read,from,the,network,network,is,not,seekable,you,can't,go,back,in,the,general,case,you,could,replace,err,with,a,new,httperror,instance,that,reads,from,a,buffer,like,io,bytesio,instead,of,the,network,e,g,not,tested,content,err,read,self,log,exception,content,raise,httperror,err,url,err,code,err,reason,err,headers,io,bytesio,content,though,i'm,not,sure,that,you,should,handle,the,error,in,a,single,place,instead,e,g,reraise,a,more,application,specific,exception,or,leave,the,logging,to,an,upstream,handler"

In [9]:
# Score the first row's text; `.loc` replaces the deprecated `.ix` indexer and
# selects the same cell on the default integer index.
RNN.predict([question_df.loc[0, 'word_vec']])

array([[ 0.67328143,  0.32671857]], dtype=float32)

## Use Lime to see what my model is learning!

In [None]:
#%%time

import pickle

import numpy as np
from lime.lime_text import LimeTextExplainer

class_names = ['unhelpful', 'helpful']
explainer = LimeTextExplainer(class_names=class_names)

RNN = MyRNN(lg)

# Set to True to start from an empty dictionary instead of extending the one
# saved on disk. (The original reset word_dict and then loaded the pickle
# unconditionally, so the flag had no effect.)
new = False
name = 'lime_dict_train'
lime_dict_path = '../Data_and_Models/' + name + '.pkl'

if new:
    word_dict = {}
else:
    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    with open(lime_dict_path, 'rb') as f:
        word_dict = pickle.load(f)

# (index, text) pairs for the rows from position 1500 onwards.
# Series.items() replaces the deprecated iteritems().
row_gen = question_df['word_vec'][1500:].items()

# The loop stops after processing the row with this 0-based count,
# i.e. 601 rows are explained per run (same as the original).
last_row_count = 600

for count, (_, answer_text) in enumerate(row_gen):
    exp = explainer.explain_instance(answer_text, RNN.predict, num_features=6)
    for word, weight in exp.as_list():
        if word in word_dict:
            word_dict[word] = np.append(word_dict[word], weight)
        else:
            # The first sighting is stored as a bare float; the ranking below
            # relies on that to recognise words seen only once.
            word_dict[word] = weight
    if count == last_row_count:
        break

# Persist the accumulated weights so the next run can extend them.
with open(lime_dict_path, 'wb') as f:
    pickle.dump(word_dict, f, pickle.HIGHEST_PROTOCOL)

#check out the results.
num_wanted = 20

# Rank words by |mean weight|. Words seen only once (still a bare float) are
# erratic, so they score 0 and sort last. sorted() is stable, so ties keep
# dictionary order -- the same picks as repeatedly taking max() and popping,
# but word_dict is left intact and need not be re-read from disk afterwards.
top_words = sorted(
    word_dict,
    key=lambda w: 0 if isinstance(word_dict[w], float) else abs(np.mean(word_dict[w])),
    reverse=True,
)[:num_wanted]
imp_word_dict = {word: np.mean(word_dict[word]) for word in top_words}

print('done')

  return _compile(pattern, flags).split(string, maxsplit)
