## Setup

In [1]:
import os, sys, re, json, time, unittest, datetime, shutil
import itertools, collections
from importlib import reload
from IPython.display import display, HTML

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

**Note:** The data below is only on the 10k datasets for now. This will be updated to leverage the full datasets. 

# Loading Data

In [2]:
users_df = pd.read_json("../dataset/user_10k.json", lines=True)


In [3]:
users_df.head()

Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,...,cool,elite,fans,friends,funny,name,review_count,useful,user_id,yelping_since
0,4.67,0,0,0,0,0,0,0,0,1,...,0,[],0,"[cvVMmlU1ouS3I5fhutaryQ, nj6UZ8tdGo8YJ9lUMTVWN...",0,Johnny,8,0,oMy_rEb0UBEmMlu-zcxnoQ,2014-11-03
1,3.7,0,0,0,0,0,0,0,0,0,...,0,[],0,"[0njfJmB-7n84DlIgUByCNw, rFn3Xe3RqHxRSxWOU19Gp...",0,Chris,10,0,JJ-aSuM4pCFPdkfoZ34q0Q,2013-09-24
2,2.0,0,0,0,0,0,0,0,0,0,...,0,[],0,[],0,Tiffy,1,0,uUzsFQn_6cXDh6rPNGbIFA,2017-03-02
3,4.67,0,0,0,0,0,0,0,0,0,...,0,[],0,[],0,Mark,6,0,mBneaEEH5EMyxaVyqS-72A,2015-03-13
4,4.67,0,0,0,0,0,0,0,0,0,...,0,[],0,[],0,Evelyn,3,0,W5mJGs-dcDWRGEhAzUYtoA,2016-09-08


In [4]:
# All types of reviews - 10K dataset
# reviews_df = pd.read_json("../dataset/review_10k.json", lines=True)

# Just restaurant reviews - 10K dataset
reviews_df = pd.read_json("../dataset/restaurant_reviews_10k.json", lines=True)

# All types of reviews
# reviews_df = pd.read_json("../../../final_project/full_dataset/review.json", lines=True)

In [5]:
reviews_df.head()

Unnamed: 0,business_id,cool,date,funny,stars,text,useful,user_id
0,--6MefnULPED_I942VcFNA,0,2017-08-17,0,4,This is one of my top 3 places to get BBQ pork...,2,FEg8v92qx3kK4Hu4TF28Fg
1,--6MefnULPED_I942VcFNA,0,2017-05-31,0,3,This restaurant is famous for their BBQ dishes...,0,HPtjvIrhzAUkKsiVkeT4MA
2,--6MefnULPED_I942VcFNA,0,2016-10-23,0,2,Roasted pork is one of my favorite things... A...,1,MpvqV7lQcl15rflTBEUhXA
3,--6MefnULPED_I942VcFNA,0,2017-07-30,0,2,I walked by the restaurant more than 5 years a...,1,x-Gbs8sVid3yhJIoHD6Gfw
4,--6MefnULPED_I942VcFNA,0,2017-02-07,1,2,I came here to order a roast duck over rice to...,0,7Dykd1HolQx8mKPYhYDYSg


In [6]:
reviews_df.shape

(10000, 8)

In [7]:
business_df = pd.read_json("../dataset/business_10k.json", lines=True)

In [8]:
business_df.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,"4855 E Warner Rd, Ste B9","{'AcceptsInsurance': True, 'ByAppointmentOnly'...",FYWN1wneV18bWNgQjJ2GNg,"[Dentists, General Dentistry, Health & Medical...",Ahwatukee,"{'Friday': '7:30-17:00', 'Tuesday': '7:30-17:0...",1,33.33069,-111.978599,Dental by Design,,85044,22,4.0,AZ
1,3101 Washington Rd,"{'BusinessParking': {'garage': False, 'street'...",He-G7vWjzVUysIKrfNbPUQ,"[Hair Stylists, Hair Salons, Men's Hair Salons...",McMurray,"{'Monday': '9:00-20:00', 'Tuesday': '9:00-20:0...",1,40.291685,-80.1049,Stephen Szabo Salon,,15317,11,3.0,PA
2,"6025 N 27th Ave, Ste 1",{},KQPW8lFf1y5BT2MxiSZ3QA,"[Departments of Motor Vehicles, Public Service...",Phoenix,{},1,33.524903,-112.11531,Western Motor Vehicle,,85017,18,1.5,AZ
3,"5000 Arizona Mills Cr, Ste 435","{'BusinessAcceptsCreditCards': True, 'Restaura...",8DShNS-LuFqpEWIp0HxijA,"[Sporting Goods, Shopping]",Tempe,"{'Monday': '10:00-21:00', 'Tuesday': '10:00-21...",0,33.383147,-111.964725,Sports Authority,,85282,9,3.0,AZ
4,581 Howe Ave,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",PfOCPjBrlQAnz__NXj9h_w,"[American (New), Nightlife, Bars, Sandwiches, ...",Cuyahoga Falls,"{'Monday': '11:00-1:00', 'Tuesday': '11:00-1:0...",1,41.119535,-81.47569,Brick House Tavern + Tap,,44221,116,3.5,OH


In [9]:
checkin_df = pd.read_json("../dataset/checkin_10k.json", lines=True)

In [10]:
checkin_df.head()

Unnamed: 0,business_id,time
0,7KPBkxAOEtb3QeIL9PEErg,"{'Thursday': {'21:00': 4, '1:00': 1, '4:00': 1..."
1,kREVIrSBbtqBhIYkTccQUg,"{'Monday': {'13:00': 1}, 'Thursday': {'20:00':..."
2,tJRDll5yqpZwehenzE2cSg,"{'Monday': {'12:00': 1, '1:00': 1}, 'Friday': ..."
3,r1p7RAMzCV_6NPF0dNoR3g,"{'Thursday': {'23:00': 1}, 'Saturday': {'21:00..."
4,mDdqgfrvROGAumcQdZ3HIg,"{'Monday': {'12:00': 1, '21:00': 1}, 'Wednesda..."


In [11]:
photos_df = pd.read_json("../dataset/photos_10k.json", lines=True)

In [12]:
photos_df.head()

Unnamed: 0,business_id,caption,label,photo_id
0,OnAzbTDn79W6CFZIriqLrA,,inside,soK1szeyan202jnsGhUDmA
1,OnAzbTDn79W6CFZIriqLrA,,inside,dU7AyRB_fHOZkflodEyN5A
2,OnAzbTDn79W6CFZIriqLrA,,outside,6T1qlbBdKkXA1cDNqMjg2g
3,OnAzbTDn79W6CFZIriqLrA,Bakery area,inside,lHhMNhCA7rAZmi-MMfF3ZA
4,XaeCGHZzsMwvFcHYq3q9sA,,food,oHSCeyoK9oLIGaCZq-wRJw


In [13]:
tip_df = pd.read_json("../dataset/tip_10k.json", lines=True)

In [14]:
tip_df.head()

Unnamed: 0,business_id,date,likes,text,user_id
0,tJRDll5yqpZwehenzE2cSg,2012-07-15,0,Get here early enough to have dinner.,zcTZk7OG8ovAmh_fenH21g
1,jH19V2I9fIslnNhDzPmdkA,2015-08-12,0,Great breakfast large portions and friendly wa...,ZcLKXikTHYOnYt5VYRO5sg
2,dAa0hB2yrnHzVmsCkN4YvQ,2014-06-20,0,Nice place. Great staff. A fixture in the tow...,oaYhjqBbh18ZhU0bpyzSuw
3,dAa0hB2yrnHzVmsCkN4YvQ,2016-10-12,0,Happy hour 5-7 Monday - Friday,ulQ8Nyj7jCUR8M83SUMoRQ
4,ESzO3Av0b1_TzKOiqzbQYQ,2017-01-28,0,"Parking is a premium, keep circling, you will ...",ulQ8Nyj7jCUR8M83SUMoRQ


# Baseline Model

In [15]:
n_features = 100000

text = reviews_df["text"]

print("Fitting Count Vectorizer")
# vectorizer = CountVectorizer(max_df=0.95, min_df=2,
#                                 max_features=n_features,
#                                 stop_words='english')
# word_vector = vectorizer.fit_transform(text)

# No setting of hyper-parameters
vectorizer = CountVectorizer()
word_vector = vectorizer.fit_transform(text)

print(np.shape(word_vector))

Fitting Count Vectorizer
(10000, 24872)


In [16]:
#Print example text, stars, and embeddings

print(reviews_df["text"][102])
print(reviews_df["stars"][102])
print(word_vector[102])

At ces trade show and looking for lunch. I show up at 2:03 and the host jokingly says we are closed. We laughed. But he meant it. Last year my burger ordered medium came out almost raw. I am never going back
1
  (0, 17650)	1
  (0, 3376)	1
  (0, 13684)	1
  (0, 12582)	1
  (0, 4549)	1
  (0, 19037)	1
  (0, 11962)	1
  (0, 22483)	1
  (0, 3953)	1
  (0, 10897)	1
  (0, 13729)	1
  (0, 24528)	1
  (0, 12)	1
  (0, 12556)	1
  (0, 13164)	1
  (0, 15363)	1
  (0, 13056)	1
  (0, 19747)	2
  (0, 1101)	1
  (0, 10472)	1
  (0, 1133)	1
  (0, 3582)	1
  (0, 15453)	1
  (0, 14751)	1
  (0, 2016)	1
  (0, 1762)	2
  (0, 9793)	1
  (0, 23190)	1
  (0, 23929)	2
  (0, 1555)	1
  (0, 1239)	2
  (0, 8885)	1
  (0, 3440)	1
  (0, 22022)	1
  (0, 11748)	1
  (0, 14510)	1


## NB Training and Scoring

In [17]:
# x_train_user_reviews = reviews_df["text"][0:6000]
# x_dev_user_reviews = reviews_df["text"][6001:8000]
# x_test_user_reviews = reviews_df["text"][8001:10000]


# x_train_user_reviews = word_vector[0:6000]
# x_dev_user_reviews = word_vector[6001:8000]
x_train_user_reviews = word_vector[0:8000]
x_test_user_reviews = word_vector[8001:10000]

# print("x_train_user_reviews", x_train_user_reviews)
# print("shape x_train_user_reviews", np.shape(x_train_user_reviews))



# y_train_user_stars = reviews_df["stars"][0:6000]
# y_dev_user_stars = reviews_df["stars"][6001:8000]
y_train_user_stars = reviews_df["stars"][0:8000]
y_test_user_stars = reviews_df["stars"][8001:10000]




train_file = x_train_user_reviews
label_file = y_train_user_stars
training_data = x_train_user_reviews






In [18]:
# -> 133             self.config.input_dim = self.training_data.shape[2]
#     134             self.config.step_size = self.training_data.shape[1]
#     135             self.config.label_dim = self.training_label.shape[1]

# # print(training_data.shape[2])
# print(training_data.shape[1])
# print(np.shape(training_data))
# print(len(training_data))
# print("hi")



In [19]:
nb = MultinomialNB()

nb.fit(x_train_user_reviews, y_train_user_stars)

y_pred = nb.predict(x_test_user_reviews)

acc = accuracy_score(y_pred, y_test_user_stars)
print("Accuracy on test set: {:.02%}".format(acc))
# pred_proba = nb.predict_proba(y_pred)
# log_loss_metric = log_loss(y_test_user_stars, pred_proba)
# print("Log-loss on test set: {:.02%}".format(log_loss_metric))

Accuracy on test set: 55.93%


In [20]:
# Print example prediction

print(y_pred[400])

4


In [21]:
# !pip glove

In [22]:
# # Smallest GloVe file
# gloveFile = "../../glove/glove.6B.50d.txt"

# # Smaller GloVe file
# gloveFile = "../../glove/glove.6B.300d.txt"

# # Primary GloVe file
# # gloveFile = "../../glove/glove.42B.300d.txt"

# import numpy as np
# def loadGloveModel(gloveFile):
#     print("Loading Glove Model")
#     f = open(gloveFile,'r')
#     model = {}
#     for line in f:
#         splitLine = line.split()
#         word = splitLine[0]
#         embedding = np.array([float(val) for val in splitLine[1:]])
#         model[word] = embedding
#     print("Done.",len(model)," words loaded!")
#     return model

# loadGloveModel(gloveFile)

In [23]:
# # # Smallest GloVe file
# gloveFile = "../../glove/glove.6B.50d.txt"

# # # Smaller GloVe file
# # gloveFile = "../../glove/glove.6B.300d.txt"

# # # Primary GloVe file
# # gloveFile = "../../glove/glove.42B.300d.txt"





# # def load_glove(self, glove_folder):
# # def load_glove(self):
# def load_glove():
# #     self.console('Loading GloVe embeddings...')
#     print('Loading GloVe embeddings...')
#     glove = {}
#     count = 0
# #     with open(os.path.join(glove_folder, 'glove.6B.' + str(self._parms['embedding_dim']) + 'd.txt'), 'r') as f:
#     with open(gloveFile, 'r') as f:
#         while True:
#             line = f.readline()
#             if not line:
#                 break
#             line = line.split(' ')
#             word = line[0]
#             vector = np.asarray(line[1:], dtype='float32')
#             glove[word] = vector
# #     self._glove = glove
# #     self.console('%d embeddings loaded.' % len(self._glove))

#     print('%d embeddings loaded.' % len(glove))
    
#     return glove

In [24]:
# start_time = datetime.datetime.now()
# print("Start time: ", start_time)

# test_glove = load_glove()

# end_time = datetime.datetime.now()
# print("End time: ", end_time)

# time_taken = end_time - start_time
# print("Time taken: ", time_taken)

In [25]:
# print(type(test_glove))

### RNN with Attention (new)

In [26]:
# import tensorflow as tf
# import os

# # from tensorflow.models.rnn import rnn, rnn_cell

# from tensorflow.python.ops import rnn, rnn_cell

# rnn_cell = tf.nn.rnn_cell


# #rnn= tf.nn.rnn
# rnn= tf.nn.dynamic_rnn


# # train_file = x_train_user_reviews
# # label_file = y_train_user_stars



# import time
# import numpy as np
# import csv
# import random
# import collections
# # import util
# from random import shuffle
# # from util import xavier_weight_init
# import sys

# class Config(object):
#       """Holds model hyperparams and data information.
#       The config class is used to store various hyperparameters and dataset
#       information parameters. Model objects are passed a Config() object at
#       instantiation.
#       """
#       batch_size =32
#       batches_per_epoch =  15
#       step_size= 128 # number of words in a review
#       input_dim= 128 # this is the word vector size
#       hidden_dim = 100 # number of nerons per hidden layer
#       label_dim = 5 # we have a total of classes (like or not like)
#       max_epochs = 500
#       early_stopping = 3
#       dropout =1
#       learning_rate = 0.001
#       forget_bias = 1.0
#       #model = 'RNN' #'BiRNN'
#       model = 'BiRNN'
#       cell_type = 'LSTM'
#       #cell_type = 'GRU'
#       stack = 1
#       use_peepholes = False
#       cell_clip = 1.0
        
        
        
        
# #       train_file = ""
# #       label_file = ""
        
#       train_file = x_train_user_reviews
#       label_file = y_train_user_stars
        
        
        
        
#       run_type = "regression"
#       multi_learn = False
        
        
        
#       train_data_dir = "Data/train"
#       val_data_dir = "Data/val"
        
        
#       attention=True
    
    
    
#       test_data_dir = "Data/test"
        
        
        
#       # train_num_reviews = 1
#       val_num_reviews = 1
#       marker_list = []
#       cur_marker = 0
#       epoch_per_val=4
#       init='norm'
#       weight_dir='default'
#       grad_clip_threshold=5
#       residual=False



# class Models(object):

# #     def read_markers(self, data_dir):
# #         for f in os.listdir(data_dir):
# #             if f[0:8] == 'compress':
# #                 self.config.marker_list.append(f)
# #     def read_train_file(self, data_dir):
#     def read_train_file(self):
#             '''
#             Read the data and label file.
#             assumed file name conventions:
#                 -file starts with x indicates data file, starts with y indicates label file
#                 -file name x_(# words in a review)_(size of the word vector)_(#of reviews in the file)_(corresponding label marker).data
#                 -file name y_(type of label)_(bucket or regression)_(# words in a review)_(size of the word vector)_(#of reviews in the file)_(corresponding label marker).data
#             input: data file directory
#             output:
#                 it outputs a 3 hyper-dimensional structrue as data and a 2 hyper-dimensional structrue as label:
#                 data : [number of reviews [number of words in the review x dimension of word vector]]
#                 label: [number of reviews, [one hot vector if classification, number if regression]]
#             '''

# #             loaded=np.load(os.path.join(data_dir, self.config.marker_list[int(self.config.cur_marker)]))
            
            
            
            
            
            
# #             self.training_data = loaded['training_data']
# #             self.training_label = loaded['training_label']
            
            
#             self.training_data = train_file

#             self.training_label = label_file
            
            
#             self.config.input_dim = self.training_data.shape[2]
#             self.config.step_size = self.training_data.shape[1]
#             self.config.label_dim = self.training_label.shape[1]
#             return

#     def read_val_file(self, data_dir):

#         loaded=np.load(os.path.join(data_dir, 'compress_val.npz'))
#         self.val_data = loaded['training_data']
#         self.val_label = loaded['training_label']

#         return


#     def print_model_params(self):
#         print('*'*99)
#         print( 'Run Type:', str(self.config.run_type))
#         print( 'Model:', self.config.model)

#         print( 'Cell type:', self.config.cell_type)
#         print( 'Hidden Units:', str(self.config.hidden_dim))

#         print( "\n")
#         print( 'Learning rate:', str(self.config.learning_rate))
#         print( 'init:', str(self.config.init))
#         print( 'Dropout:', str(self.config.dropout))
#         print( 'graident threshold', str(self.config.grad_clip_threshold))
#         print( "\n")

#         print( 'attention:', str(self.config.attention))
#         print( 'residual:', str(self.config.residual))
#         print( 'Stack:', str(self.config.stack))
#         print( 'step size:', self.config.step_size)
#         print( 'input dim:', self.config.input_dim)
#         print( 'batch isze', self.config.batch_size)
#         print( "\n")
        
        
        
#         print( 'review per training file', self.config.train_num_reviews)
        
        
        
        
#         print( 'marker list', self.config.marker_list)
#         print( 'Forget Bias:', str(self.config.forget_bias))
#         print( 'Peephole:', str(self.config.use_peepholes))
#         print( '*'*99)

#     def init_variables(self):
#             '''
#             initialize model parameters, note LSTM and BiRNN requires twice the hidden dimenssion due their design
#             '''
#             weight_size=self.config.hidden_dim
#             if self.config.model=='BiRNN':
#                 weight_size_out=2*weight_size
#                 attention_weight = 2*self.config.hidden_dim
#             else:
#                 weight_size_out=weight_size
#                 attention_weight = self.config.hidden_dim

#             if self.config.attention:
#                 weight_size_out = self.config.step_size
#             elif self.config.model!='BiRNN':
#                 weight_size_out = weight_size

#             xavier_initializer = xavier_weight_init()
#             # Define weights and bias
#             with tf.variable_scope(str('test')):
#                 if self.config.init=='norm':
#                       weights_hidden = tf.Variable(tf.random_normal([self.config.input_dim, weight_size])) # Hidden layer weights
#                       weights_out = tf.Variable(tf.random_normal([weight_size_out, self.config.label_dim]))
#                       biases_hidden = tf.Variable(tf.random_normal([weight_size]))
#                       biases_out = tf.Variable(tf.random_normal([self.config.label_dim]))
#                       wegiths_attention=tf.Variable(tf.random_normal([attention_weight]))
#                 elif self.config.init=='xaiver':
#                       weights_hidden = tf.Variable(xavier_initializer((self.config.input_dim, weight_size)))
#                       weights_out = tf.Variable(xavier_initializer((weight_size_out, self.config.label_dim)))
#                       biases_hidden =tf.Variable(xavier_initializer((weight_size,)))
#                       biases_out = tf.Variable(xavier_initializer((self.config.label_dim,)))
#                       wegiths_attention =tf.Variable(xavier_initializer((attention_weight,)))

#                 self.weights = {
#                     'hidden': weights_hidden,
#                     'out1': weights_out
#                 }
#                 self.biases = {
#                    'hidden': biases_hidden,
#                     'out1': biases_out
#                 }
#                 for i in range(self.config.step_size):
#                     self.weights[i]=wegiths_attention#tf.Variable(tf.random_normal([weight_size_out]))
#                     self.biases[i]=tf.Variable(tf.random_normal([self.config.batch_size]))

#     def BiRNN(self, scope):
#             '''
#             bidirection rnn model
#             Note: bidirectional model is most useful when tacking RNNs, in single stack case it just averaging two outputs
#             input: information needed to construct a model. F_bias is only relevant when cell type is LSTM
#             output:
#                 linear combination of the rnn results and output weights
#             '''
#             # input shape: (batch_size, step_size, input_dim)
#             # we need to permute step_size and batch_size(change the position of step and batch size)
#             data = tf.transpose(self.input_data, [1, 0, 2])

#             # Reshape to prepare input to hidden activation
#             # (step_size*batch_size, n_input), flattens the batch and step
#             #after the above transformation, data is now (step_size*batch_size, input_dim)
#             data = tf.reshape(data, [-1, self.config.input_dim])

#             # Define lstm cells with tensorflow
#             with tf.variable_scope(str(scope)):
#                   # Linear activation
#                   data = tf.matmul(data, self.weights['hidden']) + self.biases['hidden']
#                   data = tf.nn.dropout(data, self.config.dropout)
#                   # Define a cell
#                   if self.config.cell_type == 'GRU':
#                       lstm_fw_cell = rnn_cell.GRUCell(self.config.hidden_dim)
#                       lstm_bw_cell = rnn_cell.GRUCell(self.config.hidden_dim)
#                   else:
#                       lstm_fw_cell = rnn_cell.LSTMCell(self.config.hidden_dim, forget_bias=self.config.forget_bias,
#                                                        use_peepholes=self.config.use_peepholes, cell_clip=self.config.cell_clip, state_is_tuple=True)
#                       lstm_bw_cell = rnn_cell.LSTMCell(self.config.hidden_dim, forget_bias=self.config.forget_bias,
#                                                        use_peepholes=self.config.use_peepholes, cell_clip=self.config.cell_clip, state_is_tuple=True)

#                   self.init_state_bw = lstm_bw_cell.zero_state(self.config.batch_size, dtype=tf.float32)
#                   self.init_state_fw = lstm_fw_cell.zero_state(self.config.batch_size, dtype=tf.float32)

#                   # Split data because rnn cell needs a list of inputs for the RNN inner loop
#                   data = tf.split(0, self.config.step_size, data) # step_size * (batch_size, hidden_dim)

#                   if self.config.stack == 2:
#                       print('running stack 2.......')
#                       outputs1, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, data,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN1")
#                       outputs, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs1,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN2")
#                   elif self.config.stack == 3:
#                       print('running stack 3.......')
#                       outputs1, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, data,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN1")
#                       outputs2, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs1,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN2")

#                       outputs, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs2,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN3")
#                   elif self.config.stack == 4:
#                       print('running stack 4.......')
#                       outputs1, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, data,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN1")
#                       outputs2, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs1,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN2")
#                       outputs3, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs2,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN3")
#                       outputs, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs3,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN4")
#                   elif self.config.stack == 5:
#                       print('running stack 5.......')
#                       outputs1, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, data,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN1")
#                       outputs2, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs1,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN2")
#                       outputs3, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs2,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN3")
#                       outputs4, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs3,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN4")
#                       outputs, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs4,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN5")
#                   elif self.config.stack == 6:
#                       print('running stack 6.......')
#                       outputs1, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, data,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN1")
#                       outputs2, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs1,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN2")
#                       outputs3, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs2,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN3")
#                       outputs4, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs3,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN4")
#                       outputs5, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs4,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN5")
#                       outputs, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs5,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN6")
#                   elif self.config.stack == 7:
#                       print('running stack 7.......')
#                       outputs1, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, data,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN1")
#                       outputs2, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs1,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN2")
#                       outputs3, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs2,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN3")
#                       outputs4, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs3,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN4")
#                       outputs5, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs4,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN5")
#                       outputs6, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs5,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN6")
#                       outputs, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, outputs6,
#                                                               initial_state_fw=self.init_state_fw,
#                                                               initial_state_bw=self.init_state_bw, scope="RNN7")
#                   else:
#                       print('running single stack Bi-directional RNN.......')
#                       outputs, output_state_fw,output_state_bw  = tf.nn.bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, data,
#                                                                     initial_state_fw=self.init_state_fw,
#                                                                     initial_state_bw=self.init_state_bw, scope="RNN1")

#                   if self.config.attention:
#                         pred = self.compute_output(outputs, data)
#                   else:
#                         pred = self.compute_output(outputs[-1], data)
#                   return pred


#     def RNN(self, scope):
#             '''
#             standard rnn model
#             input: information needed to construct a model. F_bias is only relevant when cell type is LSTM
#             output:
#                 linear combination of the rnn results and output weights
#             '''
#             # input shape: (batch_size, step_size, input_dim)
#             # we need to permute step_size and batch_size(change the position of step and batch size)
#             data = tf.transpose(self.input_data, [1, 0, 2])
#             # Reshape to prepare input to hidden activation
#             # (step_size*batch_size, n_input), flattens the batch and step
#             #after the above transformation, data is now (step_size*batch_size, input_dim)
#             data = tf.reshape(data, [-1, self.config.input_dim])

#             with tf.variable_scope(str(scope)):
#                   data = tf.nn.dropout(tf.matmul(data, self.weights['hidden']) + self.biases['hidden'], self.config.dropout)

#                   # Define a lstm cell with tensorflow
#                   if self.config.cell_type == 'GRU':
#                       lstm_cell = rnn_cell.GRUCell(self.config.hidden_dim)
#                   else:
#                       lstm_cell = rnn_cell.LSTMCell(self.config.hidden_dim, forget_bias=self.config.forget_bias, state_is_tuple=True)
#                   self.init_state = lstm_cell.zero_state(self.config.batch_size, dtype=tf.float32)
#                   # Split data because rnn cell needs a list of inputs for the RNN inner loop
#                   data = tf.split(0, self.config.step_size, data) # step_size * (batch_size, hidden_dim)

#                   if self.config.stack == 2:
#                       print('running stack 2.......')
#                       output1, states = tf.nn.rnn(lstm_cell, data, initial_state=self.init_state, scope="RNN1")
#                       outputs, states = tf.nn.rnn(lstm_cell, output1, initial_state=self.init_state, scope="RNN2")
#                   elif self.config.stack == 3:
#                       print('running stack 3.......')
#                       output1, states = tf.nn.rnn(lstm_cell, data, initial_state=self.init_state, scope="RNN1")
#                       output2, states = tf.nn.rnn(lstm_cell, output1, initial_state=self.init_state, scope="RNN2")
#                       outputs, states = tf.nn.rnn(lstm_cell, output2, initial_state=self.init_state, scope="RNN3")
#                   elif self.config.stack == 4:
#                       print('running stack 4.......')
#                       output1, states = tf.nn.rnn(lstm_cell, data, initial_state=self.init_state, scope="RNN1")
#                       output2, states = tf.nn.rnn(lstm_cell, output1, initial_state=self.init_state, scope="RNN2")
#                       output3, states = tf.nn.rnn(lstm_cell, output2, initial_state=self.init_state, scope="RNN3")
#                       outputs, states = tf.nn.rnn(lstm_cell, output3, initial_state=self.init_state, scope="RNN4")
#                   else:
#                       print('running single stack RNN.......')
#                       outputs, states = tf.nn.rnn(lstm_cell, data, initial_state=self.init_state, scope="RNN1")

#                   # Get lstm cell output
#                   outputs, states = tf.nn.rnn(lstm_cell, data, initial_state=self.init_state)

#                   # we really just interested in the last state's output
#                   # return [tf.matmul(outputs[-1], self.weights['out1']) + self.biases['out1']]
#                   if self.config.attention:
#                         pred=self.compute_output(outputs)
#                   else:
#                         pred =self.compute_output(outputs[-1])
#                   return pred

#     def compute_output(self, outputs, data):
#             if not self.config.attention:
#                 print('running none attention mode.......')
#                 # Linear activation
#                 # for basic rnn prediction we really just interested in the last state's output, we need to average them in this case
#                 return [tf.nn.dropout(tf.matmul(outputs, self.weights['out1']) + self.biases['out1'], self.config.dropout)]
#             else:
#                 print('running attention mode.......')
#                 # print total_outputs.get_shape()
#                 # print outputs[-1].get_shape()
#                 # we now need to do apply the attention model, the output of each layer comes out from outputs[0], total layer = step_size
#                 # I will first iterate through each layer and multiply the output to its weights
#                 # I will follow the example below, which essentially produces a matrix vector product
#                 # x = tf.constant(5.0, shape=[5, 6])
#                 # w = tf.constant([0.0, 1.0, 2.0, 3.0, 4.0, 5.0])
#                 # xw = tf.mul(x, w)
#                 # max_in_rows = tf.reduce_max(xw, 1), i need to ues reduce sum here
#                 #
#                 # sess = tf.Session()
#                 # print sess.run(xw)
#                 # # ==> [[0.0, 5.0, 10.0, 15.0, 20.0, 25.0],
#                 # #      [0.0, 5.0, 10.0, 15.0, 20.0, 25.0],
#                 # #      [0.0, 5.0, 10.0, 15.0, 20.0, 25.0],
#                 # #      [0.0, 5.0, 10.0, 15.0, 20.0, 25.0],
#                 # #      [0.0, 5.0, 10.0, 15.0, 20.0, 25.0]]
#                 #
#                 # print sess.run(max_in_rows)
#                 # # ==> [25.0, 25.0, 25.0, 25.0, 25.0]
#                 # print self.weights[1].get_shape() #(256,)
#                 # print outputs[27].get_shape() #(?,256)
#                 # attention_list = [tf.reduce_sum(tf.mul(outputs[i], weights[i]),1)+bias[i] for i in range(len(outputs))]
#                 if self.config.residual:
#                     print('running residual mode.......')
#                     data = tf.transpose(self.input_data, [1, 0, 2])
#                     for i in range(self.config.step_size):
#                         data1 = tf.concat(1, [data[i], data[i]])
#                         outputs[i]+=data1
#                 else:
#                     print('running non-residual mode.......')
#                 attention_list = [tf.reduce_sum(tf.mul(outputs[i], self.weights[i]),1)+self.biases[i] for i in range(self.config.step_size)]
#                 #after obtaining the attention list I need to make a vector out of it
#                 attention_vec = tf.transpose(tf.pack(attention_list))
#                 #attention_vec = tf.add(attention_vec,data)
#                 # print self.weights['out1'].get_shape()
#                 pred=[tf.nn.dropout(tf.matmul(attention_vec, self.weights['out1']) + self.biases['out1'], self.config.dropout)]
#                 return pred

#     def add_placeholders(self):
#             '''
#             feeding information to the input placeholders
#             this function is call as the init process, data are feed in by tensor flow graph
#             '''
#             # define graph input place holders
#             self.input_data = tf.placeholder("float", [None, self.config.step_size, self.config.input_dim])
#             self.input_label = tf.placeholder("float", [None, self.config.label_dim])

#     def get_feed_dict(self, data, label):
#         if (self.config.model == 'BiRNN'):
#             feed_dict = {self.input_data: data,
#                          self.input_label: label}
#         else:
#             feed_dict = {self.input_data: data,
#                          self.input_label: label}
#         return feed_dict

#     def run_model(self, scope=None, debug=False):
#             '''
#             this is the core function that launches the model, it initializes the weights and call the model specified in the config
#             after model execution it records the test and training loss.
#             input: model, training data, label, test data/label, and all other paramters needed to run the model
#             output:
#                 the best learning rate found through cross vaildation.
#             '''
#             self.print_model_params()
#             #making predictions, this actives the rnn model
#             if (self.config.model =="BiRNN"): pred = self.BiRNN(scope)
#             elif (self.config.model=="RNN"): pred = self.RNN(scope)

#              # Define loss and optimizer
#             label1 = tf.split(1, self.config.label_dim, self.input_label)

#             if self.config.run_type=='regression':
#                 cost = tf.sqrt(tf.reduce_mean(tf.square(tf.sub(pred[0], self.input_label))))

#             if self.config.run_type=='classification':
#                 cost = tf.reduce_mean(
#                     tf.nn.softmax_cross_entropy_with_logits(pred[0], self.input_label))


#             optimizer = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(cost) # Adam Optimizer

#             #opt_func=tf.train.AdamOptimizer(learning_rate=self.config.learning_rate)
#             #tvars=tf.trainable_variables()

#             #clip the graident
#             # tf.gradients(ys, xs, grad_ys=None, name='gradients', colocate_gradients_with_ops=False, gate_gradients=False, aggregation_method=None)
#             # Constructs symbolic partial derivatives of sum of ys w.r.t. x in xs.
#             #grads, _=tf.clip_by_global_norm(tf.gradients(cost, tvars), self.config.grad_clip_threshold)
#             #optimizer=opt_func.apply_gradients(zip(grads, tvars))

#             #compute accuracy for classification
#             class_one_hot_prediction = tf.argmax(self.input_label, 1)
#             classification_prediction=tf.argmax(tf.nn.softmax(pred[0]),1)
#             classification_acc =tf.reduce_sum(tf.cast(tf.equal(classification_prediction, class_one_hot_prediction), 'int32'))

#              # Initializing the variables
#             init = tf.global_variables_initializer()
#             saver = tf.train.Saver()

#             def ValidationError(_type):
#                 #-------------------------validation starts here-------------------------------------------
#                     val_loss=[]
#                     val_epoch = 0
#                     if _type == 'val':
#                           print('running validation loss')
#                           self.read_val_file(self.config.val_data_dir)
#                     elif _type == 'test':
#                           print('running test loss')
#                           self.read_val_file(self.config.test_data_dir)

#                     train_dropout=self.config.dropout
#                     self.config.dropout = train_dropout
#                     val_i=1
#                     val_last_index = 0
#                     while val_i*self.config.batch_size <= len(self.val_data):
#                         samples=[i for i in range(val_last_index, val_i*self.config.batch_size)]
#                         val_last_index = val_i*self.config.batch_size
#                         val_i+=1
#                         sample = np.array(samples)
#                         input_training_data=self.val_data[sample, :]
#                         input_training_label=self.val_label[sample, :]
#                         feed_dict = self.get_feed_dict(input_training_data, input_training_label)
#                         if self.config.run_type == 'classification':
#                             loss, match = sess.run([cost, classification_acc], feed_dict)
#                             acc= 1.0*match/len(input_training_label)
#                         elif self.config.run_type == 'regression':
#                             acc = sess.run(cost, feed_dict)
#                         val_loss.append(acc)
#                     self.config.dropout=train_dropout
#                     return 1.0*sum(val_loss)/(len(val_loss))

#             def SaveWeights():
#                   # if not os.path.exists("./weights"):
#                   if not os.path.exists("./"+self.config.weight_dir):
#                         os.makedirs("./"+self.config.weight_dir)
#                   path=saver.save(sess, './'+self.config.weight_dir+'/', global_step=None, latest_filename=None, meta_graph_suffix='meta', write_meta_graph=True, write_state=True)

#             saver = tf.train.Saver()
#             #Launch the graph
#             with tf.Session() as sess:
#                 #saver.restore(sess, './'+self.config.weight_dir+'/')
#                 #print 'weights restored...'
#                 #test_accuracy = ValidationError('test')
#                 #print 'test accuracy', test_accuracy
#                 #return test_accuracy

#                 sess.run(init)
#                 best_val_epoch = 0
#                 if self.config.run_type=='classification':
#                     best_val_accuracy= float('-inf')
#                 if self.config.run_type=='regression':
#                     best_val_accuracy= float('inf')
#                 #-------------------------training starts here-------------------------------------------
#                 # I have batches per epoch and epoch per validation check out which is my max_epoch
#                 # I will read one file per time and taking batches out of the file, once the file is exhausted I will move
#                 # on to the next file without interupting the epoch run
#                 # note, number batchs per epoch * batch size must be less than the numver of reviews in a file
#                 index = 1
#                 last_index = 0
#                 total_epoch = 0
#                 val_epoch = 0
#                 val_loss = []
#                 for epoch in xrange(self.config.max_epochs):
#                     total_epoch +=1
#                     val_epoch +=1
#                     train_accuarcy = []
#                     test_accuracy = 0
#                     train_loss = []
#                     counter = 0
#                     # Training
#                     total_traing_data = self.training_label.shape[0]
#                     while counter  < self.config.batches_per_epoch:
#                         current_index=index*self.config.batch_size
#                         if current_index >= total_traing_data:
#                             samples=[i for i in range(total_traing_data-self.config.batch_size, total_traing_data)]
#                             sample = np.array(samples)
#                             #samples=np.random.randint(total_traing_data, size=self.config.batch_size)
#                             input_training_data=self.training_data[sample, :]
#                             input_training_label=self.training_label[sample, :]
#                             index = 1
#                             last_index=0
#                             self.config.cur_marker+=1
#                             if self.config.cur_marker == len(self.config.marker_list): self.config.cur_marker = 0
# #                             self.read_train_file(self.config.train_data_dir)
#                             self.read_train_file()
#                         else:
#                             samples=[i for i in range(last_index, current_index)]
#                             last_index = current_index
#                             index +=1
#                             sample = np.array(samples)
#                             input_training_data=self.training_data[sample, :]
#                             input_training_label=self.training_label[sample, :]

#                         feed_dict = self.get_feed_dict(input_training_data, input_training_label)
#                         sess.run(optimizer, feed_dict)
#                         if self.config.run_type == 'classification':
#                             loss, match = sess.run([cost, classification_acc], feed_dict)
#                             acc= 1.0*match/len(input_training_label)
#                         elif self.config.run_type == 'regression':
#                             acc = sess.run(cost, feed_dict)
#                             loss = acc
#                         train_accuarcy.append(acc)
#                         train_loss.append(loss)
#                         counter += 1

#                     epoch_loss=sum(train_loss)/counter
#                     epoch_acc=sum(train_accuarcy)/counter
#                     print("Epoch " + str(epoch) + ", Loss= " + "{:.6f}".format(epoch_loss) + ", Accuracy= " + "{:.6f}".format(epoch_acc))
#                     sys.stdout.flush()
#                     if val_epoch == self.config.epoch_per_val:
#                         val_epoch = 0
#                         val_accuracy = ValidationError('val')
#                         if self.config.run_type=='classification':
#                             if best_val_accuracy<val_accuracy:
#                                 best_val_epoch=total_epoch
#                                 best_val_accuracy= val_accuracy
#                                 SaveWeights()
#                         if self.config.run_type == 'regression':
#                             if best_val_accuracy>val_accuracy:
#                                 best_val_epoch=total_epoch
#                                 best_val_accuracy= val_accuracy
#                                 SaveWeights()

#                         print('*'*30)
#                         print(str(self.config.run_type)+' validation accuracy at epoch %d: %f'%(total_epoch, val_accuracy))
#                         print('best validation accuracy so far at epoch %d: %f'%(total_epoch, best_val_accuracy))
#                         print('*'*30)
#                 print("Optimization Finished!")

#                 saver.restore(sess, './'+self.config.weight_dir+'/')
#                 print('weights restored...')
#                 test_accuracy = ValidationError('test')
#                 print('test accuracy', test_accuracy)
#                 return test_accuracy


#     def __init__(self, config):
#       self.config = config
# #       if len(self.config.marker_list) == 0: self.read_markers(self.config.train_data_dir)
# #       self.config_cur_marker=self.config.marker_list[0]
# #       self.read_train_file(self.config.train_data_dir)
#       self.read_train_file()
#       self.add_placeholders()
#       self.init_variables()
#       self.val_data=[]

# def run_regression(config=None, stack=1, attention=False, res=False):
#       for i in range(stack):
#             ts = int(time.time())
#             if config is None:
#                 config = Config()
#             config.run_type='regression'
#             config.train_data_dir='Data/train/regression'
#             config.val_data_dir='Data/val/regression'
#             config.test_data_dir='Data/test/regression'

#             config.cell_type='LSTM'
#             #config.cell_type='GRU'
#             config.model="BiRNN"
#             #config.model="RNN"
#             config.learning_rate=0.001
#             config.batch_size=16
#             config.batches_per_epoch=80
#             config.max_epochs=40
#             config.dropout=1
#             config.hidden_dim=300
#             config.epoch_per_val=5
#             config.stack=i+1
#             config.attention=attention
#             #config.init='norm'
#             config.init='xaiver'
#             config.grad_clip_threshold = 10000
#             config.residual=res

#             config.weight_dir="regression_"+str(config.model)+"_"+str(config.cell_type)+"_"+"stack"+str(config.stack)+"_"+str(ts)
#             if not os.path.exists("./"+config.weight_dir):
#               os.makedirs("./"+config.weight_dir)
#             f=open("./"+config.weight_dir+'/run_file.txt', 'a')
#             sys.stdout = f
#             model = Models(config)
#             loss_val = model.run_model(scope=str(i))

# def run_2classification(config=None,stack=1, attention=False, res=False):
#     for i in range(5,8):
#             ts = int(time.time())
#             if config is None:
#               config = Config()
#             config.run_type='classification'
#             config.train_data_dir='Data/train/2_classification'
#             config.val_data_dir='Data/val/2_classification'
#             config.test_data_dir='Data/test/2_classification'
#             config.cell_type='LSTM'
#             #config.cell_type='GRU'
#             config.model="BiRNN"
#             #config.model="RNN"
#             config.learning_rate=0.001
#             config.batch_size=16
#             config.batches_per_epoch=80
#             config.max_epochs=40
#             config.dropout=1
#             config.hidden_dim=300
#             config.epoch_per_val=5
#             config.stack=i+1
#             config.attention=attention
#             #config.init='norm'
#             config.init='xaiver'
#             config.grad_clip_threshold = 10000
#             config.residual=res

#             config.weight_dir="attention_2_classification_"+str(config.model)+"_"+str(config.cell_type)+"_"+"stack"+str(config.stack)+"_"+str(ts)
#             if not os.path.exists("./"+config.weight_dir):
#               os.makedirs("./"+config.weight_dir)
#             f=open("./"+config.weight_dir+'/run_file.txt', 'a')
#             sys.stdout = f
#             model = Models(config)
#             loss_val = model.run_model(scope=str(i))

# #def run_3classification(config=None,stack=1, attention=False):
# #      for i in range(stack):
# #            ts = int(time.time())
# #            ts = int(time.time())
# #            if config is None:
# #              config = Config()
# #
# #            config.run_type='classification'
# #            config.train_data_dir='Data/train/3_classification'
# #            config.val_data_dir='Data/val/3_classification'
# #            config.test_data_dir='Data/test/3_classification'
# #            config.weight_dir="3_classification_"+str(ts)
# #            config.cell_type='LSTM'
# #            #config.cell_type='GRU'
# #            config.model="BiRNN"
# #            #config.model="RNN"
# #            config.learning_rate=0.001
# #            config.batch_size=128
# #            config.batches_per_epoch=5
# #            config.max_epochs=30
# #            config.dropout=0.8
# #            config.hidden_dim=300
# #            config.epoch_per_val=5
# #            config.stack=i+1
# #            config.attention=attention
# #            #config.init='norm'
# #            config.init='xaiver'
# #            config.grad_clip_threshold = 10000
# #
# #
# #            config.weight_dir="3_classification_"+str(config.model)+"_"+str(config.cell_type)+"_"+"stack"+str(config.stack)+"_"+str(ts)
# #            if not os.path.exists("./"+config.weight_dir):
# #              os.makedirs("./"+config.weight_dir)
# #            f=open("./"+config.weight_dir+'/run_file.txt', 'a')
# #            sys.stdout = f
# #            model = Models(config)
# #            loss_val = model.run_model(scope=str(i))

# if __name__ == "__main__":
#     random.seed(31415)
#     print(sys.argv[1])
#     if sys.argv[1] == '2_classification':
#         run_2classification(stack=1, attention=False)
#     elif sys.argv[1] == '3_classification':
#         run_3classification(stack=4, attention=True)
#     elif sys.argv[1] == 'regression':
#         run_regression(stack=1, attention=False)
#     elif sys.argv[1]=='stack_regression':
#         run_regression(stack=7, attention=True, res=True)
#     elif sys.argv[1]=='stack_classification':
#         run_2classification(stack=7, attention=True, res=True)
#     else:
#         print('you must select a task to run')

In [27]:
# run_regression()

### Weight collector

In [28]:
# import os
# from itertools import izip
# import operator
# import csv
# import numpy as np
# #this is a hack file, assumed a lot of things, such as naming conventions and implied squence
# wordfile=[]
# attention_file =[]
# correctness_file =[]
# class_file=[]
# pos_word_weight_dic={}
# neg_word_weight_dic={}
# pos_freq={}
# neg_freq={}
# #cause the way the files are named, not only i grab the right one, they are also aling, such as word_0 aligns with attention_0 and correct_0 and class_0
# files = [f for f in os.listdir('.') if os.path.isfile(f)]
# for f in files:
#     if f.startswith('word'):
#         wordfile.append(f)
#     if f.startswith('attention'):
#         attention_file.append(f)
#     if f.startswith('correct'):
#         correctness_file.append(f)
#     if f.startswith('y'):
#         class_file.append(f)
# print wordfile
# print attention_file
# print correctness_file
# print class_file
# for i, wf in enumerate(wordfile):
#     print i, wf
#     with open(wf) as word_f, open(attention_file[i]) as attention_f, open(correctness_file[i]) as correct_f, open(class_file[i]) as class_f:
#         correctness = [line.rstrip('\n') for line in correct_f]
#         word=[line.rstrip('\n') for line in word_f]
#         attention=[line.rstrip('\n') for line in attention_f]
#         classes=[line.rstrip('\n') for line in class_f]
#         print len(attention)
#         for i in correctness:
#             word_count = 0
#             while word_count < 250:
#                 #if i == "True":
#                     x = word[word_count]
#                     y = attention[word_count]
#                     if classes[word_count]=="10":
#                         if x in neg_word_weight_dic:
#                                neg_word_weight_dic[x]-=float(y)
#                                neg_freq[x]+=1
#                         else:
#                                neg_freq[x]=1
#                                neg_word_weight_dic[x]=-1.0*float(y)
#                     if classes[word_count]=="01":
#                         if x in pos_word_weight_dic:
#                                pos_word_weight_dic[x]+=float(y)
#                                pos_freq[x]+=1
#                         else:
#                                pos_freq[x]=1
#                                pos_word_weight_dic[x]=float(y)
#                     word_count +=1
#                 #print word_weight_dic
#                 #print freq
# freq_t=0.0
# for word in neg_freq:
#     freq_t+=neg_freq[word]
# for word in pos_freq:
#     freq_t+=pos_freq[word]
# for word in pos_word_weight_dic:
#     freq_c = pos_freq[word]
#     if word in neg_word_weight_dic:
#         pos_word_weight_dic[word] += neg_word_weight_dic[word]
#         freq_c+=neg_freq[word]
#     #pos_word_weight_dic[word] /=np.abs(freq_c*(1+np.log(freq_c/freq_t)))
#     #pos_word_weight_dic[word] /=np.abs(freq_c*np.log(freq_c/freq_t))
#     pos_word_weight_dic[word] /=freq_c
# #for word in neg_word_weight_dic:
# #    neg_word_weight_dic[word] /=(1+100*np.log(neg_freq[word]))

# pos_sorted_word = sorted(pos_word_weight_dic.items(), key=operator.itemgetter(1), reverse=True)
# #neg_sorted_word = sorted(neg_word_weight_dic.items(), key=operator.itemgetter(1), reverse=True)
# #print pos_sorted_word
# #print neg_sorted_word

# w = csv.writer(open("output.csv", "w"))
# for (key, val) in pos_sorted_word:
#     w.writerow([key, val])

# #w = csv.writer(open("neg_output.csv", "w"))
# #for (key, val) in neg_sorted_word:
# #    w.writerow([key, val])

# print len(pos_freq)
# print len(pos_word_weight_dic)

# #print len(neg_freq)
# #print len(neg_word_weight_dic)


## Alternate: LSTM Only

In [29]:
# !pip install keras
# !pip install pandas_ml

# !pip install --upgrade pip

In [96]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Embedding, Dropout, Dense, LSTM
from keras.callbacks import CSVLogger, History, ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc, accuracy_score, precision_recall_fscore_support
from pandas_ml import ConfusionMatrix
%matplotlib inline

class YelpLSTM(object):
    def __init__(self, parms):
        self._parms = parms
#         self._tokenizer = Tokenizer(nb_words=self._parms['vocabulary_size'])
        self._tokenizer = Tokenizer(num_words=self._parms['vocabulary_size'])


        self._reviews = None
        self._balanced = None
        self._glove = None
        self._embedding_matrix = None
        self._model = None
        self._verbose = True
        self._predicted_classes = None
        self._predicted_proba = None
        self._eval_actual = None
        self._eval_predicted_proba = None
        self._eval_predicted_classes = None
        self._logs = None
        self._tpr = None
        self._fpr = None
        self._thresholds = None
        self._auc = None
        self._target_range = None
        
    def console(self, message):
        if self._verbose:
            print(message)
            
    def update_parms(self, parms):
        if parms['vocabulary_size'] != self._parms['vocabulary_size']:
            self._tokenizer = Tokenizer(nb_words=parms['vocabulary_size'])
        self._parms = parms
        
#     def load_reviews(self, reviews):
    def load_reviews(self, reviews_path):
        self.console('Loading reviews...')
#         self._reviews = pd.read_csv(reviews)


#         print("reviews test before load", reviews[0:10])

#         self._reviews = reviews



#         self._reviews = pd.read_json(reviews_path, lines=True)
        self._reviews = pd.read_json(reviews_path, lines=True)

#         self._reviews = pd.read_json("../dataset/restaurant_reviews_10k.json", lines=True)

#         self._reviews = pd.read_json("../dataset/restaurant_reviews_10k.json", lines=True)
#         self._reviews = reviews_df




        self.console('%d reviews loaded.' % len(self._reviews))
#         self.console('%d reviews loaded.' % np.shape(self._reviews))
        
        
        
        
#         self.console(self._reviews[0:10])
        
        
#     def load_glove(self, glove_folder):
    def load_glove(self, gloveFile):
        self.console('Loading GloVe embeddings...')
        glove = {}
        count = 0
#         with open(os.path.join(glove_folder, 'glove.6B.' + str(self._parms['embedding_dim']) + 'd.txt'), 'r') as f:

        with open(gloveFile, 'r') as f:
            while True:
                line = f.readline()
                if not line:
                    break
                line = line.split(' ')
                word = line[0]
                vector = np.asarray(line[1:], dtype='float32')
                glove[word] = vector
        self._glove = glove
        self.console('%d embeddings loaded.' % len(self._glove))
        
    @property
    def training(self):
        return self._X_train, self._y_train
    
    @property
    def test(self):
        return self._X_test, self._y_test
    
    @property
    def best_model(self):
        return self._model
    
    @property
    def predicted_classes(self):
        return self._predicted_classes
    
    @property
    def predicted_proba(self):
        return self._predicted_proba
    
    @property
    def tokenizer(self):
        return self._tokenizer
    
    @property
    def logs(self):
        return self._logs
        
    @property
    def confusion_matrix(self):
        return self._cm
    
    @property
    def prfs(self):
        return self._prfs
    
    @property
    def fpr(self):
        return self._fpr
    
    @property
    def tpr(self):
        return self._tpr
    
    @property
    def thresholds(self):
        return self._thresholds
    
    @property
    def auc(self):
        return self._auc
        
    def _balance_dataset(self):
        categories = []
        samples = []
                
        self._target_range = range(2)
        if self._parms['target']['feature'] == 'stars':
            prefix = ''
            self._target_range = range(1,6)
        else:
            prefix = 'is_'
            self._reviews['is_' + self._parms['target']['feature']] = self._reviews[self._parms['target']['feature']].apply(lambda v: v > self._parms['target']['threshold']).astype(int)
            
        for i in self._target_range:
            categories.append(self._reviews[self._reviews[prefix + self._parms['target']['feature']] == i])
            
        
        
#         sizes = map(lambda s: len(s), categories)
        
        sizes = list(map(lambda s: len(s), categories))
        
        
#         print(sizes)
        
        
        nb_samples = min(self._parms['samples'], np.min(sizes))
#         nb_samples = np.min(sizes)
        
        
        
#         nb_samples = nb_samples.astype(np.int32)
        
        
        
        
        
        
        self.console('Using %s samples per category' % str(nb_samples))
        
        
        
#         self.console('Type for nb_samples' % str(type(nb_samples)))
        
        
        
        
        for category in categories:
            samples.append(category.sample(n=nb_samples, random_state=32))
        self._balanced = pd.concat(samples)

    def _build_datasets(self):
        self._tokenizer.fit_on_texts(self._balanced.text.values)
        
        sequences = self._tokenizer.texts_to_sequences(self._balanced.text)
        padded_seq = pad_sequences(sequences, maxlen=self._parms['seq_size'])

        if self._parms['target']['feature'] == 'stars':
            target = to_categorical(self._balanced[self._parms['target']['feature']])
        else:
            target = self._balanced['is_' + self._parms['target']['feature']].values

        self._X_train, self._X_test, self._y_train, self._y_test = train_test_split(padded_seq, target, test_size=0.2, random_state=42)

    def _build_embeddings(self):
        tokenized_words = map(lambda t: t[0], sorted(self._tokenizer.word_index.items(), key=lambda t: t[1])[:self._parms['vocabulary_size']])

        embedding_matrix = np.zeros((self._parms['vocabulary_size'], self._parms['embedding_dim']))
        for idx, word in enumerate(tokenized_words):
            try:
                embedding_matrix[idx] = self._glove[word]
            except:
                pass
        self._embedding_matrix = embedding_matrix

    def _build_model(self):
        model = Sequential()

        model.add(Embedding(input_dim=self._parms['vocabulary_size'],
                            output_dim=self._parms['embedding_dim'],
                            input_length=self._parms['seq_size'],
                            weights=[self._embedding_matrix],
                            trainable=False))

        model.add(LSTM(self._parms['memory_neurons']))
        model.add(Dropout(0.2))

        activation = 'sigmoid'
        loss = 'binary_crossentropy'
        outputs = 1
        if len(self._y_train.shape) > 1:
            activation = 'softmax'
            loss = 'categorical_crossentropy'
            outputs = self._y_train.shape[1]

        model.add(Dense(outputs, activation=activation))
        model.compile(loss=loss, optimizer='nadam', metrics=['accuracy'])
        self._model = model
        self.console(self._model.summary())

    def fit(self, model_name, folder='./', verbose=True):
        self._verbose = verbose
        
        assert self._reviews is not None, 'Reviews file was not loaded'
        assert len(self._reviews) > 0, 'Reviews file is empty'
        assert self._glove is not None, 'GloVe file was not loaded'
        assert len(self._glove) > 0, 'GloVe file is empty'
        
        self.console('Balancing dataset...')
        self._balance_dataset()
        self.console('Building training and test datasets...')
        self._build_datasets()
        self.console('Building word embeddings from GloVe...')
        self._build_embeddings()
        self.console('Building model...')
        self._build_model()
        self.console('Fitting model...')
        
        parms_desc = model_name + '_%ddim_%dvoc_%dseq' % (self._parms['embedding_dim'],
                                                          self._parms['vocabulary_size'],
                                                          self._parms['seq_size'])

        hist = History()
        
        
#         logger = CSVLogger(os.path.join(folder, parms_desc) + '_training_logs.csv')
        logger = CSVLogger('_training_logs.csv')
        
        
        
        
#         checks = ModelCheckpoint(os.path.join(folder, parms_desc) + '_model-{epoch:02d}_{val_acc:.2f}',
        checks = ModelCheckpoint('_model-{epoch:02d}_{val_acc:.2f}',
                                 monitor='val_acc',
                                 verbose=int(self._verbose),
                                 save_best_only=True,
                                 mode='max')
        early_stopping = EarlyStopping(monitor='val_acc', patience=2)

        self._model.fit(self._X_train,
                        self._y_train,
                        nb_epoch=self._parms['nb_epochs'],
#                         epoch=self._parms['epochs'],
                        batch_size=self._parms['batch_size'],
                        validation_data=(self._X_test, self._y_test),
                        callbacks=[checks, hist, logger, early_stopping])
        
#         self._logs = pd.read_csv(os.path.join(folder, parms_desc) + '_training_logs.csv')
        self._logs = pd.read_csv('_training_logs.csv')
        best_epoch = self._logs['val_acc'].argmax()
        best_val_acc = '{:.2f}'.format(self._logs['val_acc'].iloc[best_epoch])
#         best_model = (os.path.join(folder, parms_desc) + '_model-%02d_%s') % (best_epoch, best_val_acc)
        best_model = ('_model-%02d_%s') % (best_epoch, best_val_acc)
        
#         with open(os.path.join(folder, parms_desc + '_tokenizer'), 'wb') as tok:
        with open('_tokenizer', 'wb') as tok:
            pickle.dump(self._tokenizer, tok)
        
        self.console('Calculating predictions for the best model...')
        self._model = load_model(best_model)
        self._predicted_proba = self.predict_proba()
        if len(self._y_train.shape) > 1:
            self._predicted_classes = np.argmax(self._predicted_proba, axis=1)
        else:
            self._predicted_classes = (self._predicted_proba > 0.5).astype(int)
        self.console('Calculating metrics for the best model...')
        self.evaluate()
        self.console('Finished!')
        
        return self._model

    def load(self, tokenizer, model):
        error_msg = ''
        try:
            self._model = load_model(model)
        except:
            error_msg = 'Error loading model!'
            
        try:
            with open(tokenizer, 'rb') as tok:
                self._tokenizer = pickle.load(tok)
        except:
            error_msg = 'Error loading tokenizer!'
            
        return (error_msg == ''), error_msg
    
    def make_prediction(self, sentence):
        sequence = self._tokenizer.texts_to_sequences([sentence])
        padded_seq = pad_sequences(sequence, maxlen=self._parms['seq_size'])
        return self.predict_classes(padded_seq)[0]
    
    def predict_classes(self, X=None, threshold=0.5):
        if len(self._y_train.shape) > 1:
            predictions = np.argmax(self.predict_proba(X), axis=1)
        else:
            predictions = (self.predict_proba(X) > threshold).astype(int)
        return predictions
        
    def predict_proba(self, X=None):
        if X is None:
            X = self._X_test
        predictions = self._model.predict_proba(X)
        return predictions
    
    def evaluate(self, actual=None, predicted_proba=None, threshold=0.5):
        if actual is None:
            eval_actual = self._y_test[:]
        else:
            eval_actual = actual[:]
            
        if predicted_proba is None:
            eval_predicted_proba = self._predicted_proba[:]
        else:
            eval_predicted_proba = predicted_proba[:]
            
        if len(eval_actual.shape) == 1:
            binary = True
            eval_predicted_classes = (eval_predicted_proba > threshold).astype(int).ravel()
            eval_predicted_proba = eval_predicted_proba.ravel()
        else:
            binary = False
            eval_predicted_classes = eval_predicted_proba.argmax(axis=1)
            eval_actual = eval_actual.argmax(axis=1)
        
        self._eval_actual = eval_actual
        self._eval_predicted_proba = eval_predicted_proba
        self._eval_predicted_classes = eval_predicted_classes
    
        self._cm = ConfusionMatrix(self._eval_actual, self._eval_predicted_classes)

        prfs = precision_recall_fscore_support(y_true=self._eval_actual, y_pred=self._eval_predicted_classes)
        prfs = pd.DataFrame.from_dict(dict(zip(['precision', 'recall', 'fscore', 'support'], prfs)))
        
#         prfs.set_index([self._target_range], inplace=True)
        
        self._prfs = prfs
        
        if binary:
            self._fpr, self._tpr, self._thresholds = roc_curve(self._eval_actual, self._eval_predicted_proba)
            self._auc = auc(self._fpr, self._tpr)
        else:
            self._fpr, self._tpr, self._thresholds, self._auc = None, None, None, None

In [86]:
# reviews = './dataset/english_reviews_sample.csv'


# reviews_df = pd.read_json("../dataset/restaurant_reviews_10k.json", lines=True)

# reviews_path = "../dataset/restaurant_reviews_10k.json"


# print(reviews_df[0:10])
# print(reviews_df.head())

# reviews_df.head()
# print(np.shape(reviews_df))

# glove_folder = './glove.6B'

In [97]:
parms = {'embedding_dim': 100,
         'vocabulary_size': 10000,
         'seq_size': 400,
         'nb_epochs': 30,
#          'epochs': 30,
         'batch_size': 128,
         'memory_neurons': 100,
         'target': {'feature': 'stars', 'threshold': None},
         'samples': 62500}

lstm = YelpLSTM(parms)

In [98]:
# test = lstm.load_reviews(reviews_df)
# print(test)
# reviews_path = "../dataset/review_10k.json"

reviews_path = "../dataset/restaurant_reviews_10k.json"
lstm.load_reviews(reviews_path)



Loading reviews...
10000 reviews loaded.


In [99]:
# # Smallest GloVe file
# gloveFile = "../../glove/glove.6B.50d.txt"

# # Smaller GloVe file
gloveFile = "../../glove/glove.6B.300d.txt"

# # Primary GloVe file
# gloveFile = "../../glove/glove.42B.300d.txt



start_time = datetime.datetime.now()
print("Start time: ", start_time)

lstm.load_glove(gloveFile)

end_time = datetime.datetime.now()
print("End time: ", end_time)

time_taken = end_time - start_time
print("Time taken: ", time_taken)

Start time:  2018-08-02 17:25:49.827405
Loading GloVe embeddings...
400000 embeddings loaded.
End time:  2018-08-02 17:26:21.092899
Time taken:  0:00:31.265494


In [100]:
# model = lstm.fit(model_name='stars_100neurons', folder='./models/stars')
model = lstm.fit(model_name='stars_100neurons', folder='./')

Balancing dataset...
Using 965 samples per category
Building training and test datasets...
Building word embeddings from GloVe...
Building model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 400, 100)          1000000   
_________________________________________________________________
lstm_7 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_7 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 6)                 606       
Total params: 1,081,006
Trainable params: 81,006
Non-trainable params: 1,000,000
_________________________________________________________________
None
Fitting model...




Train on 3860 samples, validate on 965 samples
Epoch 1/30

Epoch 00001: val_acc improved from -inf to 0.17824, saving model to _model-01_0.18
Epoch 2/30

Epoch 00002: val_acc improved from 0.17824 to 0.22176, saving model to _model-02_0.22
Epoch 3/30

Epoch 00003: val_acc did not improve from 0.22176
Epoch 4/30

Epoch 00004: val_acc did not improve from 0.22176
Calculating predictions for the best model...




OSError: Unable to open file (unable to open file: name = '_model-01_0.22', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)