In [17]:
import numpy as np
import pandas as pd
import os
import string
import tensorflow as tf
print(tf.__version__)

1.10.0


In [2]:
pd.set_option('display.max_colwidth', -1)

In [3]:
datadir = os.path.join('input', '99bitcoins')
glove_wordvectors = os.path.join('input', 'glove.6B', 'glove.6B.50d.txt')

In [4]:
main_df = pd.read_csv(os.path.join(datadir, '99bitcoins_main.csv'))

I'm trying to recreate https://github.com/adeshpande3/LSTM-Sentiment-Analysis/blob/master/Oriole%20LSTM.ipynb
using the trained word vector files from https://nlp.stanford.edu/projects/glove/. It's not clear where or what 'wordsList.npy', used in the first link, is, so I'm recreating the numpy arrays from the available txt file.

In [None]:
"""
np.loadtext and np.genfromtxt handle special characters poorly (most likely quotation marks), and fail
to skip over columns even when they are excluded using the usecols parameter. This causes rows to be wrongly delimited

The workaround, in the following cell, uses pandas.read_csv to work around this problem, by specifying an obscure quotechar
parameter, so that quotation marks and other characters are not mistaken for quotechars, interrupting parsing.
"""
wordvectors = np.genfromtxt(glove_wordvectors, 
                            usecols=tuple(range(1,51)), 
                            encoding='UTF=8', 
                            dtype=None)

In [5]:
wordvectors_df = pd.read_csv(glove_wordvectors, names=list(range(0,51)), sep=' ', engine='python', quotechar='\x07')

In [6]:
wordvectors_df.shape

(400000, 51)

In [7]:
wordvectors = wordvectors_df.iloc[:,1:51].values
wordlist = list(wordvectors_df.iloc[:,0].values)

In [8]:
wordvectors[np.where(wordlist == 'fried')]

array([], shape=(0, 50), dtype=float64)

In [9]:
count_df= pd.DataFrame([len(x.split(' ')) for x in main_df['event_maintext']], columns=['wordcount'])

In [10]:
count_df.reset_index().groupby('wordcount').agg('count').tail()

Unnamed: 0_level_0,index
wordcount,Unnamed: 1_level_1
137,2
138,1
141,2
153,1
169,1


In [11]:
max_sentence_len = 170
num_dimensions = 50

In [12]:
def get_word_index(word, word_index_list):
    try:
        return word_index_list.index(word.lower())
    except Exception as e:
        return 0

In [13]:
first_sentence = main_df['event_maintext'][0].split(' ')
print(first_sentence)
padding = [0] * (max_sentence_len - len(first_sentence))
first_sentence_integerized = [get_word_index(x, wordlist) for x in first_sentence] + padding
print(first_sentence_integerized)

['The', 'U.S.', 'Commodity', 'Futures', 'Trading', 'Commission', '(CFTC)', 'has', 'sent', 'subpoenas', 'to', 'four', 'crypto-exchanges—Bitstamp,', 'Kraken,', 'ItBit,', 'and', 'Coinbase—demanding', 'answers', 'on', 'the', 'subject', 'of', 'market', 'price', 'distortion.']
[0, 99, 7059, 3081, 857, 627, 0, 31, 688, 19976, 4, 133, 0, 0, 0, 5, 0, 5319, 13, 0, 1698, 3, 211, 626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [14]:
with tf.Session() as sess:
    print(tf.nn.embedding_lookup(wordvectors, first_sentence_integerized).eval().shape)

(170, 50)


In [31]:
def build_ids(sentence):
    lowercase_no_punctuation = ''.join([x.lower() for x in sentence if x not in string.punctuation])
    s_list = lowercase_no_punctuation.split(' ')
    
    padding = [0] * (max_sentence_len - len(sentence))
    sentence_integerized = [get_word_index(x, wordlist) for x in s_list] + padding
    
    return sentence_integerized

In [34]:
l2 = build_ids('What a wonderful world, huh?!')
l2

[102,
 7,
 5205,
 85,
 18364,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]