In [1]:
import tensorflow_hub as hub
import tensorflow as tf

elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

  from ._conv import register_converters as _register_converters


In [2]:
# just a random sentence
x = ["Roasted ants are a popular snack in Columbia"]

# Extract ELMo features 
embeddings = elmo(x, signature="default", as_dict=True)["elmo"]

embeddings.shape

TensorShape([Dimension(1), Dimension(8), Dimension(1024)])

In [1]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
pd.set_option('display.max_colwidth', 200)

In [61]:
def run(name, load = False):
    # read data
    train = pd.read_csv("train-"+name+".csv")
    test = pd.read_csv("test-"+name+".csv")

    if load:
        train.shape, test.shape
        train['label'].value_counts(normalize = True)
        # remove URL's from train and test
        train['clean_tweet'] = train['tweet'].apply(lambda x: re.sub(r'http\S+', '', x))

        test['clean_tweet'] = test['tweet'].apply(lambda x: re.sub(r'http\S+', '', x))

        # remove punctuation marks
        punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'

        train['clean_tweet'] = train['clean_tweet'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))
        test['clean_tweet'] = test['clean_tweet'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))

        # convert text to lowercase
        train['clean_tweet'] = train['clean_tweet'].str.lower()
        test['clean_tweet'] = test['clean_tweet'].str.lower()

        # remove numbers
        train['clean_tweet'] = train['clean_tweet'].str.replace("[0-9]", " ")
        test['clean_tweet'] = test['clean_tweet'].str.replace("[0-9]", " ")

        # remove whitespaces
        train['clean_tweet'] = train['clean_tweet'].apply(lambda x:' '.join(x.split()))
        test['clean_tweet'] = test['clean_tweet'].apply(lambda x: ' '.join(x.split()))

        # import spaCy's language model
        nlp = spacy.load('en', disable=['parser', 'ner'])

        # function to lemmatize text
        def lemmatization(texts):
            output = []
            for i in texts:
                s = [token.lemma_ for token in nlp(i)]
                output.append(' '.join(s))
            return output
        def elmo_vectors(x):
            embeddings = elmo(x.tolist(), signature="default", as_dict=True)["elmo"]
            print('---')
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                sess.run(tf.tables_initializer())
                # return average of ELMo features
                return sess.run(tf.reduce_mean(embeddings,1))
        list_train = [train[i:i+100] for i in range(0,train.shape[0],100)]
        list_test = [test[i:i+100] for i in range(0,test.shape[0],100)]

        # Extract ELMo embeddings
        elmo_train = [elmo_vectors(x['clean_tweet']) for x in list_train]
        elmo_test = [elmo_vectors(x['clean_tweet']) for x in list_test]

        elmo_train_new = np.concatenate(elmo_train, axis = 0)
        elmo_test_new = np.concatenate(elmo_test, axis = 0)

        # save elmo_train_new
        pickle_out = open("elmo_train_03032019.pickle","wb")
        pickle.dump(elmo_train_new, pickle_out)
        pickle_out.close()

        # save elmo_test_new
        pickle_out = open("elmo_test_03032019.pickle","wb")
        pickle.dump(elmo_test_new, pickle_out)
        pickle_out.close()
    
    # load elmo_train_new
    pickle_in = open("elmo_train_03032019.pickle", "rb")
    elmo_train_new = pickle.load(pickle_in)

    # load elmo_train_new
    pickle_in = open("elmo_test_03032019.pickle", "rb")
    elmo_test_new = pickle.load(pickle_in)

    from sklearn.model_selection import train_test_split

    xtrain, xvalid, ytrain, yvalid = train_test_split(elmo_train_new, 
                                                      train['label'],  
                                                      random_state=42, 
                                                      test_size=0.2)

    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import f1_score

    lreg = LogisticRegression()
    lreg.fit(xtrain, ytrain)
    preds_valid = lreg.predict(xvalid)
    f1_score(yvalid, preds_valid)
    print(f1_score)
#     return yvalid, preds_valid

In [62]:
run('p')

[[-3.9126039e-02  5.2388992e-02  4.6263330e-02 ...  8.5774278e-03
  -4.8353793e-03  6.7811199e-02]
 [-1.0902258e-01 -2.1525523e-02 -5.2172270e-02 ... -6.1512440e-02
  -6.8826802e-02 -4.0342208e-02]
 [-7.3851429e-02  1.4869389e-02  3.8856074e-02 ... -1.6723890e-02
  -8.2654923e-02  1.2227881e-04]
 ...
 [-5.7972725e-02 -7.6848440e-02 -1.4128958e-01 ...  4.3946747e-02
  -6.2106684e-02  7.8684591e-02]
 [-1.6908251e-01 -4.5111772e-01 -2.1080220e-02 ... -8.8675328e-02
   2.2013360e-01  2.4340060e-01]
 [-5.7666406e-02 -6.4698242e-02  8.1019267e-02 ...  5.5339698e-02
  -3.3359792e-02  3.2166727e-02]]




<function f1_score at 0x0000022D46681E18>


In [49]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
def run_test(name):
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")
    
    # load elmo_train_new
    pickle_in = open("elmo_train_03032019.pickle", "rb")
    elmo_train_new = pickle.load(pickle_in)

    # load elmo_train_new
    pickle_in = open("elmo_test_03032019.pickle", "rb")
    elmo_test_new = pickle.load(pickle_in)
    
    from sklearn.cluster import KMeans
    from sklearn.model_selection import train_test_split

    xtrain, xvalid, ytrain, yvalid = train_test_split(elmo_train_new, 
                                                      train['label'],  
                                                      random_state=42, 
                                                      test_size=0.2)

    from sklearn.metrics import f1_score

    lreg = KMeans(n_clusters=5)
    lreg.fit(elmo_train_new)
    preds_valid = lreg.predict(elmo_test_new)
    return preds_valid

In [50]:
run_test('v')

array([4, 1, 4, 4, 2, 3, 2, 1, 0, 1, 1, 3, 3, 1, 4, 4, 4, 3, 4, 4, 1, 4,
       1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 1, 3, 4, 4, 4, 4, 4, 4, 4, 4, 1,
       4, 4, 4, 1, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 1, 3, 1, 4, 4, 4, 4, 1,
       1, 4, 4, 1, 4, 4, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 1,
       4, 4, 0, 1, 1, 1, 1, 4, 4, 0, 4, 1, 4, 4, 1, 4, 4, 3, 0, 4, 0, 4,
       4, 4, 1, 4, 4, 1, 4, 0, 1, 1, 3, 4, 4, 1, 0, 0, 1, 1, 4, 1, 4, 4,
       4, 0, 1, 2, 4, 1, 1, 4, 4, 4, 1, 1, 1, 4, 0, 1, 1, 1, 4, 3, 1, 4,
       4, 4, 4, 1, 4, 4, 4, 4, 1, 4, 1, 4, 1, 4, 0, 4, 2, 3, 4, 1, 4, 4,
       4, 4, 0, 3, 4, 1, 4, 1, 1, 4, 4, 4, 4, 0, 4, 3, 3, 4, 3, 4, 4, 4,
       3, 3, 1, 1, 0, 4, 1, 3, 1, 1, 1, 3, 4, 4, 4, 1, 2, 4, 1, 3, 3, 4,
       0, 0, 3, 0, 0, 4, 2, 0, 3, 0, 4, 4, 0, 0, 1, 4, 0, 4, 1, 4, 4, 4,
       4, 2, 0, 4, 4, 0, 4, 1, 1, 4, 4, 4, 4, 4, 4, 1, 4, 1, 1, 3, 4, 1,
       4, 1, 4, 1, 4, 4, 1, 4, 1, 0, 0, 1, 1, 1, 0, 4, 3, 4, 1, 4, 4, 1,
       0, 4, 0, 3, 4, 4, 4, 1, 4, 1, 4, 1, 3, 0, 1,

In [38]:
y, Y = run_test('v')
print(classification_report(y, Y))
# confusion_matrix(y, Y)

ValueError: too many values to unpack (expected 2)

In [121]:
def elmo_vectors(x):
    embeddings = elmo(x, signature="default", as_dict=True)["elmo"]
    print('---')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        # return average of ELMo features
        return sess.run(tf.reduce_mean(embeddings,1))

In [131]:
a = elmo_vectors(['i', 'love', 'you'])

---


In [132]:
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
tokens_input = [["the", "cat", "is", "on", "the", "mat"],
                ["dogs", "are", "in", "the", "fog", ""]]
tokens_length = [6, 5]
embeddings = elmo(
    inputs={
        "tokens": tokens_input,
        "sequence_len": tokens_length
    },
    signature="tokens",
    as_dict=True)["elmo"]

In [178]:
def vektor(x):
    elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
    embeddings = elmo(x,
        signature="default",
        as_dict=True)["elmo"]

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        # return average of ELMo features
        return sess.run(tf.reduce_mean(embeddings,1))

In [248]:
x = ["A battery is a device consisting of one or more electrochemical cells with external connections provided to power electrical devices such as flashlights, smartphones, and electric cars. When a battery is supplying electric power, its positive terminal is the cathode and its negative terminal is the anode. The terminal marked negative is the source of electrons that will flow through an external electric circuit to the positive terminal. When a battery is connected to an external electric load, a redox reaction converts high-energy reactants to lower-energy products, and the free-energy difference is delivered to the external circuit as electrical energy. Historically the term battery specifically referred to a device composed of multiple cells, however the usage has evolved to include devices composed of a single cell."]
x = vektor(x)

In [249]:
y = ["A display device is an output device for presentation of information in visual or tactile form (the latter used for example in tactile electronic displays for blind people). When the input information that is supplied has an electrical signal the display is called an electronic display."]
y = vektor(y)

In [250]:
z = ["A central processing unit (CPU), also called a central processor or main processor, is the electronic circuitry within a computer that carries out the instructions of a computer program by performing the basic arithmetic, logic, controlling, and input/output (I/O) operations specified by the instructions. The computer industry has used the term central processing unit at least since the early 1960s. Traditionally, the term CPU refers to a processor, more specifically to its processing unit and control unit (CU), distinguishing these core elements of a computer from external components such as main memory and I/O circuitry."]
z = vektor(z)

In [195]:
from scipy import spatial
def cosine(x, y):
    dataSetI = x
    dataSetII = y
    return 1 - spatial.distance.cosine(dataSetI, dataSetII)

In [251]:
word1 = ['power']
w1 = vektor(word1)

In [252]:
word2 = ['screen']
w2 = vektor(word2)

In [253]:
import  numpy as np
def get_score(base, word):
    k = []
    for a in base:
        k.append(cosine(a, word))
    return np.mean(k)

In [2]:
# load elmo_train_new
pickle_in = open("elmo_test_03032019.pickle", "rb")
elmo_test_new = pickle.load(pickle_in)

In [4]:
elmo_test_new[0]

array([-0.03912604,  0.05238899,  0.04626333, ...,  0.00857743,
       -0.00483538,  0.0678112 ], dtype=float32)