In [117]:
import tensorflow_hub as hub
import tensorflow as tf

# Load the ELMo v2 module from TensorFlow Hub once, at notebook level; run()
# reads this global when it computes embeddings.
# NOTE(review): trainable=True only matters when fine-tuning; run() only does
# forward passes (no optimizer is built) — confirm whether it is needed.
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

In [118]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
# Widen pandas' column display to 200 characters so long text columns are
# less likely to be truncated when a DataFrame is shown.
pd.set_option('display.max_colwidth', 200)

In [119]:
def _clean_reviews(reviews):
    """Normalise raw review text before it is embedded.

    Steps, in order: strip URLs, drop a fixed set of punctuation characters,
    lowercase, replace digits with spaces, collapse runs of whitespace.

    Args:
        reviews: pandas Series of review strings.

    Returns:
        pandas Series of cleaned strings with the same index as ``reviews``.
    """
    # Built once instead of once per character. Apostrophes, commas and full
    # stops are not in this list, so they survive cleaning.
    punctuation = set('!"#$%&()*+-/:;<=>?@[\\]^_`{|}~')

    # remove URLs
    cleaned = reviews.apply(lambda text: re.sub(r'http\S+', '', text))

    # remove punctuation marks
    cleaned = cleaned.apply(
        lambda text: ''.join(ch for ch in text if ch not in punctuation))

    # convert text to lowercase
    cleaned = cleaned.str.lower()

    # remove numbers; regex=True is explicit because newer pandas versions
    # treat the pattern as a literal string by default, which would silently
    # leave every digit in place.
    cleaned = cleaned.str.replace("[0-9]", " ", regex=True)

    # collapse whitespace
    return cleaned.apply(lambda text: ' '.join(text.split()))


def _elmo_vectors(texts):
    """Return one averaged ELMo vector per entry of ``texts``.

    Relies on the notebook-level ``elmo`` hub module and ``tf`` import, and
    opens a fresh ``tf.Session`` for every call.

    Args:
        texts: pandas Series of strings (one batch).

    Returns:
        The evaluated result of ``tf.reduce_mean(embeddings, 1)``.
    """
    embeddings = elmo(texts.tolist(), signature="default", as_dict=True)["elmo"]
    print('---')  # one progress marker per batch
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        # average of ELMo features over axis 1
        # (presumably the token axis — confirm against the module docs)
        return sess.run(tf.reduce_mean(embeddings, 1))


def run(name, load = False):
    """Embed the reviews of one aspect with ELMo and fit a classifier.

    Reads ``Results/<name>.csv`` (columns ``review`` and ``label`` are used),
    obtains one ELMo vector per review, holds out 20% of the rows and fits a
    logistic regression on the rest.

    Args:
        name: Aspect name; selects both the input CSV and the cache file
            ``elmo_train_<name>.pickle`` in the working directory.
        load: If False (default) the embeddings are recomputed and the cache
            file is overwritten. If True the embeddings are read from the
            existing cache file and no ELMo computation happens.

    Returns:
        Tuple ``(yvalid, preds_valid)``: the held-out labels and the
        classifier's predictions for them.

    Notes:
        The original body loaded a spaCy model and defined a ``lemmatization``
        helper that was never called; both were removed, so the text is
        (still) embedded without lemmatization.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    # read data
    train = pd.read_csv("Results/"+name+".csv")
    cache_path = "elmo_train_"+ name +".pickle"

    if not load:
        train['clean_tweet'] = _clean_reviews(train['review'])

        # Embed in batches of 100 rows to bound the size of each ELMo call.
        batches = [train['clean_tweet'].iloc[start:start+100]
                   for start in range(0, train.shape[0], 100)]

        # Extract ELMo embeddings
        elmo_train_new = np.concatenate(
            [_elmo_vectors(batch) for batch in batches], axis = 0)

        # save elmo_train_new; the context manager closes the file even if
        # pickling fails
        with open(cache_path, "wb") as pickle_out:
            pickle.dump(elmo_train_new, pickle_out)

    print(cache_path)
    if load:
        # SECURITY: pickle.load can execute arbitrary code; only point this
        # at cache files written by this notebook.
        with open(cache_path, "rb") as pickle_in:
            elmo_train_new = pickle.load(pickle_in)

    xtrain, xvalid, ytrain, yvalid = train_test_split(elmo_train_new,
                                                      train['label'],
                                                      random_state=42,
                                                      test_size=0.2)

    lreg = LogisticRegression()
    lreg.fit(xtrain, ytrain)
    preds_valid = lreg.predict(xvalid)
    return yvalid, preds_valid

In [120]:
from sklearn.metrics import classification_report, confusion_matrix

In [121]:
# FOOD aspect: recompute the embeddings (load=False), then report per-class
# metrics. y holds the held-out labels, Y the classifier's predictions.
y, Y = run('FOOD', False)
print(classification_report(y, Y))

---
---
elmo_train_FOOD.pickle




              precision    recall  f1-score   support

    negative       1.00      0.29      0.44         7
    positive       0.85      1.00      0.92        29

   micro avg       0.86      0.86      0.86        36
   macro avg       0.93      0.64      0.68        36
weighted avg       0.88      0.86      0.83        36



In [122]:
# AMBIENCE aspect: recompute the embeddings (load=False), then report
# per-class metrics. y holds the held-out labels, Y the predictions.
y, Y = run('AMBIENCE', False)
print(classification_report(y, Y))

---
---
elmo_train_AMBIENCE.pickle




              precision    recall  f1-score   support

    negative       1.00      0.20      0.33         5
    positive       0.81      1.00      0.89        17

   micro avg       0.82      0.82      0.82        22
   macro avg       0.90      0.60      0.61        22
weighted avg       0.85      0.82      0.77        22



In [123]:
# SERVICE aspect: recompute the embeddings (load=False), then report
# per-class metrics. y holds the held-out labels, Y the predictions.
y, Y = run('SERVICE', False)
print(classification_report(y, Y))

---
---
elmo_train_SERVICE.pickle




              precision    recall  f1-score   support

    negative       1.00      0.40      0.57        10
    positive       0.71      1.00      0.83        15

   micro avg       0.76      0.76      0.76        25
   macro avg       0.86      0.70      0.70        25
weighted avg       0.83      0.76      0.73        25



In [124]:
# PRICES aspect: recompute the embeddings (load=False), then report
# per-class metrics. y holds the held-out labels, Y the predictions.
y, Y = run('PRICES', False)
print(classification_report(y, Y))

---
elmo_train_PRICES.pickle




              precision    recall  f1-score   support

    negative       0.67      1.00      0.80         6
    positive       1.00      0.70      0.82        10

   micro avg       0.81      0.81      0.81        16
   macro avg       0.83      0.85      0.81        16
weighted avg       0.88      0.81      0.81        16

