In [2]:
! pip install -U bert-serving-server bert-serving-client
# https://github.com/mtala3t/Identify-the-Sentiments-AV-NLP-Contest

Collecting bert-serving-server
[?25l  Downloading https://files.pythonhosted.org/packages/b0/bd/cab677bbd0c5fb08b72e468371d2bca6ed9507785739b4656b0b5470d90b/bert_serving_server-1.10.0-py3-none-any.whl (61kB)
[K     |█████▎                          | 10kB 15.6MB/s eta 0:00:01[K     |██████████▋                     | 20kB 1.8MB/s eta 0:00:01[K     |████████████████                | 30kB 2.3MB/s eta 0:00:01[K     |█████████████████████▎          | 40kB 1.7MB/s eta 0:00:01[K     |██████████████████████████▋     | 51kB 1.9MB/s eta 0:00:01[K     |███████████████████████████████▉| 61kB 2.3MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 2.2MB/s 
[?25hCollecting bert-serving-client
  Downloading https://files.pythonhosted.org/packages/1f/09/aae1405378a848b2e87769ad89a43d6d71978c4e15534ca48e82e723a72f/bert_serving_client-1.10.0-py2.py3-none-any.whl
Collecting GPUtil>=1.3.0
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378c

In [0]:
import logging
import pickle
import re

import numpy as np
import pandas as pd
import spacy
from bert_serving.client import BertClient
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [4]:
# Mount Google Drive at /content/drive; the CSV files read later live under "My Drive" there.
# This call is interactive: it prompts for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Show up to 200 characters per column so long tweets are not cut off when displayed.
pd.set_option('display.max_colwidth', 200)

# Log INFO and above as "<time> : <level> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)


In [0]:
# read data: the labelled training tweets and the test tweets, both stored on the mounted Drive
train_csv = "/content/drive/My Drive/sentiment_ELMo/train_2kmZucJ.csv"
test_csv = "/content/drive/My Drive/sentiment_ELMo/test_oJQbWVk.csv"

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)


In [7]:
# Sanity-check the loaded frames: (rows, columns) of each, the class counts of 'label',
# and the first five training rows.
print (train.shape, test.shape)
print (train['label'].value_counts())
print (train.head())


(7920, 3) (1953, 2)
0    5894
1    2026
Name: label, dtype: int64
   id  ...                                                                                                                                tweet
0   1  ...     #fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1   2  ...  Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2   3  ...          We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3   4  ...                     I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4   5  ...         What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!

[5 rows x 3 columns]


In [0]:
# data cleaning: strip URLs ("http" followed by any run of non-whitespace) from train and test,
# writing the result to a new 'clean_tweet' column so the raw 'tweet' column is kept.
url_pattern = re.compile(r'http\S+')
for frame in (train, test):
    frame['clean_tweet'] = frame['tweet'].apply(lambda text: url_pattern.sub('', text))

In [0]:
# remove twitter handles (@user)
# The pattern is a raw string so that \w reaches the regex engine as written;
# in a normal string literal "\w" is an invalid escape sequence and triggers a warning.
train['clean_tweet'] = train['clean_tweet'].apply(lambda x: re.sub(r"@[\w]*", '', x))
test['clean_tweet'] = test['clean_tweet'].apply(lambda x: re.sub(r"@[\w]*", '', x))
  

In [0]:
# remove punctuation marks
punctuation = '.,\'!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'

# Build the deletion table once, instead of rebuilding set(punctuation) for every character.
# str.translate then drops every character listed in `punctuation` in a single pass.
punctuation_table = str.maketrans('', '', punctuation)

train['clean_tweet'] = train['clean_tweet'].apply(lambda x: x.translate(punctuation_table))
test['clean_tweet'] = test['clean_tweet'].apply(lambda x: x.translate(punctuation_table))


In [0]:
# convert text to lowercase (same step for both frames)
for frame in (train, test):
    frame['clean_tweet'] = frame['clean_tweet'].str.lower()

In [0]:
# remove numbers: every digit becomes a space (runs of spaces are collapsed in the next step)
# regex=True is passed explicitly: "[0-9]" is a character class, and pandas 2.0 changed the
# default of Series.str.replace to a literal match, which would leave all digits in place.
train['clean_tweet'] = train['clean_tweet'].str.replace("[0-9]", " ", regex=True)
test['clean_tweet'] = test['clean_tweet'].str.replace("[0-9]", " ", regex=True)

In [0]:
# remove whitespaces: split on any whitespace run and re-join with single spaces,
# which also trims leading and trailing whitespace
def collapse_whitespace(text):
    return ' '.join(text.split())

for frame in (train, test):
    frame['clean_tweet'] = frame['clean_tweet'].apply(collapse_whitespace)

In [0]:
# Normalize the words to their base form
# import spaCy's language model
# The 'en' shortcut link was removed in spaCy 3, so the model is loaded by package name.
# NOTE(review): 'en' conventionally pointed at en_core_web_sm; confirm that was the model used.
# The parser and NER components are disabled because only lemmas are read from the tokens.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


In [0]:
# function to lemmatize text
def lemmatization(texts):
    """Return one string per input text: the space-joined lemmas of its tokens.

    Each text is passed through the module-level pipeline ``nlp``; the ``lemma_``
    attribute of every resulting token is collected in order.
    """
    return [
        ' '.join(token.lemma_ for token in nlp(text))
        for text in texts
    ]

In [0]:
# Replace each cleaned tweet with its lemmatized form, for both frames.
for frame in (train, test):
    frame['clean_tweet'] = lemmatization(frame['clean_tweet'])


In [0]:
# Extract BERT embeddings function
def bert_vectors(x):
    """Encode a collection of sentences through a bert-as-service server.

    Parameters
    ----------
    x : object exposing ``tolist()``
        Called in this notebook with a pandas Series of cleaned tweets.

    Returns
    -------
    Whatever ``BertClient.encode`` returns for the list of sentences.
    """
    # BertClient() is created without arguments, so it connects to the client's default
    # server address; a bert-serving server must already be running for this to return.
    # NOTE(review): encode() appears to reject empty strings — confirm no tweet is empty
    # after cleaning.
    # The context manager closes the connection on exit, even if encode() raises;
    # previously every call left a client open.
    with BertClient() as bc:
        return bc.encode(x.tolist())


In [0]:
# Extract BERT embeddings for the cleaned train and test tweets
train_sentences = train['clean_tweet']
test_sentences = test['clean_tweet']

bert_train = bert_vectors(train_sentences)
bert_test = bert_vectors(test_sentences)


In [0]:
# save bert_train_new
# The with-block closes the file even if pickle.dump raises part-way through.
with open("bert_train_03032019.pickle", "wb") as pickle_out:
    pickle.dump(bert_train, pickle_out)

In [0]:
# save bert_test_new
# The with-block closes the file even if pickle.dump raises part-way through.
with open("bert_test_03032019.pickle", "wb") as pickle_out:
    pickle.dump(bert_test, pickle_out)

In [0]:
# load bert_train_new (the training-set BERT embeddings pickled by this notebook)
# The with-block closes the file handle once loading finishes; it was previously left open.
with open("bert_train_03032019.pickle", "rb") as pickle_in:
    bert_train_new = pickle.load(pickle_in)


In [0]:
# load bert_test_new (the test-set BERT embeddings pickled by this notebook)
# The with-block closes the file handle once loading finishes; it was previously left open.
with open("bert_test_03032019.pickle", "rb") as pickle_in:
    bert_test_new = pickle.load(pickle_in)


In [0]:
# Hold out 20% of the labelled embeddings for validation; the fixed seed makes the split repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    bert_train_new,
    train['label'],
    test_size=0.2,
    random_state=42,
)
print(ytrain.shape, yvalid.shape)

In [0]:
# Fit a logistic-regression classifier, with library-default settings, on the training split.
lreg = LogisticRegression()
lreg.fit(X=xtrain, y=ytrain)

In [0]:
# Score the held-out split: F1 computed with f1_score's default settings.
preds_valid = lreg.predict(xvalid)
valid_f1 = f1_score(yvalid, preds_valid)
print(valid_f1)

In [0]:
# make predictions on test set
# bert_test_new holds the test-set embeddings loaded from bert_test_03032019.pickle.
preds_test = lreg.predict(bert_test_new)

In [0]:
# prepare submission dataframe: one row per test tweet, pairing its id with the predicted label
submission_columns = {'id': test['id'], 'label': preds_test}
sub = pd.DataFrame(submission_columns)

# write predictions to a CSV file (index=False keeps the row index out of the file)
submission_path = "sub_lreg_bert.csv"
sub.to_csv(submission_path, index=False)