In [1]:
import tensorflow as tf
from tensorflow.python.client import device_lib
# Sanity check: print every device TensorFlow reports for this machine.
print(device_lib.list_local_devices())
# Keep only the physical GPUs; the bare `gpus` expression below shows the list
# as the cell output.
gpus = tf.config.experimental.list_physical_devices('GPU')
gpus

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 2444349686822782119
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 15565539082527272732
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 9000867201190247734
physical_device_desc: "device: XLA_GPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 22712103808
locality {
  bus_id: 1
  links {
  }
}
incarnation: 5286263942335954487
physical_device_desc: "device: 0, name: TITAN RTX, pci bus id: 0000:42:00.0, compute capability: 7.5"
]


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from gensim.models import Word2Vec

from green_mood_tracker.training_data import get_raw_data_notebook
from green_mood_tracker.data_cleaning import clean

[nltk_data] Downloading package wordnet to /home/hamish/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## download and clean data

In [3]:
%%time
# Load the raw dataset through the project helper.
# NOTE(review): the meaning of the positional `True` flag is not visible in
# this notebook — confirm against get_raw_data_notebook.
raw_data = get_raw_data_notebook(True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentiment140_final['source'] = 'sentiment140'


CPU times: user 3.02 s, sys: 284 ms, total: 3.3 s
Wall time: 3.35 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  complete_data_binary['polarity'] = complete_data_binary.polarity.map({4:1,0:0})


## sample data

In [4]:
# Draw a reproducible 30,000-row sample (fixed random_state).
# reset_index() without drop=True keeps the original row labels as an
# `index` column in the sampled frame.
data_sample = raw_data.sample(n=30_000,random_state=0).reset_index()

In [5]:
%%time
# Run the project's cleaning helper on the sample; 'text' is presumably the
# name of the column to clean — confirm against data_cleaning.clean.
data_sample_clean = clean(data_sample,'text')
# Preview the first rows of the cleaned frame as the cell output.
data_sample_clean.head()

CPU times: user 16.5 s, sys: 163 ms, total: 16.7 s
Wall time: 16.7 s


Unnamed: 0,index,id,text,polarity,source
0,336834,2014083611,pinch even fought amma hate new goatee sd look...,0,sentiment140
1,622734,2229303528,didnt even get try week,0,sentiment140
2,1016867,1881768019,giving jack quick hair cut,1,sentiment140
3,1090541,1969804803,pirate game sat sun work monday tuesthur work ...,1,sentiment140
4,947493,1823084268,fun night,1,sentiment140


## split data Train/Test

In [6]:
from sklearn.model_selection import train_test_split

# Features: the cleaned text, tokenised into lists of words.
# Word2Vec and embed_sentence both iterate over each sentence expecting one
# token per step; iterating a plain string yields single characters, which
# would silently produce character-level embeddings. Values that are not
# strings (e.g. already tokenised) are passed through untouched.
X = data_sample_clean.text.map(
    lambda text: text.split() if isinstance(text, str) else text
)
# Target: the binary polarity label.
y = data_sample_clean.polarity

In [7]:
# Hold out 30% of the sample as the test set; random_state fixes the shuffle
# so the split is reproducible.
sentence_train, sentence_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)

In [8]:
def plot_loss(history):
    """Plot the training and validation curves recorded during a Keras fit.

    Two figures are shown one after the other: the loss per epoch, then the
    accuracy per epoch, each with a train and a validation line.

    Args:
        history: object whose ``history`` attribute is a dict holding the
            per-epoch lists ``'loss'``, ``'val_loss'``, ``'accuracy'`` and
            ``'val_accuracy'`` (as returned by ``model.fit``).

    Raises:
        KeyError: if one of the four series is missing from ``history.history``.
    """
    # (metric key, figure title, y-axis label). The loss axis is labelled
    # generically: the curve is whatever loss the model was compiled with,
    # not necessarily a mean squared error.
    panels = (
        ('loss', 'Model loss', 'Loss'),
        ('accuracy', 'Model Accuracy', 'Accuracy'),
    )
    for metric, title, ylabel in panels:
        plt.plot(history.history[metric])
        plt.plot(history.history['val_' + metric])
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='best')
        plt.show()

## word2vec

In [9]:
# Learn word vectors from the training sentences only; tokens seen fewer than
# 50 times are dropped (min_count). Word2Vec iterates over each sentence
# expecting tokens — a plain string would be read character by character.
word2vec = Word2Vec(sentences=sentence_train, min_count=50)

# gensim >= 4.0 exposes the vocabulary as `wv.key_to_index` and raises on
# `wv.vocab`; gensim 3.x only has `wv.vocab`. Support both.
_w2v_vocab = getattr(word2vec.wv, 'key_to_index', None)
if _w2v_vocab is None:
    _w2v_vocab = word2vec.wv.vocab
vocab_size = len(_w2v_vocab)

In [10]:
def embed_sentence(word2vec, sentence):
    """Map one tokenised sentence to the vectors of its known words.

    Args:
        word2vec: trained model exposing its vectors as ``word2vec.wv``; the
            lookup uses ``in`` and ``[]`` on that object.
        sentence: iterable of tokens. A plain string is iterated character by
            character, so pass a list of words.

    Returns:
        np.ndarray with one row per in-vocabulary token, in sentence order.
        Out-of-vocabulary tokens are skipped; if nothing is left the result
        is an empty array of shape ``(0,)``.
    """
    vectors = word2vec.wv
    # Membership on the keyed vectors works on gensim 3.x and 4.x alike,
    # whereas `wv.vocab` was removed in gensim 4.0.
    return np.array([vectors[word] for word in sentence if word in vectors])
        

In [11]:
def embedding(word2vec, sentences):
    """Embed every sentence of a collection.

    Args:
        word2vec: model forwarded unchanged to ``embed_sentence``.
        sentences: iterable of sentences.

    Returns:
        list with one ``embed_sentence`` result per input sentence, in the
        same order.
    """
    return [embed_sentence(word2vec, sentence) for sentence in sentences]

In [12]:
def embedding_pipeline(word2vec, X):
    """Turn sentences into one padded float32 array of word vectors.

    Args:
        word2vec: model forwarded to ``embedding``.
        X: iterable of sentences.

    Returns:
        The output of ``pad_sequences`` applied to the embedded sentences,
        padded at the end of each sequence (``padding='post'``).
    """
    # Sentences -> list of per-sentence vector arrays.
    embedded_sentences = embedding(word2vec, X)
    # Pad after the last real vector so all sequences share one length.
    return pad_sequences(embedded_sentences, dtype='float32', padding='post')

In [13]:
# Embed and pad each split separately. No maxlen is passed to pad_sequences,
# so each array is padded to its own longest sequence and the two arrays may
# differ in their time dimension.
X_train_pad_w2v = embedding_pipeline(word2vec, sentence_train.values)
X_test_pad_w2v = embedding_pipeline(word2vec, sentence_test.values)

In [14]:
def init_model_w2v():
    """Build and compile the GRU binary classifier.

    Returns:
        A compiled ``Sequential`` model: Masking -> GRU(32, tanh) ->
        Dense(64, relu) -> Dense(1, sigmoid), using binary cross-entropy
        loss, the rmsprop optimizer and the accuracy metric.
    """
    model = Sequential([
        # Masking with its default settings, applied before the recurrent layer.
        layers.Masking(),
        layers.GRU(units=32, activation='tanh'),
        layers.Dense(64, activation='relu'),
        # Single sigmoid unit: probability of the positive class.
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(
        loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'],
    )
    return model

### Train the GRU classifier on word2vec embeddings learned from the internal data

In [None]:
%%time
model_w2v = init_model_w2v()

# Stop once validation accuracy has not improved for 5 epochs and restore the
# best weights seen so far.
es = EarlyStopping(patience=5,restore_best_weights=True,monitor='val_accuracy')
# validation_split=2/7 of the 70% training split is 20% of the sample, which
# leaves 50% of the sample to fit on.
# `use_multiprocessing` is intentionally not passed: it only applies to
# generator / Sequence inputs (the data here are in-memory arrays) and
# Keras 3 no longer accepts it in fit().
history_w2v = model_w2v.fit(X_train_pad_w2v, y_train,
          validation_split=2/7,
          epochs=50,
          batch_size=64,
          verbose=1,
          callbacks=[es])

Epoch 1/50

In [None]:
# The model is compiled with a single metric ('accuracy'), so evaluate()
# returns [loss, accuracy] and index 1 is the accuracy on the test split.
print('accuracy', model_w2v.evaluate(X_test_pad_w2v,y_test)[1])
# Show the loss and accuracy curves recorded during training.
plot_loss(history_w2v)

In [None]:
import joblib

# Persist the trained model to the current working directory.
# NOTE(review): joblib pickles the object, and Keras models are not reliably
# picklable across TensorFlow versions — consider model_w2v.save(...) and
# confirm what the loading code expects before changing the format.
filename = 'model_w2v_binary.joblib'
joblib.dump(model_w2v, filename)