In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import _pickle as pickle
def save(file, name, folder=""):
    """Pickle an object to ``<name>.pickle``, optionally inside ``folder``.

    Parameters
    ----------
    file : object
        The object to serialise (despite the name, this is not a file handle).
    name : str
        Base file name; the ``.pickle`` extension is appended.
    folder : str, optional
        Directory to write into, given relative to the current working
        directory. The default empty string writes next to the notebook.
        ``open`` does not create directories, so the folder must exist.
    """
    if folder != "":
        path = './' + folder + '/' + name + '.pickle'
    else:
        path = name + '.pickle'
    # A context manager guarantees the handle is flushed and closed, even if
    # dump() raises. The previous ``outfile.close`` (no parentheses) only
    # referenced the method and never actually closed the file.
    with open(path, 'wb') as outfile:
        pickle.dump(file, outfile)
    
def load(name, folder=""):
    """Unpickle and return the object stored in ``<name>.pickle``.

    Parameters
    ----------
    name : str
        Base file name; the ``.pickle`` extension is appended.
    folder : str, optional
        Directory to read from, given relative to the current working
        directory. The default empty string reads from the working directory.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Notes
    -----
    Unpickling can execute arbitrary code embedded in the file, so only use
    this on pickles you created yourself.
    """
    if folder != "":
        path = './' + folder + '/' + name + '.pickle'
    else:
        path = name + '.pickle'
    # A context manager closes the handle even if load() raises. The previous
    # ``outfile.close`` (no parentheses) never actually closed the file.
    with open(path, 'rb') as infile:
        return pickle.load(infile)


# Load the `en_core_web_lg` spaCy pipeline once at module level; the
# lemmatize() helper defined further down calls `nlp` on every text.
import spacy #load spacy
nlp = spacy.load("en_core_web_lg")

In [None]:
# CSV file names, given as paths relative to the working directory.
# NOTE(review): none of these three names is read by the cells visible in
# this notebook (the loading cell passes the literal 'train.csv' instead) --
# confirm whether they are still needed.
train = 'train.csv'
test = 'test.csv'
sub = 'sample_submission.csv'

In [None]:
# Read the training CSV and replace missing values in the three text-like
# columns with the placeholder string 'Unknown'.
df = pd.read_csv('train.csv')

for column in ('keyword', 'location', 'text'):
    df[column] = df[column].fillna('Unknown')

def find_hash(x):
    """Return the lower-cased hashtags found in ``x``.

    The text is split on single spaces only; every non-empty piece that
    starts with ``#`` contributes the remainder of that piece (which may be
    the empty string for a lone ``#``), lower-cased.
    """
    return [piece[1:].lower() for piece in x.split(' ') if piece.startswith('#')]

def lemmatize(x):
    """Return the lower-cased lemma of every token in ``x``.

    ``x`` is passed through the module-level ``nlp`` pipeline and one string
    is produced per token, in token order.
    """
    lowered = []
    for tok in nlp(x):
        lowered.append(tok.lemma_.lower())
    return lowered

def clean(X):
    """Filter a list of token strings, dropping noise tokens.

    Parameters
    ----------
    X : iterable of str
        Token strings (for example the output of ``lemmatize``).

    Returns
    -------
    list of str
        The kept tokens, in their original order. A lone ``'#'`` is replaced
        by the word ``'hashtag'``. Tokens are dropped when they are empty or a
        single character, start with ``'http'``, start with one of the
        characters in ``remove``, or start with a newline.
    """
    remove = ".,: @!;-?&()'"
    Y = []

    for elt in X:
        if elt == '#':
            # The bare hash sign is the only one-character token that is kept.
            Y.append('hashtag')
            continue
        # Checking the length first also protects the elt[0] lookup below from
        # an IndexError on empty strings.
        if len(elt) <= 1:
            continue
        # The earlier test ``elt[:2] == "\n"`` could only match the
        # one-character token "\n", which the length check already drops, so
        # tokens made of several newlines slipped through. Test the prefix
        # instead.
        if elt.startswith('http') or elt[0] in remove or elt.startswith("\n"):
            continue
        Y.append(elt)
    return Y
# Derive two list-valued columns from the raw text: the hashtags, and the
# lemmas after noise filtering.
df['tags'] = df['text'].apply(find_hash)
df['lemmatized'] = df['text'].apply(lemmatize).apply(clean)

In [None]:
# Preview the first rows, including the derived 'tags' and 'lemmatized' columns.
df.head()

In [None]:
# Persist the preprocessed frame to train_refined.pickle; the next cell
# reloads it from there.
save(df, 'train_refined')

In [2]:
# Reload the preprocessed frame from train_refined.pickle.
# NOTE(review): unpickling can run arbitrary code from the file -- only load
# pickles produced by this notebook.
df = load('train_refined')

In [3]:
# Hyper-parameters shared by the tokenizer, padding and embedding cells:
#   MAX_NB_WORDS        -> Tokenizer(num_words=...)
#   MAX_SEQUENCE_LENGTH -> pad_sequences(maxlen=...) and Embedding(input_length=...)
#   EMBEDDING_DIM       -> embedding width; also selects the GloVe file name
MAX_NB_WORDS  = 14000
MAX_SEQUENCE_LENGTH = 35
EMBEDDING_DIM = 300

# Fit the Keras tokenizer on the per-row lemma lists, then turn every row
# into a sequence of integer word ids.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(df['lemmatized'].values)
sequences = tokenizer.texts_to_sequences(df['lemmatized'].values)
# word -> integer id mapping; used later to build the embedding matrix.
word_index = tokenizer.word_index
# Bring every sequence to MAX_SEQUENCE_LENGTH so the rows form one 2-D array.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# Model inputs (X) and the labels from the 'target' column (Y).
X = data
Y = df['target'].values

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows (test_size=0.2); random_state=42 pins the shuffle
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [5]:
import os

GLOVE_DIR = './glove'

# Parse the GloVe text file into a {word: float32 vector} dictionary. Each
# line is whitespace-separated: the first field is the word and the remaining
# fields are its coefficients.
embeddings_index = {}
# ``with`` closes the file even if a line fails to parse; the previous
# explicit f.close() was skipped whenever the loop raised.
with open(os.path.join(GLOVE_DIR, 'glove.6B.'+str(EMBEDDING_DIM)+'d.txt'), encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [6]:
# Build the (len(word_index) + 1, EMBEDDING_DIM) weight matrix: row i receives
# the pretrained vector of the word whose tokenizer id is i. Rows for words
# that have no pretrained vector are left as all-zeros.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    if token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]

In [7]:
from tensorflow.keras.layers import Embedding

# Embedding layer initialised from embedding_matrix and frozen
# (trainable=False).
# NOTE(review): the model cell further down constructs its own Embedding with
# the same arguments and never references `embedding_layer` -- confirm whether
# this cell is still needed.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Activation, Conv1D, MaxPooling1D#CuDNNLSTM

# Architecture settings. `filters` and `kernel_size` are defined here but are
# not used by any layer in this cell.
nhid = 64        # hidden units in each LSTM layer
n_classes = 1    # width of the output layer
filters = 32
kernel_size = (5, 5)

# Frozen embedding initialised from embedding_matrix -> two stacked LSTMs ->
# dense ReLU layer -> sigmoid output of width n_classes.
model = Sequential([
    Embedding(len(word_index) + 1,
              EMBEDDING_DIM,
              weights=[embedding_matrix],
              input_length=MAX_SEQUENCE_LENGTH,
              trainable=False),
    # CuDNNLSTM(nhid, return_sequences=True),
    # CuDNNLSTM(nhid, return_sequences=False),
    LSTM(nhid, return_sequences=True),
    LSTM(nhid, return_sequences=False),
    Dense(100, activation='relu'),
    Dense(n_classes, activation='sigmoid'),
])



In [16]:
# First of two compile/fit runs: this one switches JIT off with set_jit(False);
# a later cell repeats the same steps with set_jit(True).
# NOTE(review): clear_session() is called after `model` has already been
# built, and the execution counts are out of order (this cell shows In [16],
# the model cell In [20]) -- confirm the notebook still works under
# Restart & Run All.
tf.keras.backend.clear_session()
tf.config.optimizer.set_jit(False)

loss_classif     =  'binary_crossentropy'# binary cross-entropy; the model's output layer is a single sigmoid unit
optimizer        =  'adam' # optimizer identifier handed to model.compile
metrics_classif  =  ['accuracy']

model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)

In [17]:
# Train for n_epochs epochs with mini-batches of bs samples and keep the
# returned History object in `history`.
# NOTE(review): (X_test, y_test) serves as validation data here and is also
# the set the F1 score is computed on later -- confirm that no separate
# hold-out set is intended.
bs = 5
n_epochs = 4
history = model.fit(X_train, y_train, batch_size=bs, epochs=n_epochs, validation_data=(X_test, y_test))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [21]:
# Second compile/fit run: identical to the earlier compile cell except that
# JIT is switched on with set_jit(True).
# NOTE(review): clear_session() is called after `model` has already been
# built in the preceding cell -- confirm this ordering is intended.
tf.keras.backend.clear_session()
tf.config.optimizer.set_jit(True)

loss_classif     =  'binary_crossentropy'# binary cross-entropy; the model's output layer is a single sigmoid unit
optimizer        =  'adam' # optimizer identifier handed to model.compile
metrics_classif  =  ['accuracy']

model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)

In [22]:
# Same training call as the earlier fit cell, run after the set_jit(True)
# compile: n_epochs epochs with mini-batches of bs samples.
# NOTE(review): (X_test, y_test) serves as validation data here and is also
# the set the F1 score is computed on later -- confirm that no separate
# hold-out set is intended.
bs = 5
n_epochs = 4
history = model.fit(X_train, y_train, batch_size=bs, epochs=n_epochs, validation_data=(X_test, y_test))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
# Sequential.predict_classes() has been removed from tf.keras. For a model
# ending in a sigmoid unit, the documented replacement is to threshold the
# predicted probabilities at 0.5 and cast to int32.
pred = (model.predict(X_test) > 0.5).astype("int32")
# Flatten the (n_samples, 1) column into a 1-D label vector.
pred = pred.reshape(pred.shape[0])

In [None]:
from sklearn.metrics import f1_score
# F1 score of the predicted labels against the held-out labels; as the last
# expression in the cell, the value is displayed as the cell output.
f1_score(y_test, pred)