In [1]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print('gpu', gpu)
    tf.config.experimental.set_memory_growth(gpu, True)
    print('memory growth:' , tf.config.experimental.get_memory_growth(gpu))

gpu PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
memory growth: True


In [2]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
submit_sample_data = pd.read_csv('sample_submission.csv')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/dawidkubicki/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Little bit of analyse and preprocessing

In [3]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


#### Split to X_train, y_train and X_test

In [4]:
y_train = train_data["target"]
X_train = train_data.drop(columns=["target", "id", "keyword", "location"])
X_test = test_data.drop(columns=["id", "keyword", "location"])

In [5]:
X_train.head()

Unnamed: 0,text
0,Our Deeds are the Reason of this #earthquake M...
1,Forest fire near La Ronge Sask. Canada
2,All residents asked to 'shelter in place' are ...
3,"13,000 people receive #wildfires evacuation or..."
4,Just got sent this photo from Ruby #Alaska as ...


### Clean text

In [6]:
def lowercase_text(text):
    return text.lower()

X_train["text"] = X_train["text"].apply(lambda x: lowercase_text(x))
X_test["text"] = X_test["text"].apply(lambda x: lowercase_text(x))

In [7]:
def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

X_train["text"] = X_train["text"].apply(lambda x: clean_text(x))
X_test["text"] = X_test["text"].apply(lambda x: clean_text(x))

#### Lemmatization

In [8]:
lemmatizer = WordNetLemmatizer()
X_train["text"] = X_train["text"].apply(lambda x: lemmatizer.lemmatize(x))
X_test["text"] = X_test["text"].apply(lambda x: lemmatizer.lemmatize(x))

In [9]:
X_test.head()

Unnamed: 0,text
0,just happened a terrible car crash
1,heard about earthquake is different cities sta...
2,there is a forest fire at spot pond geese are ...
3,apocalypse lighting spokane wildfires
4,typhoon soudelor kills in china and taiwan


#### Text preprocessing (Word embeddings)

In [10]:

twitter_features = np.array(X_train["text"])

twitter_labels = np.asarray(y_train).astype('float32').reshape((-1,1))


train_ds = tf.data.Dataset.from_tensor_slices((twitter_features, twitter_labels)).batch(128)

In [11]:
for example, label in train_ds.take(1):
    print('texts: ', example.numpy()[:3])
    print('\n')
    print('label: ', label.numpy()[:3])

texts:  [b'our deeds are the reason of this earthquake may allah forgive us all'
 b'forest fire near la ronge sask canada'
 b'all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected']


label:  [[1.]
 [1.]
 [1.]]


#### Create a text encoder

In [12]:
VOCAB_SIZE=1000
encoder = TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(train_ds.map(lambda text, label: text))

In [13]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is', 'for',
       'on', 'you', 'my', 'with', 'it', 'that', 'at', 'by', 'this'],
      dtype='<U17')

### Train the model

In [14]:
# #model class

# class TwitterModel(tf.keras.Model):
#     def __init__(self):
#         super(TwitterModel, self).__init__()
#         self.en = encoder
#         self.emb = tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 64, mask_zero=True)
#         self.bid1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))
#         self.bid2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True))
#         self.d1 = tf.keras.layers.Dense(64, activation='relu')
#         self.drop = tf.keras.layers.Dropout(0.5)
#         self.d2 = tf.keras.layers.Dense(1, activation='sigmoid')
        
#     def call(self, x):
#         x = self.en(x)
#         x = self.emb(x)
#         x = self.bid1(x)
#         x = self.bid2(x)
#         x = self.drop(x)
#         return self.d2(x)
    
# model = TwitterModel()    

In [15]:
# #optimizer and loss function
# loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# optimizer = tf.keras.optimizers.RMSprop()

In [16]:
# train_loss = tf.keras.metrics.Mean(name='train_loss')
# train_accuracy = tf.keras.metrics.BinaryAccuracy(name="train_accuracy")

In [17]:
# def train_step(text, labels):
#     with tf.GradientTape() as tape:
#         prediction = model(text, training=True)
#         loss = loss_object(labels, prediction)
        
#     gradients = tape.gradient(loss, model.trainable_variables)
#     optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
#     train_loss(loss)
#     train_accuracy(labels, prediction)

In [18]:
# EPOCHS = 10

# for epoch in range(EPOCHS):
#     train_loss.reset_states()
#     train_accuracy.reset_states()
    
#     for text, labels in train_ds:
#         train_step(text, labels)
        
#     print(f'Epoch: {epoch+1}', f'Loss: {train_loss.result()}, ', f'Accuracy: {train_accuracy.result() * 100}')