In [1]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print('gpu', gpu)
    tf.config.experimental.set_memory_growth(gpu, True)
    print('memory growth:' , tf.config.experimental.get_memory_growth(gpu))

gpu PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
memory growth: True


In [2]:
#https://www.tensorflow.org/tutorials/text/text_classification_rnn

In [3]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from  sklearn.preprocessing import OrdinalEncoder
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
submit_sample_data = pd.read_csv('sample_submission.csv')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/dawidkubicki/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Little bit of analyse and preprocessing

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


#### Deal with NaN

In [6]:
train_keyword_nan = train_data["keyword"].isna().sum()
train_location_nan = train_data["location"].isna().sum()
test_keyword_nan = train_data["keyword"].isna().sum()
test_location_nan = train_data["location"].isna().sum()
print("Train Keyword NaN's: {} \nTrain Location NaN's: {}\nTrain Keyword NaN's: {} \nTrain Location NaN's: {}".format(train_keyword_nan, train_location_nan, test_keyword_nan, test_location_nan))

Train Keyword NaN's: 61 
Train Location NaN's: 2533
Train Keyword NaN's: 61 
Train Location NaN's: 2533


In [7]:
train_data['keyword'].fillna('not-defined', inplace=True)
train_data['location'].fillna('not-defined', inplace=True)
test_data['keyword'].fillna('not-defined', inplace=True)
test_data['location'].fillna('not-defined', inplace=True)

In [8]:
train_keyword_nan = train_data["keyword"].isna().sum()
train_location_nan = train_data["location"].isna().sum()
test_keyword_nan = train_data["keyword"].isna().sum()
test_location_nan = train_data["location"].isna().sum()
print("Train Keyword NaN's: {} \nTrain Location NaN's: {}\nTrain Keyword NaN's: {} \nTrain Location NaN's: {}".format(train_keyword_nan, train_location_nan, test_keyword_nan, test_location_nan))

Train Keyword NaN's: 0 
Train Location NaN's: 0
Train Keyword NaN's: 0 
Train Location NaN's: 0


In [9]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,not-defined,not-defined,Our Deeds are the Reason of this #earthquake M...,1
1,4,not-defined,not-defined,Forest fire near La Ronge Sask. Canada,1
2,5,not-defined,not-defined,All residents asked to 'shelter in place' are ...,1
3,6,not-defined,not-defined,"13,000 people receive #wildfires evacuation or...",1
4,7,not-defined,not-defined,Just got sent this photo from Ruby #Alaska as ...,1


In [10]:
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,not-defined,not-defined,Just happened a terrible car crash
1,2,not-defined,not-defined,"Heard about #earthquake is different cities, s..."
2,3,not-defined,not-defined,"there is a forest fire at spot pond, geese are..."
3,9,not-defined,not-defined,Apocalypse lighting. #Spokane #wildfires
4,11,not-defined,not-defined,Typhoon Soudelor kills 28 in China and Taiwan


#### Split to X_train, y_train and X_test

In [11]:
y_train = train_data["target"]
X_train = train_data.drop(columns=["target", "id"])
X_test = test_data

### Clean text

In [12]:
def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

X_train["text"] = X_train["text"].apply(lambda x: clean_text(x))
X_test["text"] = X_test["text"].apply(lambda x: clean_text(x))

#### Lemmatization

In [13]:
lemmatizer = WordNetLemmatizer()
X_train["text"] = X_train["text"].apply(lambda x: lemmatizer.lemmatize(x))
X_test["text"] = X_test["text"].apply(lambda x: lemmatizer.lemmatize(x))

In [14]:
X_train.head()

Unnamed: 0,keyword,location,text
0,not-defined,not-defined,our deeds are the reason of this earthquake ma...
1,not-defined,not-defined,forest fire near la ronge sask canada
2,not-defined,not-defined,all residents asked to shelter in place are be...
3,not-defined,not-defined,people receive wildfires evacuation orders in...
4,not-defined,not-defined,just got sent this photo from ruby alaska as s...


In [15]:
X_test.head()

Unnamed: 0,id,keyword,location,text
0,0,not-defined,not-defined,just happened a terrible car crash
1,2,not-defined,not-defined,heard about earthquake is different cities sta...
2,3,not-defined,not-defined,there is a forest fire at spot pond geese are ...
3,9,not-defined,not-defined,apocalypse lighting spokane wildfires
4,11,not-defined,not-defined,typhoon soudelor kills in china and taiwan


#### Convert to numpy array

In [16]:
print(len(X_train.keyword.unique()))
print(len(X_train.location.unique()))

ordinal_encoder = OrdinalEncoder()

X_train[["keyword"]] = ordinal_encoder.fit_transform(X_train[["keyword"]])
X_test[["keyword"]] = ordinal_encoder.fit_transform(X_test[["keyword"]])
X_train[["location"]] = ordinal_encoder.fit_transform(X_train[["location"]])
X_test[["location"]] = ordinal_encoder.fit_transform(X_test[["location"]])

222
3342


In [17]:
X_train.head()

Unnamed: 0,keyword,location,text
0,147,3143,our deeds are the reason of this earthquake ma...
1,147,3143,forest fire near la ronge sask canada
2,147,3143,all residents asked to shelter in place are be...
3,147,3143,people receive wildfires evacuation orders in...
4,147,3143,just got sent this photo from ruby alaska as s...


#### Text preprocessing (Word embeddings)

In [19]:
twitter_labels = np.array(y_train.copy())

only_text_features = np.array(X_train["text"])

train_ds = tf.data.Dataset.from_tensor_slices((only_text_features, twitter_labels)).batch(64)

#vocab size and number of words in sequence
vocab_size = 10000
sequence_length = 100

vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length
)

text_ds = train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)



### TF text to vector

In [77]:
twitter_features = np.array(X_train.copy())
twitter_labels = np.array(y_train.copy())

train_ds = tf.data.Dataset.from_tensor_slices((twitter_features, twitter_labels)).batch(128)

In [68]:
# res = text_word_tokenizer.texts_to_sequences(X_train)
# res = tf.keras.preprocessing.sequence.pad_sequences(res, padding='post')
# res

### Train the model

In [100]:
class TwitterModel(tf.keras.Model):
    def __init__(self):
        super(TwitterModel, self).__init__()
        self.enc = encoder
        #self.emb = tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=64, mask_zero=True)
        self.bid = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
        self.d1 = tf.keras.layers.Dense(64, activation='relu')
        self.d2 = tf.keras.layers.Dense(1)
            
    def call(self, x):
        x = self.enc(x)
        x = self.emb(x)
        x = self.bid(x)
        x = self.d1(x)
        return self.d2(x)
    
model = TwitterModel()

In [82]:
#loss and optimizer

loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.RMSprop()

In [84]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.CategoricalAccuracy(name="train_accuracy")

In [86]:
def train_step(text, labels):
    with tf.GradientTape() as tape:
        predictions = model(text, training=True)
        loss = loss_object(labels, predictions)
        
    gradients=tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    train_loss(loss)
    train_accuracy(labels, predictions)

In [87]:
EPOCHS = 10

for epoch in range(EPOCHS):
    train_loss.reset_states()
    train_accuracy.reset_states()
    
    for text, labels in train_ds:
        train_step(text, labels)
        
    print(
        f'Epoch {epoch+1},'
        f'Loss {train_loss.result()}, '
        f'Accuracy {train_accuracy.result() * 100}, '
#         f'Test Loss: {test_loss.result()},'
#         f'Test Accuracy: {test_accuracy.result() *100}'
    )

Epoch 1,Loss 359.1626281738281, Accuracy 0.0, 
Epoch 2,Loss 313.2477111816406, Accuracy 3.3333334922790527, 
Epoch 3,Loss 308.8911437988281, Accuracy 15.000000953674316, 
Epoch 4,Loss 316.2915954589844, Accuracy 43.33333206176758, 
Epoch 5,Loss 327.1099853515625, Accuracy 43.33333206176758, 
Epoch 6,Loss 339.7171630859375, Accuracy 43.33333206176758, 
Epoch 7,Loss 353.84918212890625, Accuracy 43.33333206176758, 
Epoch 8,Loss 369.2035217285156, Accuracy 43.33333206176758, 
Epoch 9,Loss 385.3968505859375, Accuracy 43.33333206176758, 
Epoch 10,Loss 402.0020446777344, Accuracy 43.33333206176758, 
