In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.layers import TextVectorization
from imblearn.over_sampling import SMOTE
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from spellchecker import SpellChecker
from string import punctuation
import pandas as pd
import numpy as np
import nltk
import contractions
import re

# Fetch the NLTK resources used further down: the stopword lists (for
# stopwords.words) and the 'punkt' tokenizer models (for word_tokenize).
# The second call is the cell's last expression, so its return value is
# what the notebook displays.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daniyarkurmanbayev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/daniyarkurmanbayev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the labelled training split and the test split from the local
# data/ directory (paths are relative to the notebook's working directory).
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
# Sanity check: (rows, columns) of the training frame.
train.shape

(7613, 5)

In [9]:
# Per-class sum of the numeric columns (only `id` in this frame).
# `numeric_only=True` is passed explicitly so the text columns are excluded
# on every pandas version instead of relying on older releases silently
# dropping them. Note that a sum of ids says nothing about class balance;
# the row counts per class are printed in the next cell.
train.groupby('target').sum(numeric_only=True)

Unnamed: 0_level_0,id
target,Unnamed: 1_level_1
0,22910330
1,18519120


In [40]:
# Row count per class, positives (target == 1) first, one number per line.
print(
    len(train[train.target == 1]),
    len(train[train.target == 0]),
    sep='\n',
)

3271
4342


In [41]:
# Pull the raw tweet bodies out as a plain Python list. Taking the column
# directly preserves row order and does not depend on the frame having a
# default 0..n-1 index, unlike a label lookup per position.
train_texts = train['text'].tolist()
train_texts[:5]

['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
 'Forest fire near La Ronge Sask. Canada',
 "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
 '13,000 people receive #wildfires evacuation orders in California ',
 'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ']

In [42]:
# Same extraction for the test split: the column is taken directly, so row
# order is kept and no assumption is made about the frame's index labels.
test_texts = test['text'].tolist()
test_texts[:5]

['Just happened a terrible car crash',
 'Heard about #earthquake is different cities, stay safe everyone.',
 'there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all',
 'Apocalypse lighting. #Spokane #wildfires',
 'Typhoon Soudelor kills 28 in China and Taiwan']

In [43]:
# Working variable for the cleaning pipeline below. This binds the same list
# object as train_texts (no copy); the following cells rebind `texts` to new
# lists rather than mutating it in place. Only the training tweets are routed
# through the pipeline in the cells visible here.
texts = train_texts

In [44]:
def remove_url(text):
    """Return ``text`` with every URL-like span replaced by a single space.

    The case-insensitive pattern recognises spans that start with
    ``http://`` / ``https://``, with ``www`` (optionally followed by up to
    three digits) and a dot, or with a domain-like token followed by ``/``.
    Balanced parentheses inside the URL are allowed, and trailing
    punctuation such as ``.`` or ``,`` is left outside the match.
    """
    url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    return re.compile(url_pattern).sub(" ", text)

In [45]:
# Strip URLs from every tweet; each URL becomes a single space.
texts = list(map(remove_url, texts))

In [46]:
# Split each tweet into NLTK word tokens; from here on every entry of
# `texts` is a list of token strings rather than a single string.
texts = [word_tokenize(tweet) for tweet in texts]
texts[:5]

[['Our',
  'Deeds',
  'are',
  'the',
  'Reason',
  'of',
  'this',
  '#',
  'earthquake',
  'May',
  'ALLAH',
  'Forgive',
  'us',
  'all'],
 ['Forest', 'fire', 'near', 'La', 'Ronge', 'Sask', '.', 'Canada'],
 ['All',
  'residents',
  'asked',
  'to',
  "'shelter",
  'in',
  'place',
  "'",
  'are',
  'being',
  'notified',
  'by',
  'officers',
  '.',
  'No',
  'other',
  'evacuation',
  'or',
  'shelter',
  'in',
  'place',
  'orders',
  'are',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#',
  'wildfires',
  'evacuation',
  'orders',
  'in',
  'California'],
 ['Just',
  'got',
  'sent',
  'this',
  'photo',
  'from',
  'Ruby',
  '#',
  'Alaska',
  'as',
  'smoke',
  'from',
  '#',
  'wildfires',
  'pours',
  'into',
  'a',
  'school']]

In [47]:
# Build the English stopword collection once, as a set, instead of calling
# stopwords.words('english') again for every single token and scanning the
# returned list each time.
english_stopwords = set(stopwords.words('english'))

# NOTE(review): the comparison is case-sensitive and runs before the
# lowercasing step, so capitalised stopwords ('Our', 'All', 'No', 'Just')
# survive this filter. Behaviour is kept as-is here.
texts = [[word for word in text if word not in english_stopwords] for text in texts]
texts[:5]

[['Our',
  'Deeds',
  'Reason',
  '#',
  'earthquake',
  'May',
  'ALLAH',
  'Forgive',
  'us'],
 ['Forest', 'fire', 'near', 'La', 'Ronge', 'Sask', '.', 'Canada'],
 ['All',
  'residents',
  'asked',
  "'shelter",
  'place',
  "'",
  'notified',
  'officers',
  '.',
  'No',
  'evacuation',
  'shelter',
  'place',
  'orders',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#',
  'wildfires',
  'evacuation',
  'orders',
  'California'],
 ['Just',
  'got',
  'sent',
  'photo',
  'Ruby',
  '#',
  'Alaska',
  'smoke',
  '#',
  'wildfires',
  'pours',
  'school']]

In [48]:
# Normalise case: lowercase every token of every tweet.
texts = [list(map(str.lower, tokens)) for tokens in texts]
texts[:5]

[['our',
  'deeds',
  'reason',
  '#',
  'earthquake',
  'may',
  'allah',
  'forgive',
  'us'],
 ['forest', 'fire', 'near', 'la', 'ronge', 'sask', '.', 'canada'],
 ['all',
  'residents',
  'asked',
  "'shelter",
  'place',
  "'",
  'notified',
  'officers',
  '.',
  'no',
  'evacuation',
  'shelter',
  'place',
  'orders',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#',
  'wildfires',
  'evacuation',
  'orders',
  'california'],
 ['just',
  'got',
  'sent',
  'photo',
  'ruby',
  '#',
  'alaska',
  'smoke',
  '#',
  'wildfires',
  'pours',
  'school']]

In [49]:
# spell = SpellChecker()
#
# texts = [[spell.correction(word) for word in text] for text in texts]
# texts[:5]

In [50]:
# Expand contractions token by token. An expansion can turn one token into
# several space-separated words, so each tweet is re-joined and split on
# single spaces to get back to a flat list of tokens.
# NOTE(review): the tokens come from word_tokenize, which may already have
# split contractions into pieces; confirm that fix() still fires here.
texts = [
    ' '.join(contractions.fix(token) for token in tokens).split(' ')
    for tokens in texts
]
texts[:5]

[['our',
  'deeds',
  'reason',
  '#',
  'earthquake',
  'may',
  'allah',
  'forgive',
  'us'],
 ['forest', 'fire', 'near', 'la', 'ronge', 'sask', '.', 'canada'],
 ['all',
  'residents',
  'asked',
  "'shelter",
  'place',
  "'",
  'notified',
  'officers',
  '.',
  'no',
  'evacuation',
  'shelter',
  'place',
  'orders',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#',
  'wildfires',
  'evacuation',
  'orders',
  'california'],
 ['just',
  'got',
  'sent',
  'photo',
  'ruby',
  '#',
  'alaska',
  'smoke',
  '#',
  'wildfires',
  'pours',
  'school']]

In [51]:
# Characters treated as removable punctuation: everything in
# string.punctuation except '!', '#' and '?', which are deliberately kept.
punctuation_cleaned = punctuation.translate(str.maketrans('', '', '!#?'))
punctuation_cleaned

'"$%&\'()*+,-./:;<=>@[\\]^_`{|}~'

In [52]:
# Drop tokens made up solely of removable punctuation characters.
# The check is done per character: a plain `word not in punctuation_cleaned`
# is a substring test against the string, which only catches tokens that
# happen to appear as a contiguous run in it and lets tokens such as '...'
# or '``' through. Empty tokens are dropped as well (all() over an empty
# token is True), matching what the substring test did for ''.
texts = [
    [word for word in text if not all(char in punctuation_cleaned for char in word)]
    for text in texts
]
texts[:5]

[['our',
  'deeds',
  'reason',
  '#',
  'earthquake',
  'may',
  'allah',
  'forgive',
  'us'],
 ['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada'],
 ['all',
  'residents',
  'asked',
  "'shelter",
  'place',
  'notified',
  'officers',
  'no',
  'evacuation',
  'shelter',
  'place',
  'orders',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#',
  'wildfires',
  'evacuation',
  'orders',
  'california'],
 ['just',
  'got',
  'sent',
  'photo',
  'ruby',
  '#',
  'alaska',
  'smoke',
  '#',
  'wildfires',
  'pours',
  'school']]

In [53]:
# Collapse each token list back into one space-separated string per tweet.
texts = list(map(' '.join, texts))
texts[:5]

['our deeds reason # earthquake may allah forgive us',
 'forest fire near la ronge sask canada',
 "all residents asked 'shelter place notified officers no evacuation shelter place orders expected",
 '13,000 people receive # wildfires evacuation orders california',
 'just got sent photo ruby # alaska smoke # wildfires pours school']

In [54]:
# vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# X = vectorizer.fit_transform(texts).toarray()
# n_words = len(vectorizer.vocabulary_.keys())
# print(n_words)
# X

In [55]:
# Keras text-vectorization layer capped at a 20,000-token vocabulary, with
# every sequence padded/truncated to length 200; adapt() learns the
# vocabulary from the cleaned training tweets.
# NOTE(review): the layer is created with its default standardization, which
# may lowercase and strip punctuation on its own (including the '!', '#' and
# '?' tokens kept earlier) - confirm against the Keras documentation.
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
vectorizer.adapt(texts)

2022-01-30 22:13:05.209086: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [56]:
# Vocabulary learned by the vectorizer, plus a token -> position lookup
# (position = index of the token in the vocabulary list).
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [57]:
# Map each GloVe token to its embedding vector (float32 array).
embeddings_index = {}

# Pin the encoding instead of relying on the platform default, so the file
# is decoded the same way on every OS (the GloVe text file is assumed to be
# UTF-8 and contains non-ASCII tokens).
with open('glove/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ..."; split off the token only.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [58]:
# Size of the embedding table: one row per vocabulary entry plus two spare.
# NOTE(review): if get_vocabulary() already lists the padding and OOV
# entries, the +2 only adds unused all-zero rows - confirm.
num_tokens = len(voc) + 2
embedding_dim = 50
hits = 0
misses = 0

# Start from an all-zero matrix and copy in the pretrained vector for every
# vocabulary token that GloVe knows. Rows for tokens without a pretrained
# vector are simply left at zero.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 12321 words (5443 misses)


In [59]:
# Model inputs: the cleaned tweets as an array of strings, and the matching
# labels taken from the training frame's `target` column.
X = np.asarray(texts)
y = train['target'].to_numpy()

In [60]:
# Persist the preprocessing outputs under data/: the cleaned texts, the
# labels, the embedding matrix and the two sizes used to build it.
# num_tokens and embedding_dim are plain Python ints here, so they are
# written as .npy files too and need to be converted back after np.load.
np.save('data/X.npy', X)
np.save('data/y.npy', y)
np.save('data/embedding_matrix.npy', embedding_matrix)
np.save('data/num_tokens.npy', num_tokens)
np.save('data/embedding_dim.npy', embedding_dim)