In [37]:
import pandas as pd
from pathlib import Path
import os
import re
from unicodedata import normalize
import string
import pickle as pkl

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score

In [66]:
path = Path('.').parent.absolute()

full_train = os.path.join(path, 'raw-dataset', 'train.csv')
train_df = pd.read_csv(full_train, encoding='utf-8')

full_test = os.path.join(path, 'raw-dataset', 'test.csv')
test_df = pd.read_csv(full_test, encoding='utf-8')

In [67]:
print(train_df[:10])

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   
5   8     NaN      NaN  #RockyFire Update => California Hwy. 20 closed...   
6  10     NaN      NaN  #flood #disaster Heavy rain causes flash flood...   
7  13     NaN      NaN  I'm on top of the hill and I can see a fire in...   
8  14     NaN      NaN  There's an emergency evacuation happening now ...   
9  15     NaN      NaN  I'm afraid that the tornado is coming to our a...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
5       1  
6       1  
7       1  
8       1  
9       1  


# Preprocess Tweet Data

In [78]:
def cleaning(tweet_text, df):
    temp = []
    table = str.maketrans("", "", string.punctuation)
    for tweet in tweet_text:
        # Remove links
        tweet = re.sub(r"http\S+", "", tweet)
        # Remove newline
        tweet = tweet.strip('\n')
        # Remove unicode
        tweet = normalize('NFKD', tweet).encode('ascii','ignore')
        # Remove username
        tweet = re.sub('@[^\s]+','',str(tweet))
        # Remove punctuation and change to lower case
        tweet = tweet.translate(table).lower()
        # Remove 'b' at the begining for binary
        tweet = tweet.replace('b', '', 1)
        # Remove whitespace at start of sentence
        tweet = tweet.strip()
#         # Remove numbers
#         tweet = ''.join([i for i in tweet if not i.isdigit()])
        temp.append(tweet)
    try:
        # Concatenate training with target
        processed_tweets = pd.concat([pd.DataFrame(temp), df['target']], axis=1)
        processed_tweets = pd.DataFrame(processed_tweets)
    except KeyError:
        processed_tweets = pd.DataFrame(temp)
    print(processed_tweets)
    return processed_tweets   

In [79]:
# Preprocess training and testing tweets
processed_tr_tweets = cleaning(train_df['text'], train_df)
processed_tst_tweets = cleaning(test_df['text'], test_df)

                                                      0  target
0     our deeds are the reason of this earthquake ma...       1
1                 forest fire near la ronge sask canada       1
2     all residents asked to shelter in place are be...       1
3     13000 people receive wildfires evacuation orde...       1
4     just got sent this photo from ruby alaska as s...       1
...                                                 ...     ...
7608  two giant cranes holding a bridge collapse int...       1
7609  the out of control wild fires in california ev...       1
7610               m194 0104 utc5km s of volcano hawaii       1
7611  police investigating after an ebike collided w...       1
7612  the latest more homes razed by northern califo...       1

[7613 rows x 2 columns]
                                                      0
0                    just happened a terrible car crash
1     heard about earthquake is different cities sta...
2     there is a forest fire at spot po

# Tokenization

In [80]:
def vectorize_tweets(count_vect, data):
    vect_tweets = count_vect.fit_transform(data)
    vect_tweets = vect_tweets.toarray()
    return vect_tweets, count_vect

In [81]:
# Convert a collection of text documents to a matrix of token counts
count_vect = CountVectorizer(analyzer='word', lowercase=False, stop_words='english')
# Combine both train and test
# Prevent unequal length of variables after tokenization
combined_tr_tst = pd.concat([processed_tr_tweets[0], processed_tst_tweets[0]], axis=0)
combined_vect,_ = vectorize_tweets(count_vect, combined_tr_tst)

# Check length
len_tr = len(processed_tr_tweets[0])
print('Training length: %d' %len_tr)
len_tst = len(processed_tst_tweets[0])
print('Testing length: %d' %len_tst)
print('Length of train + test: %d' %len(combined_vect))

# Split back to train and test
vect_tweets = combined_vect[:len_tr]
vect_tst_tweets = combined_vect[len_tr:]

Training length: 7613
Testing length: 3263
Length of train + test: 10876


# Training and Evaluation

In [82]:
# Split training and testing
X_train, X_test, y_train, y_test  = train_test_split(
        vect_tweets, 
        processed_tr_tweets['target'],
        train_size=0.80, 
        random_state=True,
        shuffle=True
)

## Using Logistic Regression

In [83]:
# Train
model_rg = LogisticRegression(solver='lbfgs', verbose=2)
model_rg = model_rg.fit(X=X_train, y=y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.5s finished


In [84]:
# Evaluate model
y_pred = model_rg.predict(X_test)
print(len(y_pred))
print(accuracy_score(y_test, y_pred))

1523
0.804333552199606


## Using SVM

In [56]:
model_svc = LinearSVC(random_state=True, verbose=2)
model_svc = model_svc.fit(X=X_train, y=y_train)

[LibLinear]

In [57]:
# Evaluate model
y_pred = model_svc.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.7747866053841103


## Using Naive Bayes

In [58]:
model_nb = GaussianNB()
model_nb = model_nb.fit(X=X_train, y=y_train)

In [59]:
# Evaluate model
y_pred = model_nb.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.6145764937623113


## Using SGD Classifier

In [60]:
model_sgd = SGDClassifier(max_iter=1000)
model_sgd = model_sgd.fit(X=X_train, y=y_train)

In [61]:
# Evaluate model
y_pred = model_sgd.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.768220617202889


## Using Random Forest Regressor

In [40]:
# model_rf = RandomForestRegressor(max_depth=2, random_state=0)
# model_rf = model_rf.fit(X=X_train, y=y_train)

In [41]:
# # Evaluate model
# y_pred = model_rf.predict(X_test)
# print(accuracy_score(y_test, y_pred))

# Predict new data

In [10]:
# Predict
new_prediction = model.predict(vect_tst_tweets)
new_prediction = pd.DataFrame(new_prediction)
new_prediction = pd.concat([test_df['id'], new_prediction], axis=1)
print(new_prediction)

         id  0
0         0  1
1         2  1
2         3  1
3         9  1
4        11  1
...     ... ..
3258  10861  1
3259  10865  1
3260  10868  1
3261  10874  1
3262  10875  0

[3263 rows x 2 columns]


In [11]:
def save_submission(new_prediction, fname):
    new_prediction = new_prediction.rename({0: 'target'}, axis=1) 
    new_prediction.to_csv(fname, index=False)

# save_submission(new_prediction, 'submission2.csv')

# Using Deep Learning

In [12]:
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *
from tensorflow.keras import optimizers
import math

In [13]:
def define_model(input_len, output_len):
    n_hidden_1 = math.ceil(input_len / 2)
    n_hidden_2 = math.ceil(n_hidden_1 / 2)
    n_hidden_3 = n_hidden_2
    n_hidden_4 = math.ceil(input_len / 2)

    Inp = Input(shape=(input_len, ))
    x = Dense(n_hidden_1, activation='relu', name = "Hidden_Layer_1")(Inp)
    x = Dropout(0.3)(x)
    x = Dense(n_hidden_2, activation='relu', name = "Hidden_Layer_2")(x)
    x = Dropout(0.3)(x)
    x = Dense(n_hidden_3, activation='relu', name = "Hidden_Layer_3")(x)
    x = Dropout(0.3)(x)
    x = Dense(n_hidden_4, activation='relu', name = "Hidden_Layer_4")(x)
    output = Dense(output_len, activation='softmax', name = "Output_Layer")(x)
                
    model = Model(Inp, output)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    model.summary()
    return model

In [14]:
input_len = X_train.shape[1]
model = define_model(input_len, 2)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 19847)]           0         
_________________________________________________________________
Hidden_Layer_1 (Dense)       (None, 9924)              196971552 
_________________________________________________________________
dropout (Dropout)            (None, 9924)              0         
_________________________________________________________________
Hidden_Layer_2 (Dense)       (None, 4962)              49247850  
_________________________________________________________________
dropout_1 (Dropout)          (None, 4962)              0         
_________________________________________________________________
Hidden_Layer_3 (Dense)       (None, 4962)              24626406  
_________________________________________________________________
dropout_2 (Dropout)          (None, 4962)              0     

In [17]:
def train(model, X_train, X_test, y_train, y_test):
    # Hyperparameters
    learning_rate = 0.1
    adam = keras.optimizers.Adam(lr=learning_rate)

    model.fit(
            X_train, y_train,
            batch_size = 100,
            epochs = 1,
            validation_data=(X_test, y_test),
            shuffle=True
        )
    return model

In [18]:
y_train_np = y_train.to_numpy()
y_test_np = y_test.to_numpy()
model = train(model, X_train, X_test, y_train_np, y_test_np)

Train on 6090 samples, validate on 1523 samples


In [19]:
# Predict
dl_predictions = pd.DataFrame(model.predict(vect_tst_tweets))
dl_rounded = pd.DataFrame([int(x) for x in dl_predictions[1]])
print(dl_rounded)

      0
0     0
1     0
2     0
3     0
4     0
...  ..
3258  0
3259  0
3260  0
3261  0
3262  0

[3263 rows x 1 columns]


In [20]:
format_predictions = pd.concat([test_df['id'], dl_rounded], axis=1)
print(format_predictions)

# save_submission(format_predictions, 'submission3.csv')

         id  0
0         0  0
1         2  0
2         3  0
3         9  0
4        11  0
...     ... ..
3258  10861  0
3259  10865  0
3260  10868  0
3261  10874  0
3262  10875  0

[3263 rows x 2 columns]
