In [1]:
import pandas as pd
from pathlib import Path
import os
import re
from unicodedata import normalize
import string
import pickle as pkl

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, mean_absolute_error

# Algorithm
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [2]:
path = Path('.').parent.absolute()

full_train = os.path.join(path, 'raw-dataset', 'train.csv')
train_df = pd.read_csv(full_train, encoding='utf-8')

full_test = os.path.join(path, 'raw-dataset', 'test.csv')
test_df = pd.read_csv(full_test, encoding='utf-8')

In [3]:
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


# Preprocess Tweet Data

In [4]:
def cleaning(tweet_text, df):
    temp = []
    table = str.maketrans("", "", string.punctuation)
    for tweet in tweet_text:
        # Remove links
        tweet = re.sub(r"http\S+", "", tweet)
        # Remove newline
        tweet = tweet.strip('\n')
        # Remove unicode
        tweet = normalize('NFKD', tweet).encode('ascii','ignore')
        # Remove username
        tweet = re.sub('@[^\s]+','',str(tweet))
        # Remove punctuation and change to lower case
        tweet = tweet.translate(table).lower()
        # Remove 'b' at the begining for binary
        tweet = tweet.replace('b', '', 1)
        # Remove whitespace at start of sentence
        tweet = tweet.strip()
#         # Remove numbers
#         tweet = ''.join([i for i in tweet if not i.isdigit()])
        temp.append(tweet)
    try:
        # Concatenate training with target
        processed_tweets = pd.concat([pd.DataFrame(temp), df['target']], axis=1)
        processed_tweets = pd.DataFrame(processed_tweets)
    except KeyError:
        processed_tweets = pd.DataFrame(temp)
#     print(processed_tweets)
    return processed_tweets   

In [5]:
# Preprocess training and testing tweets
processed_tr_tweets = cleaning(train_df['text'], train_df)
processed_tst_tweets = cleaning(test_df['text'], test_df)

processed_tr_tweets.head(10)

Unnamed: 0,0,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,13000 people receive wildfires evacuation orde...,1
4,just got sent this photo from ruby alaska as s...,1
5,rockyfire update california hwy 20 closed in ...,1
6,flood disaster heavy rain causes flash floodin...,1
7,im on top of the hill and i can see a fire in ...,1
8,theres an emergency evacuation happening now i...,1
9,im afraid that the tornado is coming to our area,1


In [6]:
# Checking target occurences

processed_tr_tweets['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

# Tokenization

In [7]:
def vectorize_tweets(count_vect, data):
    vect_tweets = count_vect.fit_transform(data)
    print(type(vect_tweets))
    vect_tweets = vect_tweets.toarray()
    return vect_tweets, count_vect

In [8]:
# Convert a collection of text documents to a matrix of token counts
count_vect = CountVectorizer(analyzer='word', lowercase=False, stop_words='english')
# Combine both train and test
# Prevent unequal length of variables after tokenization
combined_tr_tst = pd.concat([processed_tr_tweets[0], processed_tst_tweets[0]], axis=0)
combined_vect, count_vect = vectorize_tweets(count_vect, combined_tr_tst)
print('length of Vocabulary: {}'.format(len(count_vect.vocabulary_)))

# Check length
len_tr = len(processed_tr_tweets[0])
print('Training length: %d' %len_tr)
len_tst = len(processed_tst_tweets[0])
print('Testing length: %d' %len_tst)
print('Length of train + test: %d' %len(combined_vect))

# Split back to train and test
vect_tweets = combined_vect[:len_tr]
vect_tst_tweets = combined_vect[len_tr:]

print(vect_tweets)

<class 'scipy.sparse.csr.csr_matrix'>
length of Vocabulary: 19847
Training length: 7613
Testing length: 3263
Length of train + test: 10876
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# Training and Evaluation

In [15]:
# Split training and testing
X_train, X_test, y_train, y_test  = train_test_split(
        vect_tweets, 
        processed_tr_tweets['target'],
        train_size=0.80, 
        random_state=True,
        shuffle=True
)

print(len(X_train))

6090


# Choosing Algorithm

In [10]:
 def test_classifier(X_train, X_test, y_train, y_test):
        algorithm = [
            RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
            LinearSVC(),
            LogisticRegression(solver='lbfgs'),
            KNeighborsClassifier(3)
        ]
        for i in range(len(algorithm)):
            print("=" * 40)
            print("Running : " + algorithm[i].__class__.__name__)
            model = algorithm[i]
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            accuracy = accuracy_score(y_test, predictions)
            print("Accuracy: {:.4%}".format(accuracy))

In [11]:
test_classifier(X_train, X_test, y_train, y_test)

Running : RandomForestClassifier
Accuracy: 57.9120%
Running : LinearSVC
Accuracy: 77.8070%
Running : LogisticRegression
Accuracy: 80.4334%
Running : KNeighborsClassifier
Accuracy: 69.8621%


In [14]:
a = [1, 1, 0, 0]
b = [1, 1, 2, 0]
accuracy = accuracy_score(b, a)
print("Accuracy: {:.4%}".format(accuracy))

Accuracy: 75.0000%


## Use Logistic Regression

In [16]:
# Train
model_rg = LogisticRegression(solver='lbfgs')
model_rg = model_rg.fit(X=X_train, y=y_train)

In [23]:
# Evaluate model
y_pred = model_rg.predict(X_test)
print(len(y_pred))
print(accuracy_score(y_test, y_pred))

1523
0.8030203545633617


## Grid Search

In [2]:
def run_grid_search():
    model = LogisticRegression()
    param_grid = {
        'solver': ['lbfgs', 'newton-cg', 'sag', 'saga', 'liblinear']
    }
    gs = GridSearchCV(model, param_grid, n_jobs=4, cv=5)
    gs.fit(X_train, y_train)
    print(_cv.best_params_)
    mse = mean_absolute_error(y_train, gs_cv.predict(X_train))
    print("Mean abs error Training : %.4f" % mse)
    mse = mean_absolute_error(y_test, gs_cv.predict(X_test))
    print("Mean abs error Training : %.4f" % mse)

In [3]:
# run_grid_search()

# Predict new data

In [None]:
# Predict
new_prediction = model_rg.predict(vect_tst_tweets)
new_prediction = pd.DataFrame(new_prediction)
new_prediction = pd.concat([test_df['id'], new_prediction], axis=1)
print(new_prediction)

In [None]:
def save_submission(new_prediction, fname):
    new_prediction = new_prediction.rename({0: 'target'}, axis=1) 
    new_prediction.to_csv(fname, index=False)
    print(new_prediction)

save_submission(new_prediction, 'submission2.csv')

# Using Deep Learning

In [None]:
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *
from tensorflow.keras import optimizers
import math

In [None]:
def define_model(input_len, output_len):
    n_hidden_1 = math.ceil(input_len / 2)
    n_hidden_2 = math.ceil(n_hidden_1 / 2)
    n_hidden_3 = n_hidden_2
    n_hidden_4 = math.ceil(input_len / 2)

    Inp = Input(shape=(input_len, ))
    x = Dense(n_hidden_1, activation='relu', name = "Hidden_Layer_1")(Inp)
    x = Dropout(0.3)(x)
    x = Dense(n_hidden_2, activation='relu', name = "Hidden_Layer_2")(x)
    x = Dropout(0.3)(x)
    x = Dense(n_hidden_3, activation='relu', name = "Hidden_Layer_3")(x)
    x = Dropout(0.3)(x)
    x = Dense(n_hidden_4, activation='relu', name = "Hidden_Layer_4")(x)
    output = Dense(output_len, activation='softmax', name = "Output_Layer")(x)
                
    model = Model(Inp, output)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    model.summary()
    return model

In [None]:
input_len = X_train.shape[1]
model = define_model(input_len, 2)

In [None]:
def train(model, X_train, X_test, y_train, y_test):
    # Hyperparameters
    learning_rate = 0.1
    adam = keras.optimizers.Adam(lr=learning_rate)

    model.fit(
            X_train, y_train,
            batch_size = 100,
            epochs = 1,
            validation_data=(X_test, y_test),
            shuffle=True
        )
    return model

In [None]:
y_train_np = y_train.to_numpy()
y_test_np = y_test.to_numpy()
model = train(model, X_train, X_test, y_train_np, y_test_np)

In [None]:
# Predict
dl_predictions = pd.DataFrame(model.predict(vect_tst_tweets))
dl_rounded = pd.DataFrame([int(x) for x in dl_predictions[1]])
print(dl_rounded)

In [None]:
format_predictions = pd.concat([test_df['id'], dl_rounded], axis=1)
print(format_predictions)

# save_submission(format_predictions, 'submission3.csv')