In [5]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, f1_score, fbeta_score, precision_score, recall_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import ParameterGrid
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm.notebook import tqdm
import csv
from gensim.models.callbacks import CallbackAny2Vec
from itertools import groupby
from gensim.models import Word2Vec
import gensim
from pathlib import Path
import matplotlib.pyplot as plt
pd.options.display.max_columns = 999
import fasttext
from sklearn.model_selection import train_test_split

In [8]:
train = pd.read_csv('twitts.csv', delimiter=',')

In [10]:
test= pd.read_csv('test.csv', delimiter=',')

In [11]:
train.loc[train['keyword'].notnull(), 'text'] = train['text'] + '. ' + train['keyword']
test.loc[test['keyword'].notnull(), 'text'] = test['text'] + '. ' + test['keyword']

In [13]:
import re
compiler = re.compile("[\w'#:/.]+")
all_twitts=[]
for twitt in train.text:
    this_twitt=str()
    for word in compiler.findall(twitt.lower()):
        this_twitt+=word
        this_twitt+=' '
    all_twitts.append(this_twitt)

In [14]:
train['text']=all_twitts

In [15]:
all_twitts=[]
for twitt in test.text:
    this_twitt=str()
    for word in compiler.findall(twitt.lower()):
        this_twitt+=word
        this_twitt+=' '
    all_twitts.append(this_twitt)

In [16]:
test['text']=all_twitts

In [17]:
import string
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# NLTK Tweet Tokenizer for now
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(strip_handles=True)

corpus = []

def clean_text(text):
    text = re.sub(r"\x89Û_", "", text)
    text = re.sub(r"\x89ÛÒ", "", text)
    text = re.sub(r"\x89ÛÓ", "", text)
    text = re.sub(r"\x89ÛÏWhen", "When", text)
    text = re.sub(r"\x89ÛÏ", "", text)
    text = re.sub(r"China\x89Ûªs", "China's", text)
    text = re.sub(r"let\x89Ûªs", "let's", text)
    text = re.sub(r"\x89Û÷", "", text)
    text = re.sub(r"\x89Ûª", "", text)
    text = re.sub(r"\x89Û\x9d", "", text)
    text = re.sub(r"å_", "", text)
    text = re.sub(r"\x89Û¢", "", text)
    text = re.sub(r"\x89Û¢åÊ", "", text)
    text = re.sub(r"fromåÊwounds", "from wounds", text)
    text = re.sub(r"åÊ", "", text)
    text = re.sub(r"åÈ", "", text)
    text = re.sub(r"JapÌ_n", "Japan", text)    
    text = re.sub(r"Ì©", "e", text)
    text = re.sub(r"å¨", "", text)
    text = re.sub(r"SuruÌ¤", "Suruc", text)
    text = re.sub(r"åÇ", "", text)
    text = re.sub(r"å£3million", "3 million", text)
    text = re.sub(r"åÀ", "", text)
    
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    text = re.sub(r'[0-9]', '', text)
    
    for p in string.punctuation.replace('!', ''):
        text = text.replace(p, '')
        
    text = re.sub(r'http\S+', '', text)
    return text

In [18]:
train['text'] = train['text'].apply(lambda s: clean_text(s))
test['text'] = test['text'].apply(lambda s: clean_text(s))

# see some cleaned data
train.sample(5)

Unnamed: 0,id,keyword,location,text,target
2178,3122,debris,,mh debris found on reunion island sad tragedy ...,1
1013,1472,body%20bagging,Arizona,imad i was going to tell him but you were body...,0
5053,7202,natural%20disaster,New York,rationing of food and water may also become ne...,0
101,145,accident,"Nairobi, Kenya",i still have not heard church leaders of kenya...,0
3973,5647,flooding,,crabbycale oh my god the memories are flooding...,0


In [19]:
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]
train.loc[train['id'].isin(ids_with_target_error),'target'] = 0

In [20]:
import re
compiler = re.compile("[\w'#:/.]+")

## Model with FastText

In [21]:
# This function makes the data suitable for input to FastText
def create_file(x, y, fname):
    with open(fname, 'w', encoding='utf-8') as f:
        for i, x_i in enumerate(x):
            if i > 0: f.write('\n')
            f.write(f'__label__{y[i]} {x_i}')

In [22]:
def predict(model, X):
    probs=[]
    for labels, scores in zip(*model.predict(X, k=2)):
        probs.append(dict(zip(labels, scores))['__label__1'])
    return probs

In [23]:
X_train, X_test, y_train, y_test = train_test_split(train['text'], train['target'], test_size=0.25, random_state=42)

In [24]:
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

In [25]:
create_file(X_train, y_train.to_list(), 'train.data')
create_file(X_test, y_test.to_list(), 'test.data')

In [26]:
model_ftext = fasttext.train_supervised(input='train.data',  minCount=3, lr=0.1, wordNgrams=1, minn=9, maxn=9, dim=100, epoch=5, loss='softmax')

In [27]:
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score
import numpy as np

print('--CLASS 1--')
print('Training ROC-AUC: ', round(roc_auc_score(y_train, predict(model_ftext, list(X_train))), 2))
print('Testing ROC-AUC', round(roc_auc_score(y_test, predict(model_ftext, list(X_test))), 2))
print('Training PR-AUC', round(average_precision_score(y_train, predict(model_ftext, list(X_train))),2))
print('Testing PR-AUC', round(average_precision_score(y_test, predict(model_ftext, list(X_test))),2))

--CLASS 1--
Training ROC-AUC:  0.92
Testing ROC-AUC 0.85
Training PR-AUC 0.91
Testing PR-AUC 0.83


In [28]:
see_preds = pd.DataFrame()
see_preds['twitt'] = X_test
see_preds['true'] = y_test
see_preds['proba'] = predict(model_ftext, list(X_test))
see_preds['prediction'] = see_preds['proba'].apply(lambda x: 1 if x>0.5 else 0)
see_preds['correct'] = [1 if x==y else 0 for x, y in zip(see_preds['prediction'], see_preds['true'])]
print('Accuracy: ', round(see_preds.correct.mean()*100, 2), '%')
see_preds.head(3)

Accuracy:  79.88 %


Unnamed: 0,twitt,true,proba,prediction,correct
0,so you have a new weapon that can cause un ima...,1,0.023621,0,0
1,the f amp ing things i do for gishwhes just go...,0,0.141299,0,1
2,dt georgegalloway rt gallowaymayor ûïthe col p...,1,0.563596,1,1


In [29]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

def evaluate_model(params):
    model= fasttext.train_supervised(input='train.data', 
                                     minCount=params['minCount'], 
                                     lr=params['lr'], 
                                     wordNgrams=params['wordNgrams'], 
                                     minn=params['minn'], 
                                     maxn=params['maxn'], 
                                     dim=params['dim'], 
                                     epoch=params['epoch'], 
                                     loss=params['loss_func'])
    
    see_preds = pd.DataFrame()
    see_preds['twitt'] = X_test
    see_preds['true'] = y_test
    see_preds['proba'] = predict(model, list(X_test))
    see_preds['prediction'] = see_preds['proba'].apply(lambda x: 1 if x>0.5 else 0)
    see_preds['correct'] = [1 if x==y else 0 for x, y in zip(see_preds['prediction'], see_preds['true'])]
    accuracy = round(see_preds.correct.mean()*100, 3)

    return {
        'minCount': params['minCount'],
        'lr': params['lr'], 
        'wordNgrams': params['wordNgrams'], 
        'dim': params['dim'],
        'epoch': params['epoch'],
        'loss_func': params['loss_func'],
        'minn': params['minn'],
        'maxn': params['maxn'],
        'Training ROC-AUC': round(roc_auc_score(y_train, predict(model, list(X_train))), 3),
        'Testing ROC-AUC': round(roc_auc_score(y_test, predict(model, list(X_test))), 3),
        'Training PR-AUC': round(average_precision_score(y_train, predict(model, list(X_train))),3),
        'Testing PR-AUC': round(average_precision_score(y_test, predict(model, list(X_test))), 3),
        'Testing ACCURACY': accuracy
    }

def objective(params):
    res = evaluate_model(params)
    
    res['loss'] = - res['Testing ACCURACY']
    res['status'] = STATUS_OK
    return res 

hyperparameter_space = {
        'minCount': hp.choice('minCount', range(1, 10)),
        'lr': hp.uniform('lr',0.0001, 0.3),
        'wordNgrams': hp.choice('wordNgrams', range(1, 4)),
        'dim': hp.choice('dim', range(5, 300)),
        'epoch': hp.choice('epoch', range(2, 25)),
        'loss_func': hp.choice('loss_func', ['hs', 'softmax', 'ns']),
        'minn': hp.choice('minn', range(2, 5)),
        'maxn': hp.choice('maxn', range(4,8))
}

In [30]:
trials = Trials()
fmin(
    objective,
    space=hyperparameter_space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
);

100%|█████████████████████████████████████████████████████████| 100/100 [07:14<00:00,  4.35s/trial, best loss: -81.355]


In [31]:
experiment_results = pd.DataFrame(trials.results)
experiment_results.sort_values(by='loss').head(7)

Unnamed: 0,minCount,lr,wordNgrams,dim,epoch,loss_func,minn,maxn,Training ROC-AUC,Testing ROC-AUC,Training PR-AUC,Testing PR-AUC,Testing ACCURACY,loss,status
24,9,0.167603,2,214,11,softmax,4,5,0.967,0.871,0.962,0.858,81.355,-81.355,ok
74,6,0.186716,2,113,11,softmax,4,6,0.969,0.87,0.964,0.856,81.355,-81.355,ok
27,1,0.159327,2,145,11,softmax,4,5,0.965,0.87,0.96,0.857,81.197,-81.197,ok
67,5,0.176593,2,22,11,softmax,4,5,0.975,0.871,0.971,0.859,81.145,-81.145,ok
85,9,0.142362,2,205,13,softmax,4,6,0.96,0.868,0.954,0.854,80.935,-80.935,ok
37,5,0.252513,2,209,11,softmax,4,5,0.99,0.871,0.988,0.86,80.935,-80.935,ok
31,7,0.298277,2,20,10,softmax,4,7,0.987,0.869,0.984,0.858,80.882,-80.882,ok


We get an accuracy of 81.3% and a ROC-AUC of 0.87 on the test set.