In [1]:

from google.colab import files


# Prompt for a file upload through Colab; in the recorded run this saved
# train.csv into the working directory, which a later cell reads.
uploaded = files.upload()

Saving train.csv to train.csv


In [22]:
#Import all the libraries

import pandas as pd
import numpy as np
import gensim
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier


In [6]:
# Read the uploaded training set into a DataFrame
df = pd.read_csv(filepath_or_buffer='train.csv')

# Preview the first five rows
df.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
# Pull the raw tweet text out of the frame as a plain Python list
data = df['text'].tolist()

# Show the first five documents
data[0:5]

['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
 'Forest fire near La Ronge Sask. Canada',
 "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
 '13,000 people receive #wildfires evacuation orders in California ',
 'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ']

In [17]:
# Make sure the tokenizer models used by word_tokenize are installed. No other
# cell in this notebook downloads them, and word_tokenize raises LookupError
# when they are missing. Older NLTK releases look for 'punkt', newer ones for
# 'punkt_tab', so request both (the call is a no-op when already up to date).
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# Preprocess the documents: lower-case and tokenize each one exactly once.
# The same token lists feed both training and inference below.
tokenized_docs = [word_tokenize(doc.lower()) for doc in data]

# Create TaggedDocuments, each tagged with its position in `data`
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(i)])
    for i, tokens in enumerate(tokenized_docs)
]

# Train the Doc2Vec model.
# seed + workers=1 make the embeddings repeatable between runs; with the
# default multi-threaded training the vectors (and every downstream score)
# change on each execution. Full reproducibility across interpreter launches
# additionally needs PYTHONHASHSEED to be fixed.
# NOTE(review): the embedding model is fitted on every row of `data`, i.e.
# also on rows that end up in the test split created later. Labels are not
# used here, but confirm this transductive setup is intended.
model = Doc2Vec(vector_size=50, min_count=2, epochs=50, seed=42, workers=1)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Get the document vectors by inferring one vector per tokenized document
document_vectors = [model.infer_vector(tokens) for tokens in tokenized_docs]


In [19]:
# Inspect the first inferred document vector as a quick sanity check
document_vectors[:1]

[array([ 3.78207237e-01, -7.85690486e-01, -6.54518783e-01, -2.57729083e-01,
         1.98057905e-01,  2.13794023e-01,  3.41895252e-01,  7.46453553e-02,
         4.58630957e-02, -4.45083454e-02,  2.36891270e-01, -5.22373497e-01,
         5.55396020e-01,  3.69997276e-03, -8.77223015e-01,  2.76478529e-01,
         2.87715733e-01,  2.03314066e-01,  1.03345618e-01,  3.61324906e-01,
        -7.05855489e-01, -9.91988331e-02,  5.15902102e-01, -1.38630837e-01,
         5.09497285e-01,  3.97163272e-01, -1.75718144e-01, -6.60271227e-01,
        -3.56679738e-01,  1.33702725e-01,  8.48764256e-02, -1.22047317e+00,
        -4.25756544e-01,  2.91294575e-01, -4.68181938e-01,  7.96829224e-01,
        -1.18332416e-01,  7.93298602e-01,  2.56764054e-01, -1.96398705e-01,
         1.38557954e-02,  2.56970704e-01, -3.29132944e-01,  3.18189323e-01,
        -5.32407605e-04, -6.50241554e-01, -2.18855247e-01,  1.37302175e-01,
         1.76953897e-01, -6.37531817e-01], dtype=float32)]

In [20]:
# Target labels, taken from the same frame the documents came from
y = df['target'].tolist()

# Hold out 20% of the rows as a test set, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    document_vectors,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Peek at the first training vector
X_train[0:1]

[array([-8.43567669e-01,  7.90459439e-02, -8.45339000e-02, -3.12300593e-01,
        -2.40399376e-01, -1.12088986e-01,  1.29292727e-01,  1.11240840e+00,
        -3.32377464e-01, -1.57635584e-01, -1.49507046e-01,  3.60003971e-02,
        -2.13980839e-01,  3.75982404e-01, -5.15229166e-01, -7.49479532e-02,
         3.16803545e-01,  6.09688796e-02, -1.60565332e-01, -8.82332027e-01,
        -5.01964748e-01,  1.81441575e-01,  6.80975616e-01,  2.96638131e-01,
         2.55718589e-01,  6.85199872e-02, -4.63805556e-01,  3.30110490e-01,
        -8.68598104e-01,  5.13812125e-01,  2.93363541e-01, -9.52298418e-02,
        -7.01908588e-01,  1.61712363e-01, -5.91926455e-01,  1.31944597e-01,
         5.47749519e-01,  5.20080924e-01,  1.24701552e-01, -2.42716879e-01,
         2.75043994e-01, -5.56942761e-01,  1.20987065e-01, -1.38238609e-01,
         8.73358488e-01, -8.25003663e-05,  3.84027272e-01, -1.15322709e-01,
        -2.17120826e-01,  2.66967602e-02], dtype=float32)]

In [23]:
# Base classifier whose hyperparameters are tuned below
xgb = XGBClassifier(learning_rate=0.1, random_state=42)

# Candidate values for each hyperparameter (2 x 2 x 3 = 12 combinations)
param_grid = dict(
    n_estimators=[300, 500],
    max_depth=[20, 25],
    min_child_weight=[2, 5, 10],
)

# Exhaustive 5-fold cross-validated search over the grid, using all cores
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [24]:
# Report the winning hyperparameter combination from the grid search
best_params = grid_search.best_params_
print("Best Parameters are : ", best_params)

Best Parameters are :  {'max_depth': 20, 'min_child_weight': 2, 'n_estimators': 300}


In [25]:
# Take the estimator selected by the grid search
xgb_best = grid_search.best_estimator_

# Predict on the held-out test split and score against the true labels
y_pred = xgb_best.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(accuracy)

0.7235718975705844
