In [27]:
import xgboost as xgb
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [13]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [14]:
def clean(text):
    """ Function to clean the text """
    text = text.lower()
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"",texter)
    texter = re.sub('&#39;', "\"", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u '," you ", texter)
    texter = re.sub('`',"", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ',texter)
    
    # Remove numbers from string
    texter = re.sub(pattern=r"[+-]?\d+(?:\.\d+)?", repl="", string=texter, count=0, flags=0)
    texter = texter.replace("  ", " ")
    clean = re.compile('<.*?>')
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    texter = re.sub(clean, '', texter)
    texter = re.sub(r'[^\w\s]', '', texter)
    if texter == "":
        texter = ""
    return texter

In [15]:
clean_X = []
for item in df['text']:
    item = clean(item)
    clean_X.append(item)

In [25]:
vectorizer = CountVectorizer()
X = np.array(vectorizer.fit_transform(clean_X).toarray())
y = np.array(df['target'])

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
clf_xgb = xgb.XGBClassifier(objective='binary:logistic',
                            gamma=0.25,
                            learn_rate=0.00001,
                            max_depth=10,
                            reg_lambda=0.25,
                            scale_pos_weight=1,
                            subsample=0.9,
                            n_estimators=30,
#                             use_label_encoder=False,
                            seed=42)
clf_xgb.fit(X_train,
            y_train,
            verbose=True,
            early_stopping_rounds=5,
            eval_metric='error',     # used error as well
            eval_set=[(X_val, y_val)])

bst = clf_xgb.get_booster()



Parameters: { "learn_rate" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-error:0.32574
[1]	validation_0-error:0.31699
[2]	validation_0-error:0.30560
[3]	validation_0-error:0.29860
[4]	validation_0-error:0.28634
[5]	validation_0-error:0.27846
[6]	validation_0-error:0.28021
[7]	validation_0-error:0.26620
[8]	validation_0-error:0.26883
[9]	validation_0-error:0.28021
[10]	validation_0-error:0.27233
[11]	validation_0-error:0.26795
[12]	validation_0-error:0.26532
[13]	validation_0-error:0.26007
[14]	validation_0-error:0.26007
[15]	validation_0-error:0.26007


<xgboost.core.Booster at 0x2dbce5caeb0>

In [37]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [38]:
clean_X_test = []
for item in df_test['text']:
    item = clean(item)
    clean_X_test.append(item)
    
X_test = np.array(vectorizer.transform(clean_X_test).toarray())
test = xgb.DMatrix(X_test)

In [56]:
preds = bst.predict(test)

In [57]:
preds = [1 if i > 0.5 else 0 for i in preds]

In [58]:
data = {'id': np.array(df_test['id']),
       'target': np.array(preds)}

In [59]:
df_submission = pd.DataFrame(data)
df_submission.to_csv('submission_xgboost.csv', encoding='utf-8', index=False)