In [1]:
import numpy
import seaborn 
import pandas as pd
import matplotlib.pyplot as plt 
import spacy
import re
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
nlp_spacy = spacy.load('en_core_web_sm')

In [5]:
def clean_text(text):
    doc = nlp_spacy(text.lower())
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#(\w+)', r'\1', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(tokens)

In [6]:
train['clean_text'] = train['text'].apply(clean_text)
test['clean_text'] = test['text'].apply(clean_text)

In [7]:
def features(df):
    df['text_length'] = df['text'].str.len()
    df['word_count'] = df['text'].str.split().str.len()
    df['hashtag_count'] = df['text'].str.count('#')
    return df

In [8]:
train = features(train)
test = features(test)

In [9]:
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))

In [10]:
X_text = vectorizer.fit_transform(train['clean_text'])
X_test_kaggle = vectorizer.transform(test['clean_text'])

In [11]:
num_f_train = train[['text_length', 'word_count', 'hashtag_count']].values
num_f_test = test[['text_length', 'word_count', 'hashtag_count']].values

In [12]:
X = numpy.hstack((X_text.toarray(), num_f_train))
X_kaggle = numpy.hstack((X_test_kaggle.toarray(), num_f_test))

In [13]:
y = train['target']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
model = XGBClassifier(
    learning_rate=0.01,
    n_estimators=200,
    max_depth=7,
    subsample=0.8,
    gamma=0.5,
    reg_alpha=0.1,
    reg_lambda=0.1,
    tree_method='hist',
    device='cuda'
)

In [16]:
model.fit(X_train, y_train)

In [17]:
pred = model.predict(X_test)
pred_kaggle = model.predict(X_kaggle)

In [18]:
report = classification_report(pred, y_test)

In [19]:
print(report)

              precision    recall  f1-score   support

           0       0.86      0.72      0.78      1591
           1       0.53      0.74      0.62       693

    accuracy                           0.72      2284
   macro avg       0.70      0.73      0.70      2284
weighted avg       0.76      0.72      0.73      2284



In [21]:
submission = pd.DataFrame({'id': test['id'], 'target': pred_kaggle})
submission.to_csv('submission_spacy.csv', index=False)