In [1]:
import numpy
import seaborn 
import pandas as pd
import matplotlib.pyplot as plt 
import nltk
from nltk.corpus import stopwords
import re
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\denik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
stop_words = set(stopwords.words('english')) 

In [6]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http/S+', '', text)
    text = re.sub(r'@\w+','',text)
    text = re.sub(r'#\w+','',text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [7]:
train['clean_text'] = train['text'].apply(clean_text)
test['clean_text'] = test['text'].apply(clean_text)

In [8]:
def features(df):
    df['text_length'] = df['text'].str.len()
    df['word_count'] = df['text'].str.split().str.len()
    df['hashtag_count'] = df['text'].str.count('#')
    return df

In [9]:
train = features(train)
test = features(test)

In [10]:
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,3))

In [11]:
X_text = vectorizer.fit_transform(train['clean_text'])
X_test_kaggle = vectorizer.transform(test['clean_text'])

In [12]:
num_f_train = train[['text_length', 'word_count', 'hashtag_count']].values
num_f_test = test[['text_length', 'word_count', 'hashtag_count']].values

In [13]:
X = numpy.hstack((X_text.toarray(), num_f_train))
X_kaggle = numpy.hstack((X_test_kaggle.toarray(), num_f_test))

In [14]:
y = train['target']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
model = XGBClassifier(
    learning_rate=0.01,
    n_estimators=200,
    max_depth=7,
    subsample=0.8,
    gamma=0.5,
    reg_alpha=0.1,
    reg_lambda=0.1,
    tree_method='hist',
    device='cuda'
)

In [17]:
model.fit(X_train, y_train)

In [18]:
pred = model.predict(X_test)
pred_kaggle = model.predict(X_kaggle)

In [19]:
report = classification_report(y_test, pred)
print(report)

              precision    recall  f1-score   support

           0       0.68      0.94      0.79      1318
           1       0.83      0.39      0.53       966

    accuracy                           0.71      2284
   macro avg       0.75      0.67      0.66      2284
weighted avg       0.74      0.71      0.68      2284



In [20]:
submission = pd.DataFrame({'id': test['id'], 'target': pred_kaggle})
submission.to_csv('submission_nltk.csv', index=False)