In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install datasets torch scikit-learn

In [None]:
from sklearn.base                    import clone
from sklearn.compose                 import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics                 import classification_report
from sklearn.pipeline                import Pipeline
from sklearn.preprocessing           import OneHotEncoder
from xgboost                         import XGBClassifier

# Read the competition's training and test CSVs into DataFrames, then preview the training rows.
train_csv = "/kaggle/input/nlp-getting-started/train.csv"
test_csv = "/kaggle/input/nlp-getting-started/test.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)
train.head()

In [None]:
# Replace missing 'keyword' / 'location' values with the literal 'unknown'
# in both the train and test frames, then preview the test rows.
for frame in (train, test):
    for col in ('keyword', 'location'):
        frame[col] = frame[col].fillna('unknown')
test.head()

In [None]:
#time to simply remove bloating characters.
def preprocess_text(text):
    """Normalise one tweet into lowercase, single-space-separated word tokens.

    Steps, in order: lowercase; delete URLs (``http...`` and ``www....``);
    delete ``@mentions``; strip the ``#`` sign but keep the tag word; replace
    every remaining character that is neither a word character nor whitespace
    with a space; collapse runs of whitespace and trim the ends.

    Args:
        text: Raw tweet text. Any non-string value (``None``, or the float
            ``NaN`` pandas uses for a missing cell) is treated as empty text.

    Returns:
        The cleaned string, or ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        # A missing cell has no .lower(); return empty text rather than
        # raising AttributeError in the middle of a Series.apply.
        return ''
    text = text.lower()
    text = re.sub(r'http\S+|www\.\S+', '', text)  # remove urls
    text = re.sub(r'@\w+', '', text)              # remove mentions
    text = re.sub(r'#', '', text)                 # drop the '#' sign only; the hashtag word stays
    text = re.sub(r'[^\w\s]', ' ', text)          # punctuation/symbols -> space (\w keeps '_')
    text = re.sub(r'\s+', ' ', text).strip()      # collapse whitespace
    return text

# Add a cleaned copy of each tweet as 'Clean_text' on both frames, then preview the training rows.
for frame in (train, test):
    frame["Clean_text"] = frame["text"].apply(preprocess_text)
train.head()

In [None]:
# The modelling cells select 'Clean_text', so the raw 'text' column is dropped from both frames.
# errors='ignore' makes the drop a no-op if the column is already gone.
train.drop(columns=['text'], errors='ignore', inplace=True)
test.drop(columns=['text'], errors='ignore', inplace=True)

test.head()

In [None]:
# Hold out 20% of the labelled rows for validation, stratified on the target
# so both parts keep the same class balance; the seed fixes the split.
from sklearn.model_selection import train_test_split

feature_cols = ['Clean_text', 'keyword', 'location']
X, y = train[feature_cols], train['target']

X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

print("Train size:", X_tr.shape, y_tr.shape)
print("Val   size:", X_val.shape, y_val.shape)

In [None]:
# Turn the three feature columns into one matrix XGBoost can consume:
# TF-IDF over the cleaned tweet text, one-hot encoding for keyword and location.
text_vectorizer = TfidfVectorizer(max_features=20_000, ngram_range=(1, 2), min_df=5)

# handle_unknown='ignore' is passed so categories unseen during fit do not raise at predict time.
category_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer([
    ('tfidf', text_vectorizer, 'Clean_text'),
    ('cat', category_encoder, ['keyword', 'location']),
])

# End-to-end model: the column preprocessor feeds an XGBoost classifier.
# `use_label_encoder` is deliberately not passed: it is an obsolete XGBoost
# option that recent releases no longer recognise (they only warn that the
# parameter is unused), and leaving it out does not change the fitted model.
pipe = Pipeline([
    ('pre', preprocessor),
    ('xgb', XGBClassifier(  # this is the model
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1
    )),
])

In [None]:
# Fit the preprocessing + XGBoost pipeline on the training part of the split.
pipe.fit(X=X_tr, y=y_tr)

In [None]:
# Score the held-out validation split. classification_report is already
# imported alongside the other sklearn imports, so it is not re-imported here.
y_pred = pipe.predict(X_val)
print(classification_report(y_val, y_pred, digits=4))

In [None]:
# 1) Refit on the full labelled training set, using a fresh unfitted copy of
#    the pipeline. Cloning leaves `pipe` fitted on the training split only, so
#    re-running the validation cell afterwards still scores a model that has
#    not seen the validation rows.
feature_cols = ['Clean_text', 'keyword', 'location']
X_full = train[feature_cols]
y_full = train['target']
final_pipe = clone(pipe).fit(X_full, y_full)

# 2) Prepare the test features
X_test = test[feature_cols]

# 3) Predict on test
test_preds = final_pipe.predict(X_test)

# 4) Load the sample submission file
submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

# 5) Overwrite its "target" column with the predictions.
#    NOTE(review): the assignment is positional, so it assumes
#    sample_submission.csv lists rows in the same order as test.csv — confirm.
submission['target'] = test_preds

# 6) Save to disk
submission.to_csv('submission.csv', index=False)
