## XGBoost with Glove

In [45]:
import scipy
from scipy.sparse import *
import numpy as np
import pickle
import pandas as pd
from loading import load_train
from preprocessing import *

[nltk_data] Downloading package stopwords to /Users/franz/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/franz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Show the installed SciPy version so the environment used for this run is recorded in the output.
scipy.__version__

'1.8.1'

### Embedding

Load vocab

In [29]:
# Build the vocabulary from the second whitespace-separated field of every line.
# presumably each line is "<count> <word>" — confirm against how data/vocab.txt is generated.
vocab = set()
with open("data/vocab.txt", "r") as vocab_file:
    vocab.update(row.split()[1] for row in vocab_file)
len(vocab)

114427

Load pretrained GloVe embeddings

In [92]:
# Parse the GloVe text file into a {token: float32 vector} lookup table.
# Every line is split on whitespace: the first field is used as the token and
# the remaining fields are converted to a float32 NumPy vector.
embeddings = {}
# The encoding is pinned explicitly: without it open() falls back to the
# locale's preferred encoding, which makes parsing the non-ASCII tokens in the
# file platform-dependent (the GloVe releases are distributed as UTF-8 text).
with open("data/glove/glove.twitter.27B.200d.txt", "r", encoding="utf-8") as glove_file:
    for line in glove_file:
        fields = line.split()
        if not fields:
            # Skip blank lines instead of failing on the missing token field.
            continue
        word, *values = fields
        embeddings[word] = np.asarray(values, "float32")
len(embeddings)

1193514

### Prepare data

In [93]:
# Load the training frame; later cells read its `x` column (token sequences
# after preprocessing) and its `y` column (labels).
df = load_train(full=True)
# presumably both helpers modify `df` in place, since their return values are
# discarded here — verify in preprocessing.py.
remove_tags(df)
tokenize(df)

In [94]:
# Binarise the label column: a label equal to 1 stays 1, every other value becomes 0.
df.y = df.y.eq(1).astype("int64")

In [97]:
def tweet_embedding(tokens, lookup=None, dim=200):
    """Return the element-wise sum of the embedding vectors of ``tokens``.

    Tokens that are missing from the lookup table are ignored, so a tweet with
    no known token (or no tokens at all) maps to the all-zero vector.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single tweet.
    lookup : mapping of str -> array-like, optional
        Token-to-vector table. When omitted, the notebook-level ``embeddings``
        dictionary is used, which keeps existing one-argument calls working.
    dim : int, optional
        Length of the embedding vectors; must match the vectors in ``lookup``.
        Defaults to 200, the size of the GloVe vectors loaded in this notebook.

    Returns
    -------
    numpy.ndarray
        Float64 vector of shape ``(dim,)``.
    """
    if lookup is None:
        # Fall back to the module-level table built in the GloVe loading cell.
        lookup = embeddings
    total = np.zeros(dim)
    for token in tokens:
        vector = lookup.get(token)
        # Compare against None explicitly: truth-testing a NumPy array is ambiguous.
        if vector is not None:
            total += vector
    return total

In [98]:
# Attach one summed embedding vector per tweet; the function is passed directly
# to apply rather than being wrapped in a lambda.
df['embedding'] = df.x.apply(tweet_embedding)

In [99]:
from sklearn.utils import shuffle

# Shuffle the rows before the positional train/test split in the next cell.
# A fixed random_state makes the permutation — and therefore the split and the
# reported metrics — repeatable across kernel restarts.
df = shuffle(df, random_state=42)

In [100]:
# Positional split of the shuffled frame: the first n_train rows are used for
# training, all remaining rows for evaluation.
# NOTE(review): the split point is an absolute row count, so the train/test
# proportion depends on how many rows load_train() returned — confirm this is
# the intended ratio for full=True.
n_train = 180000
train_rows, test_rows = df.iloc[:n_train], df.iloc[n_train:]

# Stack the per-tweet vectors into 2-D feature matrices (one row per tweet).
x_train = np.vstack(train_rows.embedding)
y_train = train_rows.y
x_test = np.vstack(test_rows.embedding)
y_test = test_rows.y

### XGBoost

In [79]:
import torch
import logging
from evaluation import evaluate

# Configure the root logger at INFO level; the evaluation summary further down
# is emitted as an INFO record on the root logger.
logging.basicConfig(level=logging.INFO)

In [101]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier: 1000 estimators with a maximum tree depth of 6.
xgb_model = XGBClassifier(n_estimators=1000, max_depth=6)
xgb_model.fit(x_train, y_train)

# Predictions for the held-out rows, as returned by predict().
y_predict = xgb_model.predict(x_test)

In [102]:
# Score the held-out predictions; evaluate() logs and returns the metrics shown below.
# NOTE(review): y_predict comes from predict(), not predict_proba(), so the bce
# and auc figures are computed from those predictions rather than from
# probabilities — confirm what evaluate() expects as its second argument.
evaluate(y_test, y_predict)

INFO:root:---
* accuracy: 0.805207327586207
* precision: 0.7923595572438504
* recall: 0.8271034080797909
* f1: 0.8093587865283622
* bce: 6.727987197763114
* auc: 0.8052104982834878
---


(0.805207327586207,
 0.7923595572438504,
 0.8271034080797909,
 0.8093587865283622,
 6.727987197763114,
 0.8052104982834878)

### Save test prediction

In [103]:
from loading import load_test

In [104]:
# Load the unlabeled test tweets and run the same two preprocessing helpers
# that were applied to the training frame.
df2 = load_test()
# presumably both helpers modify `df2` in place, since their return values are
# discarded here — verify in preprocessing.py.
remove_tags(df2)
tokenize(df2)

In [112]:
# Featurise the test tweets with the same summed-embedding function used for
# training, then predict on the stacked feature matrix.
df2['embedding'] = df2.x.apply(tweet_embedding)
test_matrix = np.stack(df2.embedding)
predictions = xgb_model.predict(test_matrix)

In [135]:
# Submission frame with one row per test prediction and 1-based Ids.
# The length is taken from `predictions`: the previous version read len(out)
# before `out` was defined, which raises NameError on a fresh kernel and only
# worked thanks to leftover kernel state.
out = pd.DataFrame(np.arange(1, len(predictions) + 1), columns=["Id"])

In [137]:
# Store the model output, then recode it for submission: values equal to 1 are
# kept, every other value is replaced by -1.
out["Prediction"] = predictions
is_positive = out["Prediction"] == 1
out["Prediction"] = out["Prediction"].where(is_positive, -1)

In [138]:
# Display the submission frame (Id, Prediction) as a visual sanity check before saving.
out

Unnamed: 0,Id,Prediction
0,1,-1
1,2,-1
2,3,-1
3,4,-1
4,5,-1
...,...,...
9995,9996,1
9996,9997,-1
9997,9998,-1
9998,9999,1


In [139]:
# Write the Id/Prediction table to xgb.csv; index=False leaves the DataFrame
# index out of the file so only the two named columns are saved.
out.to_csv("xgb.csv", index=False)