In [74]:
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import re
import pandas as pd
import numpy as np
import xgboost as xgb

In [4]:
file = open('./embeddings/CBoW.pkl', 'rb')
cbow = pickle.load(file)

In [45]:
file = open('./embeddings/skipgram.pkl', 'rb')
skipgram = pickle.load(file)

In [46]:
file = open('./embeddings/fasttext.pkl', 'rb')
fasttext = pickle.load(file)

In [10]:
df = pd.read_csv('./embeddings/train_embed_clean.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,keyword,location,text,target
0,0,1,,,deeds reason earthquake may allah forgive us,1
1,1,4,,,forest fire near la ronge sask canada,1
2,2,5,,,residents asked shelter place notified officer...,1
3,3,6,,,people receive wildfires evacuation orders cal...,1
4,4,7,,,got sent photo ruby alaska smoke wildfires pou...,1


In [48]:
zero_padding = [0]*10

def get_embeddings(text, emb):
    embedding = [0]*10
    i = 0
    for word in text.split(' '):
        if emb == "cbow":
            if word in cbow:
                i += 1
                embedding += cbow[word]
        
        elif emb == "skipgram":
            if word in skipgram:
                i += 1
                embedding += skipgram[word]
                
        elif emb == "fasttext":
            if word in fasttext:
                i += 1
                embedding += fasttext[word]
    if i != 0:
        embedding /= i
           
    return embedding

In [49]:
def transform(X, emb):
    embeddings = []
    for item in X:
        embedding = get_embeddings(item, emb)
        embeddings.append(embedding)
    
    return embeddings

In [62]:
cleaned_X = np.array(transform(df['text'], 'fasttext'))
y = np.array(df['target'])

In [63]:
# Hyperparameters
num_epochs = 10
batch_size = 128
learning_rate = 0.00001
dropout = 0.25

In [64]:
X_train, X_test, y_train, y_test = train_test_split(cleaned_X, y, test_size=0.15, random_state=42)

X_tr = torch.tensor(X_train, dtype=torch.float)
y_tr = torch.tensor(y_train)
train = TensorDataset(X_tr, y_tr)
trainloader = DataLoader(train, batch_size=batch_size)

X_te = torch.tensor(X_test, dtype=torch.float)
y_te = torch.tensor(y_test)
test = TensorDataset(X_te, y_te)
testloader = DataLoader(test)

In [65]:
class Net(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 500)
        self.hidden1 = nn.Linear(500, 500)
        self.hidden2 = nn.Linear(500, 500)
        self.fc2 = nn.Linear(500, 2)
        self.dropout = nn.Dropout(0.25)
        
        self.batchnorm1 = nn.BatchNorm1d(500)
        self.batchnorm2 = nn.BatchNorm1d(500)
        self.batchnorm3 = nn.BatchNorm1d(500)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.batchnorm1(x)
        x = self.dropout(x)
        x = F.relu(self.hidden1(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = F.relu(self.hidden2(x))
        x = self.batchnorm3(x)
        x = self.dropout(x)
#         x = self.fc2(x)
        x = F.log_softmax(self.fc2(x), dim=1)
        return x

net = Net()

In [66]:
# Loss function
criterion = nn.CrossEntropyLoss()

# create your optimizer
optimizer = optim.Adam(net.parameters(), lr=learning_rate)

In [67]:
# Training
net.train()
training_loss = []
for epoch in range(num_epochs):
    net.train()
    for i, data in enumerate(trainloader):
        inputs, labels = data

        outputs = net(inputs)
        loss = criterion(outputs, labels)
        training_loss.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print ('Epoch [%d/%d], Loss: %.4f'
                   %(epoch+1, num_epochs, loss.data))
        
    # Validation
    net.eval()
    outputs = net(X_te)
    _, predicted = torch.max(outputs, 1)
    total = y_te.size(0)
    correct = (predicted == y_te).sum()

    print(f'Accuracy of the model is: {100*correct/total:.2f}%')


Epoch [1/10], Loss: 0.6679
Accuracy of the model is: 63.92%
Epoch [2/10], Loss: 0.6787
Accuracy of the model is: 66.73%
Epoch [3/10], Loss: 0.6805
Accuracy of the model is: 67.60%
Epoch [4/10], Loss: 0.6763
Accuracy of the model is: 66.90%
Epoch [5/10], Loss: 0.6646
Accuracy of the model is: 66.99%
Epoch [6/10], Loss: 0.6348
Accuracy of the model is: 66.73%
Epoch [7/10], Loss: 0.7598
Accuracy of the model is: 66.64%
Epoch [8/10], Loss: 0.6547
Accuracy of the model is: 66.99%
Epoch [9/10], Loss: 0.6936
Accuracy of the model is: 66.81%
Epoch [10/10], Loss: 0.6603
Accuracy of the model is: 66.90%


In [68]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [69]:
cleaned_X_test = np.array(transform(df_test['text'], 'fasttext'))
X_test = torch.tensor(cleaned_X_test, dtype=torch.float)

In [70]:
# Testing
net.eval()
outputs_test = net(X_test)

_, predicted_test = torch.max(outputs_test, 1)

In [71]:
data = {'id': np.array(df_test['id']),
       'target': np.array(predicted_test)}

In [72]:
df_submission = pd.DataFrame(data)
df_submission.to_csv('submission_fasttext.csv', encoding='utf-8', index=False)

In [77]:
clf_xgb = xgb.XGBClassifier(objective='binary:logistic',
                            gamma=0.25,
                            learn_rate=0.00001,
                            max_depth=5,
                            reg_lambda=0.25,
                            scale_pos_weight=1,
                            subsample=0.9,
                            n_estimators=30,
#                             use_label_encoder=False,
                            seed=42)
clf_xgb.fit(X_train,
            y_train,
            verbose=True,
            early_stopping_rounds=5,
            eval_metric='error',     # used error as well
            eval_set=[(X_test, y_test)])

bst = clf_xgb.get_booster()

Parameters: { "learn_rate" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBoostError: [21:21:41] C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/data/data.cc:592: Check failed: labels_.Size() == num_row_ (1142 vs. 3263) : Size of labels must equal to number of rows.