In [129]:
import numpy as np
import pandas as pd
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pickle

from sklearn.metrics import f1_score
from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [130]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [131]:
def clean(text):
    """ Function to clean the text """
    text = text.lower()
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"",texter)
    texter = re.sub('&#39;', "\"", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u '," you ", texter)
    texter = re.sub('`',"", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ',texter)
    
    # Remove numbers from string
    texter = re.sub(pattern=r"[+-]?\d+(?:\.\d+)?", repl="", string=texter, count=0, flags=0)
    texter = texter.replace("  ", " ")
    clean = re.compile('<.*?>')
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    texter = re.sub(clean, '', texter)
    texter = re.sub(r'[^\w\s]', '', texter)
    if texter == "":
        texter = ""
    return texter

In [132]:
clean_X = []
for item in df['text']:
    item = clean(item)
    clean_X.append(item)

In [133]:
vectorizer = CountVectorizer()
X = np.array(vectorizer.fit_transform(clean_X).toarray())
y = np.array(df['target'])

In [87]:
# vectorizer = TfidfVectorizer(min_df=3)
# X = np.array(vectorizer.fit_transform(clean_X).toarray())
# y = np.array(df['target'])

In [134]:
X.shape

(7613, 21344)

In [158]:
# Hyperparameters
num_epochs = 10
batch_size = 128
learning_rate = 0.00001
dropout = 0.25

In [159]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

X_tr = torch.tensor(X_train, dtype=torch.float)
y_tr = torch.tensor(y_train)
train = TensorDataset(X_tr, y_tr)
trainloader = DataLoader(train, batch_size=batch_size)

X_te = torch.tensor(X_test, dtype=torch.float)
y_te = torch.tensor(y_test)
test = TensorDataset(X_te, y_te)
testloader = DataLoader(test)

In [160]:
class Net(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(21344, 500)
#         self.fc1 = nn.Linear(4271, 500)
        self.hidden1 = nn.Linear(500, 500)
        self.hidden2 = nn.Linear(500, 500)
        self.fc2 = nn.Linear(500, 2)
        self.dropout = nn.Dropout(0.25)
        
        self.batchnorm1 = nn.BatchNorm1d(500)
        self.batchnorm2 = nn.BatchNorm1d(500)
        self.batchnorm3 = nn.BatchNorm1d(500)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.batchnorm1(x)
        x = self.dropout(x)
        x = F.relu(self.hidden1(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = F.relu(self.hidden2(x))
        x = self.batchnorm3(x)
        x = self.dropout(x)
#         x = self.fc2(x)
        x = F.log_softmax(self.fc2(x), dim=1)
        return x

net = Net()

In [161]:
# Loss function
criterion = nn.CrossEntropyLoss()

# create your optimizer
optimizer = optim.Adam(net.parameters(), lr=learning_rate)

In [162]:
# Training
net.train()
training_loss = []
for epoch in range(num_epochs):
    net.train()
    for i, data in enumerate(trainloader):
        inputs, labels = data

        outputs = net(inputs)
        loss = criterion(outputs, labels)
        training_loss.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print ('Epoch [%d/%d], Loss: %.4f'
                   %(epoch+1, num_epochs, loss.data))
    
    net.eval()
    outputs = net(X_te)
    _, predicted = torch.max(outputs, 1)
    total = y_te.size(0)
    correct = (predicted == y_te).sum()
    print(f'Accuracy of the model is: {100*correct/total:.2f}%')

Epoch [1/10], Loss: 0.7008
Accuracy of the model is: 56.13%
Epoch [2/10], Loss: 0.5563
Accuracy of the model is: 68.56%
Epoch [3/10], Loss: 0.5065
Accuracy of the model is: 71.37%
Epoch [4/10], Loss: 0.4358
Accuracy of the model is: 73.47%
Epoch [5/10], Loss: 0.3621
Accuracy of the model is: 74.87%
Epoch [6/10], Loss: 0.3777
Accuracy of the model is: 76.62%
Epoch [7/10], Loss: 0.2779
Accuracy of the model is: 76.80%
Epoch [8/10], Loss: 0.2461
Accuracy of the model is: 76.97%
Epoch [9/10], Loss: 0.2166
Accuracy of the model is: 77.50%
Epoch [10/10], Loss: 0.1760
Accuracy of the model is: 77.23%


In [163]:
# Validation
net.eval()
outputs = net(X_te)

_, predicted = torch.max(outputs, 1)

total = y_te.size(0)
correct = (predicted == y_te).sum()

print(f'Accuracy of the model is: {100*correct/total:.2f}%')

Accuracy of the model is: 77.23%


In [164]:
f1_score(y_te,predicted)

0.7123893805309734

In [165]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [174]:
clean_X_test = []
for item in df_test['text']:
    item = clean(item)
    clean_X_test.append(item)
    
X_test = np.array(vectorizer.transform(clean_X_test).toarray())
X_test = torch.tensor(X_test, dtype=torch.float)

In [175]:
# Testing
net.eval()
outputs_test = net(X_test)

_, predicted_test = torch.max(outputs_test, 1)

In [176]:
data = {'id': np.array(df_test['id']),
       'target': np.array(predicted_test)}

In [177]:
df_submission = pd.DataFrame(data)
df_submission.to_csv('submission_bow.csv', encoding='utf-8', index=False)