In [2]:
import os
import pandas as pd

In [3]:
repo_path = "/home/dpap/Kaggle/DisasterTweets/"
data_path = "data"

os.chdir(repo_path)

In [4]:
os.listdir(data_path)

['sample_submission.csv', 'test.csv', 'train.csv']

In [5]:
df = pd.read_csv(os.path.join(data_path, "train.csv"))
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### Train/Test Split

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_df, y_df = df["text"], df["target"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.25, random_state=0)

In [24]:
print("train:", X_train.shape)
print("test:", X_test.shape)

train: (5709,)
test: (1904,)


### TF-IDF

In [36]:
# Create the vectorizer method
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()

In [37]:
tfidf_vec.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [38]:
X_train_tfidf = tfidf_vec.transform(X_train).toarray()
X_test_tfidf = tfidf_vec.transform(X_test).toarray()

In [39]:
X_train_tfidf.shape

(5709, 17868)

#### Naive Bayes classifier

In [40]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train_tfidf, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [42]:
# Print out the model's accuracy
nb.score(X_test_tfidf, y_test)

0.6244747899159664

Deep Net classifier

In [289]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [290]:
class SimpleNet(nn.Module):
    
    def __init__(self, input_size):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, round(input_size / 2))
        self.fc2 = nn.Linear(round(input_size / 2), 100)
        self.fc3 = nn.Linear(100, 1)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x
    
    def predict(self, x):
        #Apply sigmoid to output.
        pred = self.forward(x)
        return pred.round()

In [291]:
#Deeper NN
class SimpleNet_v2(nn.Module):
    
    def __init__(self, input_size):
        super(SimpleNet_v2, self).__init__()
        self.fc1 = nn.Linear(input_size, 1000)
        self.fc2 = nn.Linear(1000, 100)
        self.fc3 = nn.Linear(100, 100)
        self.fc4 = nn.Linear(100, 1)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = torch.sigmoid(self.fc4(x))
        return x
    
    def predict(self, x):
        #Apply sigmoid to output.
        pred = self.forward(x)
        return pred.round()

In [292]:
simple_classifier = SimpleNet_v2(17868)

In [293]:
def save_model(epoch, model, optimizer, scheduler):
    train_state = {    
    'model' : model,
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'scheduler': scheduler.state_dict()
    }
    torch.save(train_state, 'SimpleNet_v2.pt')
    pass

In [294]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(simple_classifier.parameters(), lr=0.0001, weight_decay = 0.0005);
lr_scheduler = ReduceLROnPlateau(optimizer, 'max', factor = 0.1, patience = 5, verbose=True)

# train_loader, test_loader = create_dataloaders(balanced_train_images, balanced_train_labels, pr_test_img, pr_test_labels)
best_accuracy = 0.0

## Training

In [300]:
%%time
EPOCH = 20
batchsize = 64

for epoch in range(EPOCH):  # loop over the dataset multiple times
    simple_classifier.train()
    running_loss = 0.0
    for i in range(0, len(y_train), batchsize):
        
        inputs = torch.tensor(X_train_tfidf[i:i+batchsize]).float()
        lbs = torch.tensor(y_train[i:i+batchsize].values).float()

        # zero the parameter gradients
        optimizer.zero_grad()
        
        # forward + backward + optimize
        outputs = simple_classifier(inputs)
        loss = criterion(outputs, lbs)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if (i/batchsize) % 30 == 29:    # print every 600 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, (i/batchsize) + 1, running_loss / 1000))
            running_loss = 0.0

    simple_classifier.eval()
    correct = 0
    total = 0
    for j in range(0, len(y_test), batchsize):
        labels = torch.tensor(y_test[j:j+batchsize].values)
        outputs = simple_classifier.predict(torch.tensor(X_test_tfidf[j:j+batchsize]).float())
        total += labels.shape[0]
        correct += outputs.squeeze().eq(labels).sum().item()
    cur_accuracy = correct / total
    print('Accuracy on EPOCH %d test images: %d %%' % (epoch+1, 100 * cur_accuracy))   
    lr_scheduler.step(cur_accuracy)
    if cur_accuracy >= best_accuracy:
            best_accuracy = cur_accuracy
            print("Saving current model!!!")
            print("Detailed Accuracy: %f" %(best_accuracy))
            save_model(epoch, simple_classifier, optimizer, lr_scheduler)
print('Finished Training')

[1,    30] loss: 0.008
[1,    60] loss: 0.007
[1,    90] loss: 0.007
Accuracy on EPOCH 1 test images: 81 %
[2,    30] loss: 0.008
[2,    60] loss: 0.007
[2,    90] loss: 0.006
Accuracy on EPOCH 2 test images: 81 %
Epoch    22: reducing learning rate of group 0 to 1.0000e-06.
[3,    30] loss: 0.007
[3,    60] loss: 0.006
[3,    90] loss: 0.006
Accuracy on EPOCH 3 test images: 81 %
[4,    30] loss: 0.007
[4,    60] loss: 0.006
[4,    90] loss: 0.006
Accuracy on EPOCH 4 test images: 81 %
[5,    30] loss: 0.007
[5,    60] loss: 0.006
[5,    90] loss: 0.006
Accuracy on EPOCH 5 test images: 81 %
[6,    30] loss: 0.007
[6,    60] loss: 0.006
[6,    90] loss: 0.006
Accuracy on EPOCH 6 test images: 81 %
[7,    30] loss: 0.007
[7,    60] loss: 0.006
[7,    90] loss: 0.006
Accuracy on EPOCH 7 test images: 81 %
[8,    30] loss: 0.007
[8,    60] loss: 0.006
[8,    90] loss: 0.006
Accuracy on EPOCH 8 test images: 81 %
Epoch    28: reducing learning rate of group 0 to 1.0000e-07.
[9,    30] loss: 0.0

In [301]:
# Load best model

best_classifier = simple_classifier

best_classifier.load_state_dict(torch.load('SimpleNet_v2.pt')['state_dict'])

<All keys matched successfully>

In [304]:
best_classifier.eval()
preds = best_classifier.predict(torch.tensor(X_test_tfidf).float()).detach().numpy()

In [307]:
(preds.squeeze() == y_test).sum() / y_test.shape[0]

0.9201261166579086

### Submit

In [328]:
df_test = pd.read_csv(os.path.join(data_path, "test.csv"))
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [329]:
submit_X, submit_y = df_test["text"], df_test["text"]

In [330]:
submit_X_tfidf = tfidf_vec.transform(submit_X).toarray()

In [331]:
submit_preds = best_classifier.predict(torch.tensor(submit_X_tfidf ).float()).detach().numpy().astype(int)

In [332]:
to_submit_df = pd.DataFrame(submit_preds, columns=["target"])

to_submit_df = pd.concat((df_test["id"], to_submit_df), axis = 1)

to_submit_df.to_csv("submission.csv", index=False)

In [333]:
!kaggle competitions submit -c nlp-getting-started  -f submission.csv -m "Test"

100%|██████████████████████████████████████| 22.2k/22.2k [00:02<00:00, 7.67kB/s]
Successfully submitted to Real or Not? NLP with Disaster Tweets