Code for CNN based model

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd /content/drive/MyDrive/NLP Team 3

/content/drive/.shortcut-targets-by-id/1trlqfp58enyS7BnzsPH-AWyMet7nZnDT/NLP Team 3


In [5]:
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
# from sentence_transformers import SentenceTransformer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import re
import pandas as pd
import numpy as np

In [6]:
df = pd.read_csv('train_glove_clean.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,keyword,location,text,target
0,0,1,,,deeds reason earthquake may allah forgive us,1
1,1,4,,,forest fire near la ronge sask canada,1
2,2,5,,,residents asked shelter place notified officer...,1
3,3,6,,,people receive wildfires evacuation orders cal...,1
4,4,7,,,got sent photo ruby alaska smoke wildfires pou...,1


In [7]:
def clean(text):
    """ Function to clean the text """
    text = text.lower()
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"",texter)
    texter = re.sub('&#39;', "\"", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u '," you ", texter)
    texter = re.sub('`',"", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ',texter)
    
    # Remove numbers from string
    texter = re.sub(pattern=r"[+-]?\d+(?:\.\d+)?", repl="", string=texter, count=0, flags=0)
    texter = texter.replace("  ", " ")
    clean = re.compile('<.*?>')
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    texter = re.sub(clean, '', texter)
    texter = re.sub(r'[^\w\s]', '', texter)
    if texter == "":
        texter = ""
    return texter

In [46]:
import pickle

with open('CBoW.pkl', 'rb') as f:
    vocab = pickle.load(f)

input_dim = len(list(vocab.values())[0])

In [47]:
max_len = 50
zero_padding = [0]*input_dim

def get_embeddings(text, emb="LSTM"):
    if emb == "DNN":
        embedding = [0]*input_dim
        i = 0
        for word in text.split(' '):
            if word in vocab:
                i += 1
                embedding += vocab[word]

        if i != 0:
            embedding /= i

#         scale = 0 # noise weight_scale
#         noise = np.random.randn(input_dim)*scale
#         embedding += noise
    elif emb == "LSTM" or emb == "CNN":
        embedding = []
        i = 0
        for word in text.split(' '):
            if i == max_len:
                break
            if word in vocab:
                i += 1
                embedding.append(vocab[word])
        
        while i < max_len:
            i += 1
            embedding.append(zero_padding)
            
    return embedding

In [48]:
def transform(X, emb="LSTM"):
    embeddings = []
    for item in X:
        item = clean(item)
        embedding = get_embeddings(item, emb)
        embeddings.append(embedding)
    
    return embeddings

In [49]:
## Create embeddings for input
# cleaned_X = np.array(transform(df['text'], "DNN"))
cleaned_X = np.array(transform(df['text'], "CNN"))
y = np.array(df['target'])

In [50]:
# Hyperparameters
batch_size = 128
dropout = 0.25

In [51]:
X_train, X_test, y_train, y_test = train_test_split(cleaned_X, y, test_size=0.1, random_state=42)

X_tr = torch.tensor(X_train, dtype=torch.float)
y_tr = torch.tensor(y_train)
train = TensorDataset(X_tr, y_tr)
trainloader = DataLoader(train, batch_size=batch_size)

X_te = torch.tensor(X_test, dtype=torch.float)
y_te = torch.tensor(y_test)
test = TensorDataset(X_te, y_te)
testloader = DataLoader(test)

In [53]:
class ConvNet(nn.Module):
    def __init__(self, num_classes=2):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc1 = nn.Linear(9600*2, 1000)
        self.fc2 = nn.Linear(1000, num_classes)
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc1(out)
        out = F.relu(out)
        out = self.fc2(out)
        return F.log_softmax(out, dim=1)

cnn = ConvNet(2)

In [54]:
num_epochs = 8
learning_rate = 0.001

In [55]:
# Loss function
criterion = nn.CrossEntropyLoss()

# create your optimizer
optimizer = optim.Adam(cnn.parameters(), lr=learning_rate)


In [56]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [57]:
cnn.to(device)

ConvNet(
  (layer1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Linear(in_features=19200, out_features=1000, bias=True)
  (fc2): Linear(in_features=1000, out_features=2, bias=True)
)

In [58]:
training_loss = []
for epoch in range(num_epochs):
    cnn.train()
    for i, data in enumerate(trainloader):
        inputs, labels = data
        inputs, labels = inputs.view(inputs.shape[0],1,inputs.shape[-2], inputs.shape[-1]).to(device), labels.to(device)

        outputs = cnn(inputs)
        loss = criterion(outputs, labels)
        training_loss.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 25 == 0:
            print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                   %(epoch+1, num_epochs, i+1, len(trainloader), loss.data))
    
    cnn.eval()
    outputs = cnn(X_te.view(X_te.shape[0],1,X_te.shape[-2], X_te.shape[-1]).to(device))

    _, predicted = torch.max(outputs, 1)

    total = y_te.size(0)
    correct = (predicted.cpu() == y_te).sum()

    print(f'Accuracy of the model is: {100*correct/total:.2f}%')

Epoch [1/8], Step [25/54], Loss: 0.6230
Epoch [1/8], Step [50/54], Loss: 0.7360
Accuracy of the model is: 57.09%
Epoch [2/8], Step [25/54], Loss: 0.5350
Epoch [2/8], Step [50/54], Loss: 0.7545
Accuracy of the model is: 67.06%
Epoch [3/8], Step [25/54], Loss: 0.5168
Epoch [3/8], Step [50/54], Loss: 0.6120
Accuracy of the model is: 71.78%
Epoch [4/8], Step [25/54], Loss: 0.5290
Epoch [4/8], Step [50/54], Loss: 0.6207
Accuracy of the model is: 71.52%
Epoch [5/8], Step [25/54], Loss: 0.4641
Epoch [5/8], Step [50/54], Loss: 0.6337
Accuracy of the model is: 71.00%
Epoch [6/8], Step [25/54], Loss: 0.4643
Epoch [6/8], Step [50/54], Loss: 0.6062
Accuracy of the model is: 69.95%
Epoch [7/8], Step [25/54], Loss: 0.4529
Epoch [7/8], Step [50/54], Loss: 0.6021
Accuracy of the model is: 73.23%
Epoch [8/8], Step [25/54], Loss: 0.4992
Epoch [8/8], Step [50/54], Loss: 0.5520
Accuracy of the model is: 70.08%


In [59]:
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [60]:
test_X = np.array(transform(test_df['text']))
test_X = torch.tensor(test_X, dtype=torch.float)

In [61]:
cnn.eval()
outputs_test = torch.zeros(test_X.shape[0], 2)
te_X = test_X.view(test_X.shape[0],1,test_X.shape[-2], test_X.shape[-1])
for i in range(0,test_X.shape[0],128):
  outputs_test[i:min(i+128,test_X.shape[0])] = cnn(te_X[i:min(i+128,test_X.shape[0])].to(device))

_, predicted_test = torch.max(outputs_test, 1)

In [62]:
data = {'id': np.array(test_df['id']),
       'target': np.array(predicted_test.cpu())}

In [63]:
df_submission = pd.DataFrame(data)
df_submission.to_csv('submission_cnn_cbow.csv', encoding='utf-8', index=False)