https://github.com/andikarachman/News-Title-Classification/blob/master/News_Title_Classification.ipynb

### Preparation

In [1]:

# Data analysis packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

# Deep learning packages
import torch
import torch.nn as nn
import torch.utils.data
from torch.utils.data import TensorDataset, DataLoader

# Miscellaneous
!pip install  -U bcolz
import bcolz
import pickle
import re



Collecting bcolz
[?25l  Downloading https://files.pythonhosted.org/packages/5c/4e/23942de9d5c0fb16f10335fa83e52b431bcb8c0d4a8419c9ac206268c279/bcolz-1.2.1.tar.gz (1.5MB)
[K     |▎                               | 10kB 20.2MB/s eta 0:00:01[K     |▌                               | 20kB 27.1MB/s eta 0:00:01[K     |▊                               | 30kB 19.2MB/s eta 0:00:01[K     |█                               | 40kB 22.4MB/s eta 0:00:01[K     |█▏                              | 51kB 25.4MB/s eta 0:00:01[K     |█▍                              | 61kB 20.6MB/s eta 0:00:01[K     |█▋                              | 71kB 16.2MB/s eta 0:00:01[K     |█▉                              | 81kB 14.8MB/s eta 0:00:01[K     |██                              | 92kB 15.6MB/s eta 0:00:01[K     |██▎                             | 102kB 15.2MB/s eta 0:00:01[K     |██▌                             | 112kB 15.2MB/s eta 0:00:01[K     |██▊                             | 122kB 15.2MB/s eta 0:00:

In [2]:
# Loading data 
! git clone https://github.com/chaoyangzhengnash/ML2-Homework
# Every CSV in the cloned repository is parsed with the same reader options.
csv_options = dict(sep=",", header='infer')

reference = pd.read_csv('ML2-Homework/reference.csv', **csv_options)
test = pd.read_csv('ML2-Homework/test.csv', **csv_options)
text = pd.read_csv('ML2-Homework/text.csv', **csv_options)
train = pd.read_csv('ML2-Homework/train.csv', **csv_options)
sample = pd.read_csv('ML2-Homework/sample.csv', **csv_options)


Cloning into 'ML2-Homework'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 20 (delta 4), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (20/20), done.


In [3]:
# Attach the title text to every labelled training id.
train_full = pd.merge(train, text, how='left', left_on='id', right_on='id')
# Drop weird rows (index 10218 is removed; the notebook does not say what is wrong with it)
train_full = train_full.drop(index=10218)

# Same join for the unlabelled ids, which receive the placeholder label 99.
test_full = pd.merge(test, text, how='left', left_on='id', right_on='id').assign(label=99)

# Labelled rows first, unlabelled rows after, under a fresh 0..n-1 index.
result = pd.concat([train_full, test_full], sort=False, ignore_index=True)
train_full = result

titles, labels = train_full['title'], train_full['label']
print(train_full.head())
print("-------------------")
labels.value_counts()


   id  label                                              title
0   0      1  interactive visual exploration of neighbor bas...
1   3      1  relational division four algorithms and their ...
2   6      1  simplifying xml schema effortless handling of ...
3   8      0  funbase a function based information managemen...
4   9      0  inverted matrix efficient discovery of frequen...
-------------------


99    12782
0      2936
1      2921
2      2839
3      2510
4      1572
Name: label, dtype: int64

### Data Processing 

In [4]:
# Lowercase all words
titles = titles.str.lower()
# tokenize all titles in the data (whitespace split)
titles_token = titles.str.split()

# Remove stop words
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# One filtered token list per title; titles keep their original order.
titles_token2 = [
    [word for word in tokens if word not in stop_words]
    for tokens in titles_token
]
titles_token = pd.Series(titles_token2)

# Number of remaining tokens per title, computed once and reused for every statistic below.
title_lengths = titles_token.apply(len)

print('Average word length of titles is {0:.0f}.'.format(np.mean(title_lengths)))
print('Max word length of titles is {0:.0f}.'.format(np.max(title_lengths)))
print('Min word length of titles is {0:.0f}.'.format(np.min(title_lengths)))
print()
print("--------- Distribution of word length of titles ---------")

#-----------------------------------------------------------------
count = title_lengths.tolist()
print(title_lengths.value_counts())

#-----------------------------------------------------------------
print()
print("--------- Words occurence ---------")
def track_vocab(sentences, verbose =  True):
    """Tally how often each token occurs across all sentences.

    Args:
        sentences: iterable of token sequences (each an iterable of hashable tokens).
        verbose: accepted for compatibility; this function does not read it.

    Returns:
        dict mapping token -> number of occurrences, keys in first-seen order.
    """
    tally = {}
    for tokens in sentences:
        for token in tokens:
            # Missing tokens start from zero, so the first sighting stores 1.
            tally[token] = tally.get(token, 0) + 1
    return tally
# count the occurrence of all words in the data
vocab_count = track_vocab(titles_token)
# Preview the first ten (word, count) pairs in insertion order.
print(dict(list(vocab_count.items())[:10]))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Average word length of titles is 6.
Max word length of titles is 22.
Min word length of titles is 1.

--------- Distribution of word length of titles ---------
6     5134
5     4905
7     4150
4     3174
8     2806
9     1728
3     1448
10     957
11     453
2      408
12     211
13     102
14      37
1       21
17       8
15       8
16       7
18       1
20       1
22       1
dtype: int64

--------- Words occurence ---------
{'interactive': 220, 'visual': 129, 'exploration': 78, 'neighbor': 40, 'based': 2126, 'patterns': 209, 'data': 2204, 'streams': 164, 'relational': 472, 'division': 7}


In [0]:
#Encode the Data
def create_lookup_tables(vocab_count):
    """Build word<->index lookup tables ranked by descending frequency.

    Args:
        vocab_count: dict mapping word -> occurrence count.

    Returns:
        (vocab_to_int, int_to_vocab): two dicts that are inverses of each other.
        Indices start at 0 for the most frequent word; ties keep the iteration
        order of ``vocab_count``.

    NOTE(review): index 0 is therefore a real word, while ``pad_features`` in
    this notebook also fills padding positions with 0, so the two are
    indistinguishable downstream. Changing the numbering here would also
    require resizing the embedding weight matrix built later.
    """
    ranked_words = sorted(vocab_count, key=lambda w: vocab_count[w], reverse=True)

    vocab_to_int = {}
    int_to_vocab = {}
    for position, token in enumerate(ranked_words):
        vocab_to_int[token] = position
        int_to_vocab[position] = token

    return vocab_to_int, int_to_vocab

vocab_to_int, int_to_vocab = create_lookup_tables(vocab_count)

# encode the data: every token is replaced by its frequency-rank index
title_ints = [[vocab_to_int[word] for word in title] for title in titles_token]

In [0]:
def pad_features(sentences_token, seq_length):
    """Left-pad (or truncate) integer-encoded sentences to a fixed width.

    Args:
        sentences_token: sequence of integer sequences, one per sentence.
        seq_length: number of columns in the returned matrix.

    Returns:
        Integer ndarray of shape (len(sentences_token), seq_length). Each row
        holds the sentence right-aligned with zeros in front. Sentences longer
        than ``seq_length`` keep only their first ``seq_length`` tokens, and an
        empty sentence yields an all-zero row.
    """
    # getting the correct rows x cols shape
    features = np.zeros((len(sentences_token), seq_length), dtype=int)

    for i, row in enumerate(sentences_token):
        tokens = np.asarray(row)[:seq_length]
        if tokens.size == 0:
            # `features[i, -0:]` would address the whole row and fail to
            # broadcast an empty array, so leave the row as padding.
            continue
        features[i, -tokens.size:] = tokens

    return features

# pad the titles to one fixed width (22 is the maximum title length printed by the statistics cell)
seq_length = 22
features = pad_features(sentences_token=title_ints, seq_length=seq_length)

### Define Training, Validation, and Test Set




In [7]:
# Display the padded index matrix: one row per title, seq_length columns, zeros on the left.
features

array([[    0,     0,     0, ...,   117,     0,   158],
       [    0,     0,     0, ...,  1246,    45,    44],
       [    0,     0,     0, ...,  2075,   936,   446],
       ...,
       [    0,     0,     0, ...,    81,   135,    28],
       [    0,     0,     0, ...,  1683,    53, 13158],
       [    0,     0,     0, ...,   857,   153,  1710]])

In [0]:
# Pick up real test data
# The combined frame was built as [labelled train rows, unlabelled test rows] and the
# unlabelled rows carry the placeholder label 99, so the boundary is derived from the
# data instead of being hard-coded (it used to be the literal 12778).
# Labels are read from `train_full` and neither `features` nor `labels` is rebound,
# which keeps this cell safe to re-run.
all_labels = train_full['label']
assert len(features) == len(all_labels), "features and labels are out of sync"

is_labelled = (all_labels != 99).to_numpy()
n_labelled = int(is_labelled.sum())
assert is_labelled[:n_labelled].all(), "labelled rows must precede unlabelled rows"

features_a = features[:n_labelled]
test_X_real = features[n_labelled:]

labels_a = all_labels.iloc[:n_labelled]
test_y_real = all_labels.iloc[n_labelled:]

# Define Training, Validation, and Test Set
# 80% train, then the remaining 20% is halved into validation and test;
# both splits are stratified on the class label.
train_X, val_test_X, train_y, val_test_y = train_test_split(features_a, labels_a,
                                                            test_size=0.2,
                                                            random_state=42, shuffle=True,
                                                            stratify=labels_a)

val_X, test_X, val_y, test_y = train_test_split(val_test_X, val_test_y,
                                                test_size=0.5,
                                                random_state=42, shuffle=True,
                                                stratify=val_test_y)

In [0]:
# define data loaders
def _as_tensor_dataset(inputs, targets):
    """Wrap a pair of array-likes as a TensorDataset of (inputs, targets)."""
    return TensorDataset(torch.from_numpy(np.asarray(inputs)),
                         torch.from_numpy(np.asarray(targets)))

# create Tensor datasets
train_data = _as_tensor_dataset(train_X, train_y)
valid_data = _as_tensor_dataset(val_X, val_y)
test_data = _as_tensor_dataset(test_X, test_y)
test_data_real = _as_tensor_dataset(test_X_real, test_y_real)

# dataloaders
batch_size = 50
num_workers = 8
loader_options = dict(batch_size=batch_size, num_workers=num_workers)

# make sure to SHUFFLE the training data
train_loader = DataLoader(train_data, shuffle=True, **loader_options)
valid_loader = DataLoader(valid_data, shuffle=True, **loader_options)
test_loader = DataLoader(test_data, shuffle=True, **loader_options)
# Kept in order: its predictions are later written back to `test` row by row.
test_loader_real = DataLoader(test_data_real, shuffle=False, **loader_options)


In [10]:
# obtain one batch of training data
dataiter = iter(train_loader)
# Use the built-in next(): DataLoader iterators in current PyTorch releases
# do not provide a `.next()` method.
sample_x, sample_y = next(dataiter)

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 22])
Sample input: 
 tensor([[   0,    0,    0,  ..., 5958,  899,   13],
        [   0,    0,    0,  ...,    8,   48,   28],
        [   0,    0,    0,  ...,   10,  494,  482],
        ...,
        [   0,    0,    0,  ...,  299, 3017, 9247],
        [   0,    0,    0,  ..., 2915, 1622,  872],
        [   0,    0,    0,  ...,    2,  176, 1757]])

Sample label size:  torch.Size([50])
Sample label: 
 tensor([3, 3, 1, 0, 3, 3, 1, 2, 1, 2, 2, 0, 1, 1, 0, 0, 1, 2, 1, 3, 2, 4, 1, 0,
        3, 1, 4, 3, 0, 4, 0, 4, 3, 2, 1, 1, 4, 0, 2, 4, 4, 2, 2, 4, 2, 2, 1, 0,
        1, 1])


### 5.0. Build Network Architecture


In [11]:
# Preparation for glove
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2020-04-26 05:14:54--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-04-26 05:14:54--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-04-26 05:14:55--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [0]:
# Parse the 300-d GloVe text file into (a) an on-disk bcolz array of vectors and
# (b) pickled word list / word->row-index lookup.
words = []
idx = 0
word2idx = {}
# Appendable on-disk array, seeded with a single 0.0 that is sliced off below.
vectors = bcolz.carray(np.zeros(1), rootdir='glove.6B.300.dat', mode='w')

with open('glove.6B.300d.txt', 'rb') as f:
    for l in f:
        # Each line is "<word> <v1> ... <v300>" separated by whitespace.
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        # np.float was removed from NumPy; np.float64 is the type it aliased.
        vect = np.array(line[1:]).astype(np.float64)
        vectors.append(vect)

# Drop the seed element and fold the flat buffer into one 300-d row per parsed word.
vectors = bcolz.carray(vectors[1:].reshape((len(words), 300)), rootdir='glove.6B.300.dat', mode='w')
vectors.flush()

# Context managers ensure the pickle files are flushed and closed.
with open('glove.6B.300_words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open('glove.6B.300_idx.pkl', 'wb') as idx_file:
    pickle.dump(word2idx, idx_file)

In [0]:
# create a dictionary that given a word returns its vector
vectors = bcolz.open('glove.6B.300.dat')[:]

# These pickles are the files written by the parsing cell above; only ever
# unpickle files you created yourself (pickle can execute arbitrary code).
with open('glove.6B.300_words.pkl', 'rb') as words_file:
    words = pickle.load(words_file)
with open('glove.6B.300_idx.pkl', 'rb') as idx_file:
    word2idx = pickle.load(idx_file)

glove = {w: vectors[word2idx[w]] for w in words}

In [0]:
# Build the embedding weight matrix: row i holds the vector for the word whose
# integer code is i (same frequency ordering as create_lookup_tables).
sorted_vocab = sorted(vocab_count, key=lambda w: vocab_count[w], reverse=True)
target_vocab = sorted_vocab
emb_dim = 300

matrix_len = len(target_vocab)
weights_matrix = np.zeros((matrix_len, emb_dim))
words_found = 0

for row, token in enumerate(target_vocab):
    if token in glove:
        # Known word: copy its pretrained GloVe vector.
        weights_matrix[row] = glove[token]
        words_found += 1
    else:
        # Out-of-vocabulary word: random Gaussian initialisation.
        weights_matrix[row] = np.random.normal(scale=0.6, size=(emb_dim, ))

def create_emb_layer(weights_matrix, non_trainable=False):
    """Create an nn.Embedding initialised from a precomputed weight matrix.

    Args:
        weights_matrix: 2-D NumPy array of shape (num_embeddings, embedding_dim).
        non_trainable: when True the embedding weights are frozen
            (``requires_grad`` is set to False).

    Returns:
        (emb_layer, num_embeddings, embedding_dim)
    """
    vocab_size, vector_size = weights_matrix.shape
    layer = nn.Embedding(vocab_size, vector_size)

    # Overwrite the randomly initialised weights in place; the values are cast
    # to the layer's own dtype during the copy.
    with torch.no_grad():
        layer.weight.copy_(torch.from_numpy(weights_matrix))

    layer.weight.requires_grad = not non_trainable

    return layer, vocab_size, vector_size

### Define RNN Architecture

In [15]:
# Detect CUDA once; later cells branch on this flag.
train_on_gpu = torch.cuda.is_available()

print('Training on GPU.' if train_on_gpu else 'No GPU available, training on CPU.')

Training on GPU.


In [0]:
# implementation of attention layer
class Attention(nn.Module):
    """Attention pooling over the time axis.

    Scores every time step with a learned projection (plus an optional
    per-step bias), squashes with tanh, exponentiates, normalises across steps
    and returns the weighted sum of the inputs.

    Args:
        feature_dim: size of the last dimension of the input.
        step_dim: number of time steps in the input.
        bias: whether to learn one additive bias per time step.
    """

    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0

        # (feature_dim, 1) projection turning each step's features into one score.
        self.weight = nn.Parameter(torch.zeros(feature_dim, 1))
        nn.init.kaiming_uniform_(self.weight)

        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))

    def forward(self, x, mask=None):
        """Pool ``x`` over dim 1.

        Args:
            x: tensor whose elements can be viewed as (-1, feature_dim) rows,
                step_dim rows per sample.
            mask: optional multiplier applied to the un-normalised weights.

        Returns:
            ``x`` weighted by the attention coefficients and summed over dim 1.
        """
        flat = x.contiguous().view(-1, self.feature_dim)
        scores = flat.mm(self.weight).view(-1, self.step_dim)

        if self.bias:
            scores = scores + self.b

        weights = torch.exp(torch.tanh(scores))

        if mask is not None:
            weights = weights * mask

        # The small epsilon guards against division by zero when every weight is masked out.
        weights = weights / (weights.sum(1, keepdim=True) + 1e-10)

        return (x * weights.unsqueeze(-1)).sum(1)

In [0]:
# define of rnn

import torch.nn as nn

class RNN(nn.Module):
    """
    The RNN model that will be used to perform classification.

    Pipeline: frozen pretrained embedding -> BiLSTM -> BiGRU -> attention
    pooling over time -> BiLSTM -> BiGRU (on the single pooled step) ->
    Linear(64) + ReLU -> Linear(output_size). The output is raw class scores
    (no softmax is applied).
    """

    def __init__(self, weights_matrix, output_size, hidden_dim, drop_prob=0.1, seq_len=None):
        """
        Initialize the model by setting up the layers.

        Args:
            weights_matrix: 2-D NumPy array of pretrained embedding weights.
            output_size: number of classes.
            hidden_dim: hidden size of each recurrent direction.
            drop_prob: probability given to the (currently unused) dropout layer.
            seq_len: number of time steps per input sequence. When omitted, the
                notebook-level ``seq_length`` is used, which keeps existing
                calls working unchanged.
        """
        super(RNN, self).__init__()

        self.output_size = output_size
        self.hidden_dim = hidden_dim
        self.seq_len = seq_length if seq_len is None else seq_len

        # embedding layers (weights are frozen: non_trainable=True)
        self.embedding, self.num_embeddings, self.embedding_dim = create_emb_layer(weights_matrix, True)

        # embedding dropout
        # NOTE(review): this layer is constructed but never applied in forward();
        # it is left that way here so results do not change.
        self.dropout = nn.Dropout2d(drop_prob)

        # First lstm and GRU layers
        self.lstm1 = nn.LSTM(self.embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.gru1 = nn.GRU(hidden_dim * 2, hidden_dim, bidirectional=True, batch_first=True)

        # attention layer
        self.attention = Attention(hidden_dim*2, self.seq_len)

        # Second lstm and GRU layers
        self.lstm2 = nn.LSTM(hidden_dim * 2, hidden_dim, batch_first=True, bidirectional=True)
        self.gru2 = nn.GRU(hidden_dim * 2, hidden_dim, bidirectional=True, batch_first=True)

        # linear
        self.fc = nn.Linear(hidden_dim * 2, 64)
        self.out = nn.Linear(64, self.output_size)

        self.relu = nn.ReLU()


    def forward(self, x):
        """
        Perform a forward pass of our model on some inputs.

        Args:
            x: integer tensor of token indices, one row per sequence.

        Returns:
            Tensor of raw class scores with ``output_size`` columns.
        """
        batch_size = x.size(0)

        # embedding output
        # The former squeeze(unsqueeze(.)) round-trip is gone: it was a no-op
        # except that it also removed the batch axis for a batch of size 1.
        embeds = self.embedding(x.long())

        # lstm, gru, and attention outputs
        lstm_out, _ = self.lstm1(embeds)
        gru_out, _ = self.gru1(lstm_out)
        # No mask is passed. The literal 256 used to be passed as `mask`, which
        # only rescaled every weight by a constant that normalisation cancelled.
        attention_out = self.attention(gru_out)
        # Re-insert a length-1 time axis so the pooled vector can feed the second recurrent stack.
        attention_out = attention_out.view(batch_size, -1, self.hidden_dim * 2)
        lstm_out, _ = self.lstm2(attention_out)
        gru_out, _ = self.gru2(lstm_out)

        # linear outputs
        out = gru_out.view(-1, gru_out.shape[2])
        fc_out = self.relu(self.fc(out))
        final_out = self.out(fc_out)

        return final_out

### Instantiate the model w/ hyperparams


In [18]:
# Instantiate the model w/ hyperparams
# (`weights_matrix` comes from the embedding cell; the no-op self-assignment was removed.)
output_size = 5   # class labels 0..4
hidden_dim = 256

net = RNN(weights_matrix, output_size, hidden_dim)

print(net)

RNN(
  (embedding): Embedding(13159, 300)
  (dropout): Dropout2d(p=0.1, inplace=False)
  (lstm1): LSTM(300, 256, batch_first=True, bidirectional=True)
  (gru1): GRU(512, 256, batch_first=True, bidirectional=True)
  (attention): Attention()
  (lstm2): LSTM(512, 256, batch_first=True, bidirectional=True)
  (gru2): GRU(512, 256, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=64, bias=True)
  (out): Linear(in_features=64, out_features=5, bias=True)
  (relu): ReLU()
)


### Training

In [0]:
# NOTE(review): this overrides the CUDA check made earlier, so the cells below
# run on the CPU even when a GPU was detected — confirm this is intentional.
train_on_gpu = False

In [20]:
# Hyper params
lr = 0.001
criterion = nn.CrossEntropyLoss()
# Only parameters that require gradients are optimised (the embedding is frozen).
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
epochs = 3
counter = 0
print_every = 100
clip = 5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):

    # batch loop
    # (loop variables are named `targets` so the notebook-level `labels` is not overwritten)
    for inputs, targets in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, targets = inputs.cuda(), targets.cuda()

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output = net(inputs)

        # calculate the loss and perform backprop
        loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_losses = []
            net.eval()
            # Validation needs no gradients; skipping autograd saves memory and time.
            with torch.no_grad():
                for val_inputs, val_targets in valid_loader:

                    if(train_on_gpu):
                        val_inputs, val_targets = val_inputs.cuda(), val_targets.cuda()

                    val_output = net(val_inputs)
                    val_loss = criterion(val_output, val_targets)

                    val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/3... Step: 100... Loss: 1.145034... Val Loss: 1.250795
Epoch: 1/3... Step: 200... Loss: 1.167899... Val Loss: 1.082021
Epoch: 2/3... Step: 300... Loss: 0.837045... Val Loss: 0.978574
Epoch: 2/3... Step: 400... Loss: 0.834598... Val Loss: 0.878722
Epoch: 3/3... Step: 500... Loss: 1.088658... Val Loss: 0.865168
Epoch: 3/3... Step: 600... Loss: 0.761859... Val Loss: 0.817528


In [21]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

net.eval()
# iterate over test data; no gradients are needed for evaluation
with torch.no_grad():
    # (loop variable is named `targets` so the notebook-level `labels` is not overwritten)
    for inputs, targets in test_loader:

        if(train_on_gpu):
            inputs, targets = inputs.cuda(), targets.cuda()

        # get predicted outputs (raw class scores, one column per class)
        output = net(inputs)

        # calculate loss
        test_loss = criterion(output, targets)
        test_losses.append(test_loss.item())

        # predicted class (0, 1, 2, 3, 4) = index of the highest score
        pred = output.argmax(dim=1)

        # compare predictions to true label; .item() works for CPU and CUDA tensors alike
        num_correct += (pred == targets).sum().item()


# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.886
Test accuracy: 0.680


### To get prediction for uploading

In [0]:
# Predict a class for every row of the unlabelled (real) test set

preds = []
# Make sure the model is in inference mode even if this cell is run on its own.
net.eval()
with torch.no_grad():
    # iterate over test data; the second element of each batch is only the placeholder label
    for inputs, _ in test_loader_real:

        if(train_on_gpu):
            inputs = inputs.cuda()

        # get predicted outputs (raw class scores, one column per class)
        output = net(inputs)

        # predicted class (0, 1, 2, 3, 4) = index of the highest score, kept as a column vector
        pred = output.argmax(dim=1, keepdim=True)
        # .cpu() is required before .numpy() when the model runs on the GPU
        preds.append(pred.cpu().numpy())


### save prediction

In [0]:
# Stack the per-batch prediction arrays along the first axis (batch order is preserved)
# and store them as the label column of the submission frame.
preds2 = np.concatenate(preds)
test["label"] = preds2

In [0]:
# Write the submission file (id + predicted label) without the DataFrame index column.
test.to_csv(r'preds.csv', index = False)
