# Classification using a simple neural network

## Import Modules

In [1]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F

## Define a neural network for classification

In [3]:
class ANNc(nn.Module):
    def __init__(self, emb_size=64, **kwargs):
        """CNN  based analogy classifier model.

                It generates a value between 0 and 1 (0 for invalid, 1 for valid) based on four input
        vectors.
                1st layer (convolutional): 128 filters (= kernels) of size h × w = 1 × 2 with strides (1, 2)
        and relu activation.
                2nd layer (convolutional): 64 filters of size (2, 2) with strides (2, 2) and relu activation.
                3rd layer (dense, equivalent to linear for PyTorch): one output and sigmoid activation.

                Argument:
                emb_size -- the size of the input vectors"""
        super().__init__()
        self.emb_size = emb_size
        self.conv1 = nn.Conv2d(1, 128, (1, 2), stride=(1, 2))
        self.conv2 = nn.Conv2d(128, 64, (2, 2), stride=(2, 2))
        self.linear = nn.Linear(64 * (emb_size // 2), 1)

    def flatten(self, t):
        """Flattens  the input tensor."""
        t = t.reshape(t.size()[0], -1)
        return t

    def forward(self, a, b, c, d, p=0):
        """

        Expected  input shape:
        - a, b, c, d: [batch_size, emb_size]
        """
        image = torch.stack([a, b, c, d], dim=2)

        # apply dropout
        if p > 0:
            image = F.dropout(image, p)

        x = self.conv1(image.unsqueeze(-3))
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = self.flatten(x)
        x = self.linear(x)
        output = torch.sigmoid(x)
        return output

## Import the Dataset

In [4]:
import pandas as pd

data = pd.read_csv('./shuffled_combined_dataset.csv')

data.head(20)

Unnamed: 0,Entity1,Entity2,Entity3,Entity4,Label
0,Pat_Roberts,Politician,Yuliya_Menshova,News_presenter,1
1,catatonic_schizophrenia_Q1432717,catatonic_excitement_Q57769952,Naantali,Sheffield,0
2,François_Hollande,French-American_Foundation,Brazil_national_football_team,CONMEBOL,1
3,Glee_(season_2),British_Isles,Wyk_auf_Föhr,Iburi_Subprefecture,0
4,Anagrelide,Amiodarone,Maria_Pogonowska,Faculty_of_Physics_of_University_of_Warsaw_Q93...,0
5,_Q4109978,Hebrew_language,Men_Without_Women_Q12039063,Theologian,0
6,The_Vampire_Diaries_(season_4),After_School_Special_(The_Vampire_Diaries),San_Sebastián,Basque_Country_(autonomous_community),0
7,Georg_Mancelius,Indonesia,Heroes_(season_2),Kindred_(Heroes),0
8,_Q30926938,Psychiatrist,Nadarajan_Periasamy,motorcycle_racer_Q3014296,1
9,Jean-Marc_Vacheron,Princess_of_Asturias_Award_for_Communications_...,Marilyn_Monroe,Dave_McKean,0


## Cleanup Dataset

In [5]:
# clean up dataset by replacing _ with space, and make it lower case
for col in ['Entity1', 'Entity2', 'Entity3', 'Entity4']:
    data[col] = data[col].str.replace('_', ' ').str.lower().replace(r'[()]', '', regex=True)

data.head()

Unnamed: 0,Entity1,Entity2,Entity3,Entity4,Label
0,pat roberts,politician,yuliya menshova,news presenter,1
1,catatonic schizophrenia q1432717,catatonic excitement q57769952,naantali,sheffield,0
2,françois hollande,french-american foundation,brazil national football team,conmebol,1
3,glee season 2,british isles,wyk auf föhr,iburi subprefecture,0
4,anagrelide,amiodarone,maria pogonowska,faculty of physics of university of warsaw q93...,0


In [6]:
# Split X, and Y
X = data[['Entity1', 'Entity2', 'Entity3', 'Entity4']]
y = data['Label']

X.tail()

Unnamed: 0,Entity1,Entity2,Entity3,Entity4
999995,adriano celentano,film score composer q1415090,shibusawa eiichi,economist
999996,stan laurel,film actor q10800557,shulamit katznelson,activist q15253558
999997,afdera franchetti,italy,yen ching-ling q10924380,republic of china 1912–1949
999998,itsuki akata q11373859,japan,josé ribamar de faria machado q29107255,brazil
999999,q15953145,fondremand,rivière-héva,la vallée-de-l'or regional county municipality


In [7]:
# Define Embedding Size
emb_size = 300

## Convert sentence to word vector using Word2Vec model

In [None]:
# Combine all sentences into a single list
all_sentences = [sentence.split() for sublist in X.values for sentence in sublist]

all_sentences

In [None]:
from gensim.models import Word2Vec
import gensim.downloader as api

word2vec_model = Word2Vec(sentences=all_sentences, vector_size=emb_size, window=5, min_count=1, workers=4, sg=1)

# Save the model
word2vec_model.save('word2vec_model')

In [None]:
print(word2vec_model.wv["pat"])

In [None]:
import numpy as np
from gensim.models import Word2Vec

word2vec_model = Word2Vec.load("word2vec_model")

# Define sentence_to_vector Function
def sentence_to_vector(sentence):
    """
    This function takes a sentence and returns the average of all word vectors in the sentence.
    """
    words = filter(lambda x: x in word2vec_model.wv, sentence.split())
    word_vectors = np.array([word2vec_model.wv[w] for w in words])
    if len(word_vectors) == 0:
        return np.zeros(word2vec_model.vector_size)
    
    return np.mean(word_vectors, axis=0)

In [None]:
X.head()

In [None]:
X_emb = X.copy()

# Convert sentences to vectors
for column in X.columns:
    X_emb[column] = X_emb[column].apply(sentence_to_vector)

y_emb = y.copy()

## Another Method of Converting Sentence to Word Vector using Scipy

In [8]:
import spacy
import numpy as np

# Load the pre-trained spaCy model (English)
nlp = spacy.load("en_core_web_md")

# Define process_sentence Function which converts a sentence to a word vector
def process_sentence(sentence):
    """
    This function takes a sentence and returns a tensor of word embeddings.
    """
    with nlp.disable_pipes():
        doc = nlp(sentence)
        word_vectors = [word.vector for word in doc]
        return np.mean(word_vectors, axis=0)

X_emb = X.copy().map(process_sentence)
y_emb = y.copy()

In [10]:
X_emb.tail()

Unnamed: 0,Entity1,Entity2,Entity3,Entity4
999995,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-3.6452, -1.5158, 0.45935, -2.026125, 0.61745...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.0559, -0.9508, 0.80744, 0.2634, -0.55012, ..."
999996,"[0.18266, -1.18043, -0.15500003, 2.93255, 3.80...","[-2.8140333, -0.064466655, -1.7309667, -4.413,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.4905, 0.9294, -1.2411, 0.094085, 1.96755, ..."
999997,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-3.9826, 1.1531, 1.7868, 3.7107, 7.1926, 1.02...","[-2.19712, 1.1288999, -3.39506, 2.48582, 1.946...","[-4.98295, -4.0700502, -0.61006504, 1.76276, 1..."
999998,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.4837, -4.541, -3.2058, 2.3037, 4.7184, -2....","[1.2412833, -3.1828334, 0.12455667, 2.0108333,...","[-1.5349, -2.0664, -2.2445, 3.077, 5.2252, -1...."
999999,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-3.3196335, 0.2565, 0.6242, 3.62443, 4.8059, ...","[-1.7039224, -2.6593997, -1.1798732, 4.1722155..."


## Prepare Train vs Test Dataset

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_emb, y_emb, test_size=0.2, random_state=42)

In [12]:
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


# Convert DataFrame to PyTorch Sensors

In [13]:
# Convert numpy objects to torch tensors
X_train_tensors = [torch.tensor(X_train[col].tolist(), dtype=torch.float32).to(device) for col in X_train.columns]
X_test_tensors = [torch.tensor(X_test[col].tolist(), dtype=torch.float32).to(device) for col in X_test.columns]

y_train_tensor = torch.tensor(y_train.tolist(), dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test.tolist(), dtype=torch.float32).to(device)


  X_train_tensors = [torch.tensor(X_train[col].tolist(), dtype=torch.float32).to(device) for col in X_train.columns]


In [15]:
len(X_train)

800000

## Train the Model

In [16]:
import torch.optim as optim

model = ANNc(emb_size=emb_size).to(device)

# Define the loss function
criterion = nn.BCELoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Define the number of epochs
n_epochs = 10

# Define the batch size
batch_size = 1000

# Define the number of batches
n_batches = len(X_train) // batch_size

print(batch_size, n_epochs, n_batches)

# Train the model
for epoch in range(n_epochs):
    running_loss = 0.0
    for i in range(n_batches):
        start = i * batch_size
        end = start + batch_size

        # Get the inputs and labels
        inputs = [X_train_tensors[j][start:end] for j in range(4)]
        labels = y_train_tensor[start:end]

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(*inputs)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
        if i % 200 == 199:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 200))
            running_loss = 0.0

1000 10 800
[1,   200] loss: 0.612
[1,   400] loss: 0.556
[1,   600] loss: 0.533
[1,   800] loss: 0.517
[2,   200] loss: 0.511
[2,   400] loss: 0.507
[2,   600] loss: 0.502
[2,   800] loss: 0.500
[3,   200] loss: 0.499
[3,   400] loss: 0.497
[3,   600] loss: 0.492
[3,   800] loss: 0.489
[4,   200] loss: 0.490
[4,   400] loss: 0.489
[4,   600] loss: 0.485
[4,   800] loss: 0.482
[5,   200] loss: 0.483
[5,   400] loss: 0.481
[5,   600] loss: 0.478
[5,   800] loss: 0.476
[6,   200] loss: 0.477
[6,   400] loss: 0.475
[6,   600] loss: 0.472
[6,   800] loss: 0.470
[7,   200] loss: 0.471
[7,   400] loss: 0.471
[7,   600] loss: 0.469
[7,   800] loss: 0.467
[8,   200] loss: 0.468
[8,   400] loss: 0.469
[8,   600] loss: 0.467
[8,   800] loss: 0.465
[9,   200] loss: 0.466
[9,   400] loss: 0.467
[9,   600] loss: 0.466
[9,   800] loss: 0.464
[10,   200] loss: 0.464
[10,   400] loss: 0.465
[10,   600] loss: 0.464
[10,   800] loss: 0.462


## Evaluate the Model

In [17]:
# Test the model
correct = 0
total = 0

with torch.no_grad():
    for i in range(len(X_test)):
        inputs = [X_test_tensors[j][i:i+1] for j in range(4)]
        labels = y_test_tensor[i]

        outputs = model(*inputs)
        predicted = torch.round(outputs)

        total += 1
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the test data: %d %%' % (100 * correct / total))


Accuracy of the network on the test data: 75 %


In [18]:
# Describe the model accuracy
from sklearn.metrics import classification_report

y_pred = []
with torch.no_grad():
    for i in range(len(X_test)):
        inputs = [X_test_tensors[j][i:i+1] for j in range(4)]
        labels = y_test_tensor[i]

        outputs = model(*inputs)
        predicted = torch.round(outputs)
        y_pred.append(predicted.item())

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.87      0.78    100373
           1       0.83      0.65      0.73     99627

    accuracy                           0.76    200000
   macro avg       0.77      0.76      0.76    200000
weighted avg       0.77      0.76      0.76    200000

