In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

In [40]:
df = pd.read_csv('data/GHDomains.csv')

# drop unreachable repos
df = df[df['Status']==True].reset_index(drop=True)
text = df['clean_description'] + df['clean_readme']

# drop empty descp + readme text repos
to_drop = []
for i in text.index:
    if not isinstance(text.loc[i], str):
        to_drop.append(i)

text = text.drop(to_drop).reset_index(drop=True)
y = df['Domain'].drop(to_drop).reset_index(drop=True)

In [41]:
# split data
X_train, X_test, y_train, y_test = train_test_split(text, y, test_size=0.1, random_state=42, stratify=y)

In [42]:
# encode text
model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(X_train.to_numpy())
np.save('data/X_train_embeddings',embeddings)

embeddings = model.encode(X_test.to_numpy())
np.save('data/X_test_embeddings', embeddings)

In [43]:
y_train.to_csv('data/y_train.csv')
y_test.to_csv('data/y_test.csv')

In [2]:
class CustomDataset(Dataset):
    def __init__(self, x_file, y_file):
        self.x_data = np.load(x_file)
        self.y_data = pd.read_csv(y_file,usecols=['Domain'])
        self.encoded_labels = self.target_transform()

    def __len__(self):
        return len(self.y_data)

    def __getitem__(self, idx):
        x = torch.Tensor(self.x_data[idx])
        y = torch.tensor(self.encoded_labels.iloc[idx])

        return x, y
    
    def target_transform(self):
        mapping = {}
        labels = self.y_data['Domain'].unique()
        for cat, num in zip(labels, range(len(labels))):
            mapping[cat] = num

        return self.y_data['Domain'].map(mapping)


In [3]:
train_dataset = CustomDataset('data/X_train_embeddings.npy', 'data/y_train.csv')
test_dataset = CustomDataset('data/X_test_embeddings.npy', 'data/y_test.csv')

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [46]:
next(iter(train_loader))[0][0].shape

torch.Size([384])

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score

class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(NeuralNetwork, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.activation = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        x = self.layer1(x)
        x = self.activation(x)
        x = self.layer2(x)
        return x

input_size = 384
hidden_size = 96
output_size = 6

model = NeuralNetwork(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    running_loss = 0.0
    y_true = []
    y_pred = []
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        
        optimizer.step()
        running_loss += loss.item()
        outputs = torch.nn.functional.softmax(outputs, dim=1)
        _, predicted = torch.max(outputs.data, 1)
        y_true.extend(labels.numpy())
        y_pred.extend(predicted.numpy())
    
    train_acc = accuracy_score(y_true, y_pred)
    cm_train = confusion_matrix(y_true, y_pred)
    print('Epoch: {}, Loss: {:.6f}, Train Accuracy: {:.2f}%'.format(epoch+1, running_loss/len(train_loader), train_acc * 100))
    print('Confusion matrix for training set:')
    print(cm_train)

    # compute precision and recall for each label
    precision = precision_score(y_true, y_pred, average=None)
    recall = recall_score(y_true, y_pred, average=None)
    for i in range(6):
        print('Label: {}, Precision: {:.2f}%, Recall: {:.2f}%'.format(i, precision[i]*100, recall[i]*100))


Epoch: 1, Loss: 1.443125, Train Accuracy: 50.73%
Confusion matrix for training set:
[[   0  181   73  125    0    0]
 [   0  906  149  213    0    0]
 [   0  350  252  259    0    0]
 [   0  151  133 1072    0    0]
 [   0  189   59  126    1    0]
 [   0  100   30   29    0    0]]
Label: 0, Precision: 0.00%, Recall: 0.00%
Label: 1, Precision: 48.27%, Recall: 71.45%
Label: 2, Precision: 36.21%, Recall: 29.27%
Label: 3, Precision: 58.77%, Recall: 79.06%
Label: 4, Precision: 100.00%, Recall: 0.27%
Label: 5, Precision: 0.00%, Recall: 0.00%
Epoch: 2, Loss: 1.099865, Train Accuracy: 60.46%
Confusion matrix for training set:
[[   6  125  138  103    7    0]
 [   2  931  136  194    5    0]
 [   0  162  518  178    3    0]
 [   1  111   93 1148    3    0]
 [   1  136   73  109   56    0]
 [   0   52   84   22    1    0]]
Label: 0, Precision: 60.00%, Recall: 1.58%
Label: 1, Precision: 61.37%, Recall: 73.42%
Label: 2, Precision: 49.71%, Recall: 60.16%
Label: 3, Precision: 65.45%, Recall: 84.66%

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 3, Loss: 0.969519, Train Accuracy: 64.96%
Confusion matrix for training set:
[[  54   90  128   85   22    0]
 [  11  937  130  169   21    0]
 [   7  141  562  143    8    0]
 [   7  111   89 1139   10    0]
 [   5   82   49   74  165    0]
 [   3   49   79   22    6    0]]
Label: 0, Precision: 62.07%, Recall: 14.25%
Label: 1, Precision: 66.45%, Recall: 73.90%
Label: 2, Precision: 54.19%, Recall: 65.27%
Label: 3, Precision: 69.79%, Recall: 84.00%
Label: 4, Precision: 71.12%, Recall: 44.00%
Label: 5, Precision: 0.00%, Recall: 0.00%
Epoch: 4, Loss: 0.900274, Train Accuracy: 67.89%
Confusion matrix for training set:
[[ 110   68  111   70   20    0]
 [  23  960  105  157   22    1]
 [  30  118  574  123   15    1]
 [  17  113   75 1139   12    0]
 [  14   65   39   64  193    0]
 [   9   39   73   21    7   10]]
Label: 0, Precision: 54.19%, Recall: 29.02%
Label: 1, Precision: 70.43%, Recall: 75.71%
Label: 2, Precision: 58.75%, Recall: 66.67%
Label: 3, Precision: 72.36%, Recall: 84.

  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 5, Loss: 0.859674, Train Accuracy: 68.39%
Confusion matrix for training set:
[[ 121   66  104   61   24    3]
 [  21  975  100  151   18    3]
 [  37  129  558  121   12    4]
 [  19  118   74 1129   15    1]
 [  17   64   32   57  204    1]
 [  10   35   66   18    9   21]]
Label: 0, Precision: 53.78%, Recall: 31.93%
Label: 1, Precision: 70.30%, Recall: 76.89%
Label: 2, Precision: 59.74%, Recall: 64.81%
Label: 3, Precision: 73.45%, Recall: 83.26%
Label: 4, Precision: 72.34%, Recall: 54.40%
Label: 5, Precision: 63.64%, Recall: 13.21%
Epoch: 6, Loss: 0.830860, Train Accuracy: 69.85%
Confusion matrix for training set:
[[ 140   53  106   59   18    3]
 [  26  987   95  137   16    7]
 [  35  116  577  112   15    6]
 [  19  125   68 1128   15    1]
 [  19   64   25   55  210    2]
 [   9   28   67   17    8   30]]
Label: 0, Precision: 56.45%, Recall: 36.94%
Label: 1, Precision: 71.89%, Recall: 77.84%
Label: 2, Precision: 61.51%, Recall: 67.02%
Label: 3, Precision: 74.80%, Recall: 8

In [14]:
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        outputs = torch.nn.functional.softmax(outputs, dim=1)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print('Accuracy on test set: {:.2f}%'.format(100 * accuracy))

Accuracy on test set: 10.22%
