In [2]:
from transformers import AutoTokenizer

import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import itertools
import numpy as np

from sklearn.model_selection import KFold
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from models import Model1, Model2
from datasets import build_dataset1, build_dataset2
from utils import pad_encode, emb_average, calc_acc
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

In [10]:
l, x, y = build_dataset1('/projets/melodi/gsantoss/data/yago/yago-class.nt',
                         '/projets/melodi/gsantoss/data/yago/yago-schema.nt',
                         '/projets/melodi/gsantoss/data/semclass1.pyo')
print(y.shape)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
x1 = tokenizer(x, return_tensors='pt', padding=True)
x1_ids = x1['input_ids']
x1_attention_mask = x1['attention_mask']
vocab = set(itertools.chain(*(list(map(str.split, l)) + list(map(str.split, x)))))
word_index = {q: (i + 1) for i, q in enumerate(vocab)}
l2 = pad_encode(l, word_index)
x2 = pad_encode(x, word_index)

print(l2.shape, x2.shape)
dataset = list(zip(x1_ids, x1_attention_mask, l2, x2, y))
print(len(dataset))

with open('/projets/melodi/gsantoss/embs/glove.6B.300d.txt', 'r') as f:
    lines = f.readlines()

glove = nn.Embedding(len(word_index) + 1, 300, padding_idx=0)
glove2 = nn.Embedding(len(word_index) + 1, 300, padding_idx=0)
glove.requires_grad_(False)
glove2.requires_grad_(False)

for l in tqdm(lines):
    line = lines[0].split()
    tk = line[0]
    if tk not in word_index:
        continue
    emb = torch.Tensor(list(map(float, line[1:])))
    glove.weight[word_index[tk]] = emb
    glove2.weight[word_index[tk]] = emb

In [18]:



kf = KFold(n_splits=10)

crit = nn.NLLLoss()

data = []

for fold, (train_i, test_i) in tqdm(enumerate(kf.split(dataset)), total=10):
    train = [dataset[i] for i in train_i]
    test = [dataset[i] for i in test_i]

    x1i, x1a, x2l, x2x, y = zip(*train)
    _, _, _, test_x2x, ty = zip(*test)

    cy = torch.cat(list(map(lambda q: q.unsqueeze(0), y)))
    cty = torch.cat(list(map(lambda q: q.unsqueeze(0), ty)))

    train_em = emb_average(x2x, glove)
    test_em = emb_average(test_x2x, glove)

    y_pred = GaussianNB().fit(train_em, cy).predict(test_em)
    data.append([fold, 'GaussianBN', calc_acc(y_pred, cty)])
    y_pred = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(train_em, cy).predict(test_em)
    data.append([fold, 'SVM', calc_acc(y_pred, cty)])
    y_pred = DecisionTreeClassifier().fit(train_em, cy).predict(test_em)
    data.append([fold, 'DecisionTree', calc_acc(y_pred, cty)])
    y_pred = RandomForestClassifier().fit(train_em, cy).predict(test_em)
    data.append([fold, 'RandomForest', calc_acc(y_pred, cty)])

    model1 = Model1(6)
    model1.cuda(0)
    model2 = Model2(glove2, len(word_index), 6)
    model2.cuda(1)
    optimizer1 = optim.Adam(model1.parameters(), lr=0.00003)
    optimizer2 = optim.Adam(model2.parameters(), lr=0.003)

    for x1i, x1a, x2l, x2x, y in DataLoader(train, batch_size=32, shuffle=True):
        optimizer1.zero_grad()
        optimizer2.zero_grad()

        out1 = model1(x1i.cuda(0), x1a.cuda(0)).cpu()
        out2 = model2(x2l.cuda(1), x2x.cuda(1)).cpu()

        l1 = crit(out1, y)
        l1.backward()

        l2 = crit(out2, y)
        l2.backward()

        optimizer1.step()
        optimizer2.step()

    model1.eval()
    model2.eval()

    out1 = []
    out2 = []
    with torch.no_grad():
        for x1i, x1a, x2l, x2x, y in DataLoader(test, batch_size=32, shuffle=True):
            o1 = model1(x1i.cuda(0), x1a.cuda(0)).exp().cpu()
            out1.append(o1.argmax(dim=1) == y)
            o2 = model2(x2l.cuda(1), x2x.cuda(1)).exp().cpu()
            out2.append(o2.argmax(dim=1) == y)

    out1 = torch.cat(out1).float()
    acc1 = out1.sum() / out1.shape[0]
    data.append([fold, 'Model1', acc1.item()])
    out2 = torch.cat(out2).float()
    acc2 = out2.sum() / out2.shape[0]
    data.append([fold, 'Model2', acc2.item()])

  0%|          | 0/10 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predic

In [45]:
dt = pd.DataFrame(data, columns=['Fold', 'Name', 'Acc'])

res = dt.groupby('Name').mean().sort_values('Acc')

print(res.drop(columns=['Fold']))
res.to_csv('exp1_data1.csv')

                   Acc
Name                  
DecisionTree  0.255359
GaussianBN    0.266151
RandomForest  0.399599
SVM           0.537388
Model2        0.583894
Model1        0.842978


In [62]:
c, x, y = build_dataset2('/projets/melodi/gsantoss/data/dataset2.csv')
x1 = tokenizer(list(x), return_tensors='pt', padding=True)
x1_ids = x1['input_ids']
x1_attention_mask = x1['attention_mask']
print(x1_ids.shape)
vocab = set(itertools.chain(*(list(map(str.split, c)) + list(map(str.split, x)))))

word_index = {q: (i + 1) for i, q in enumerate(vocab)}
print(len(word_index))
l2 = pad_encode(c, word_index)
x2 = pad_encode(x, word_index)

print(l2.shape, x2.shape)
dataset = list(zip(x1_ids, x1_attention_mask, l2, x2, y))
print(len(dataset))

glove = nn.Embedding(len(word_index) + 1, 300, padding_idx=0)
glove2 = nn.Embedding(len(word_index) + 1, 300, padding_idx=0)
glove.requires_grad_(False)
glove2.requires_grad_(False)

for l in tqdm(lines):
    line = lines[0].split()
    tk = line[0]
    if tk not in word_index:
        continue
    emb = torch.Tensor(list(map(float, line[1:])))
    glove.weight[word_index[tk]] = emb
    glove2.weight[word_index[tk]] = emb

In [69]:
kf = KFold(n_splits=10)

crit = nn.NLLLoss()

data = []

for fold, (train_i, test_i) in tqdm(enumerate(kf.split(dataset)), total=10):
    train = [dataset[i] for i in train_i]
    test = [dataset[i] for i in test_i]

    x1i, x1a, x2l, x2x, y = zip(*train)
    _, _, _, test_x2x, ty = zip(*test)

    cy = torch.cat(list(map(lambda q: q.unsqueeze(0), y)))
    cty = torch.cat(list(map(lambda q: q.unsqueeze(0), ty)))

    train_em = emb_average(x2x, glove)
    test_em = emb_average(test_x2x, glove)

    y_pred = GaussianNB().fit(train_em, cy).predict(test_em)
    data.append([fold, 'GaussianBN', calc_acc(y_pred, cty)])
    y_pred = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(train_em, cy).predict(test_em)
    data.append([fold, 'SVM', calc_acc(y_pred, cty)])
    y_pred = DecisionTreeClassifier().fit(train_em, cy).predict(test_em)
    data.append([fold, 'DecisionTree', calc_acc(y_pred, cty)])
    y_pred = RandomForestClassifier().fit(train_em, cy).predict(test_em)
    data.append([fold, 'RandomForest', calc_acc(y_pred, cty)])

    model1 = Model1(5)
    model1.cuda(0)
    model2 = Model2(glove2, len(word_index), 5)
    model2.cuda(1)
    optimizer1 = optim.Adam(model1.parameters(), lr=0.00003)
    optimizer2 = optim.Adam(model2.parameters(), lr=0.003)

    for x1i, x1a, x2l, x2x, y in DataLoader(train, batch_size=32, shuffle=True):
        optimizer1.zero_grad()
        optimizer2.zero_grad()

        out1 = model1(x1i.cuda(0), x1a.cuda(0)).cpu()
        out2 = model2(x2l.cuda(1), x2x.cuda(1)).cpu()

        l1 = crit(out1, y)
        l1.backward()

        l2 = crit(out2, y)
        l2.backward()

        optimizer1.step()
        optimizer2.step()

    model1.eval()
    model2.eval()

    out1 = []
    out2 = []
    with torch.no_grad():
        for x1i, x1a, x2l, x2x, y in DataLoader(test, batch_size=32, shuffle=True):
            o1 = model1(x1i.cuda(0), x1a.cuda(0)).exp().cpu()
            out1.append(o1.argmax(dim=1) == y)
            o2 = model2(x2l.cuda(1), x2x.cuda(1)).exp().cpu()
            out2.append(o2.argmax(dim=1) == y)

    out1 = torch.cat(out1).float()
    acc1 = out1.sum() / out1.shape[0]
    data.append([fold, 'Model1', acc1.item()])
    out2 = torch.cat(out2).float()
    acc2 = out2.sum() / out2.shape[0]
    data.append([fold, 'Model2', acc2.item()])

  0%|          | 0/10 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predic

In [72]:
dt = pd.DataFrame(data, columns=['Fold', 'Name', 'Acc'])

res = dt.groupby('Name').mean().sort_values('Acc')

print(res.drop(columns=['Fold']))
res.to_csv('exp2_data2.csv')

                   Acc
Name                  
SVM           0.219325
GaussianBN    0.238072
RandomForest  0.263596
DecisionTree  0.342751
Model2        0.378217
Model1        0.559342
