In [33]:
import pandas as pd
import torch
import pickle
from joblib import dump, load
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.svm import OneClassSVM

In [21]:
# Read the training examples from CSV; the "Question" column of this frame is
# what gets embedded in a later cell.
data = pd.read_csv("Data/data.csv")

In [22]:
# Load the tokenizer and the encoder from the same pretrained checkpoint so the
# token ids produced by the tokenizer match the model's vocabulary.
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
# Loading the checkpoint into DistilBertModel leaves its vocab_* weights
# unused, which is what the warning printed under this cell reports.
model = DistilBertModel.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings (assumed shape: batch x tokens x hidden).
        attention_mask: Tensor with one entry per token (assumed shape:
            batch x tokens); positions with value 0 contribute nothing.

    Returns:
        Tensor with the token axis (dim 1) reduced away: the mask-weighted sum
        of the embeddings divided by the mask total for each sequence.
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the hidden dimension so it can weight every
    # component of every token vector.
    mask = attention_mask.unsqueeze(-1).expand_as(hidden_states).float()
    weighted_sum = (hidden_states * mask).sum(dim=1)
    # Clamp the denominator so a fully masked sequence yields zeros rather
    # than a division by zero.
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_count

In [24]:
def create_embedding(data, batch_size=64):
    """Embed texts with the module-level ``tokenizer`` and ``model``.

    The texts are processed in mini-batches so that memory use is bounded by
    ``batch_size`` rather than by the size of the whole corpus. Each batch is
    padded only to its own longest sequence, and padded positions are excluded
    from the average by ``mean_pooling``.

    Args:
        data: A single string or an iterable of strings to embed.
        batch_size: Number of texts per forward pass. Pass ``None`` to run
            everything in one pass (the previous behavior).

    Returns:
        numpy array with one row per input text, in input order.

    Raises:
        ValueError: If ``data`` is empty or ``batch_size`` is less than 1.
    """
    # A bare string is treated as one text, not as an iterable of characters.
    texts = [data] if isinstance(data, str) else list(data)
    if not texts:
        # Fail early with a clear message instead of handing an empty batch
        # to the tokenizer/model.
        raise ValueError("create_embedding() requires at least one text")
    if batch_size is None:
        batch_size = len(texts)
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        # Inference only: skip autograd bookkeeping to save memory.
        with torch.no_grad():
            output = model(**encoded_input)
        pooled_batches.append(mean_pooling(output, encoded_input["attention_mask"]))
    return torch.cat(pooled_batches, dim=0).numpy()

In [25]:
# Embed every entry of the "Question" column; the resulting matrix is the
# training set for the one-class model fitted further down.
dataset = create_embedding(data["Question"].tolist())

In [26]:
# Sanity check on the embedding matrix: rows = embedded texts,
# columns = embedding width.
dataset.shape

(9793, 768)

In [27]:
# Fit a one-class SVM on the question embeddings only (no labels are passed).
# NOTE(review): only gamma is set here; every other hyperparameter (e.g. nu)
# keeps the library default — confirm those defaults are intended.
clf = OneClassSVM(gamma="auto")
# Kept as the last expression so the fitted estimator's repr is displayed.
clf.fit(dataset)

OneClassSVM(gamma='auto')

In [28]:
# Smoke test: embed one hand-written question and classify it with the
# freshly fitted model. Per the scikit-learn docs, OneClassSVM.predict returns
# +1 for inliers and -1 for outliers.
test = ["What is your name"]
test_embedding = create_embedding(test)
clf.predict(test_embedding)

array([1])

In [34]:
# Persist the fitted estimator to disk with joblib; the call's return value
# (shown below the cell) lists the file(s) written.
dump(clf, "context_evaluation_model.joblib")

['context_evaluation_model.joblib']

In [35]:
# Reload the estimator from the file written in the previous cell to check
# that it survives a save/load round trip.
# NOTE(review): joblib files are pickle-based as far as I know — only load
# files from a trusted source; confirm against the joblib docs.
cl = load("context_evaluation_model.joblib")

In [42]:
# Classify a new question with the reloaded model to confirm it is usable
# after deserialization.
cl.predict(create_embedding(["Where do you live"]))

array([1])