In [1]:
import torch
import time
import datetime
import seaborn
from sklearn import metrics
from sklearn.utils import class_weight
import matplotlib.pyplot as plt
import numpy as np
import pickle
from collections import Counter
from transformers import CamembertForSequenceClassification, CamembertTokenizer, AutoConfig, AutoModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split, \
                            DataLoader, RandomSampler, SequentialSampler

In [2]:
# features = np.array(pickle.load(open("features.p", "rb")), dtype=object)
# labels = np.array(pickle.load(open("labels.p", "rb")), dtype=object)
# docs = np.array(pickle.load(open("docs.p", "rb")), dtype=object)

#docs_train = np.array(pickle.load(open("docs_train.p", "rb")), dtype=object)
#docs_test = np.array(pickle.load(open("docs_test.p", "rb")), dtype=object)
#docs_val = np.array(pickle.load(open("docs_val.p", "rb")), dtype=object)

#features_train = np.array(pickle.load(open("features_train.p", "rb")), dtype=object)
#features_test = np.array(pickle.load(open("features_test.p", "rb")), dtype=object)
#features_validation = np.array(pickle.load(open("features_val.p", "rb")), dtype=object)

#labels_train = np.array(pickle.load(open("labels_train.p", "rb")), dtype=object)
#labels_test = np.array(pickle.load(open("labels_test.p", "rb")), dtype=object)
#labels_validation = np.array(pickle.load(open("labels_val.p", "rb")), dtype=object)



docs = np.array(["ceci est une fake news"]*50).reshape((50, 1))
labels = np.array([2]*10 + [1]*15 + [0]*25).reshape((50, 1))
features = np.ones((50, 4)).astype("float")

dataset = np.hstack((labels, docs, features))

np.random.shuffle(dataset)

print(f'Dataset: {dataset.shape}')

labels = np.array(dataset[:, 0], dtype=int) # - 1
docs = dataset[:, 1]
features = np.array(dataset[:, 2:], dtype = np.float32)
print(features.dtype)


num_extra_dims = np.shape(features)[1]
num_labels = len(set(labels))

Dataset: (50, 6)
float32


In [3]:
print(Counter(labels))

Counter({0: 25, 1: 15, 2: 10})


In [4]:
class_weights=class_weight.compute_class_weight('balanced',np.unique(labels),labels)
class_weights=torch.tensor(class_weights,dtype=torch.float)
 
print(class_weights) #([1.0000, 1.0000, 4.0000, 1.0000, 0.5714])

tensor([0.6667, 1.1111, 1.6667])


 1 1 1 0 2 2 0 0 0 0 0 0 1] as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


In [5]:
TOKENIZER = CamembertTokenizer.from_pretrained(
    'camembert-base',
    do_lower_case=True)

In [6]:
def preprocess_spacy(docs, pos=["PUNCT", "ADV", "ADJ", "VERB", "NOUN"]):
    texts = [" ".join([token.text for token in doc if not token.is_stop and token.pos_ in pos]) for doc in docs]

    return texts

def preprocess(raw_articles, features = None, labels=None):
    """
        Create pytorch dataloader from raw data
    """

    # https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.batch_encode_plus.truncation

    encoded_batch = TOKENIZER.batch_encode_plus(raw_articles,
                                                add_special_tokens=False,
                                                padding = True,
                                                truncation = True,
                                                max_length = 512,
                                                return_attention_mask=True,
                                                return_tensors = 'pt')
        

    if features is not None:
        features = torch.tensor(features)
        if labels is not None:
            labels = torch.tensor(labels)
            return encoded_batch['input_ids'], encoded_batch['attention_mask'], features, labels
        return encoded_batch['input_ids'], encoded_batch['attention_mask'], features
    
    else :
        if labels is not None:
            labels = torch.tensor(labels)
            return encoded_batch['input_ids'], encoded_batch['attention_mask'], labels
        return encoded_batch['input_ids'], encoded_batch['attention_mask']
        

articles = docs #to comment !
#articles_train, articles_test, articles_validation = preprocess_spacy(docs_train), preprocess_spacy(docs_test), preprocess_spacy(docs_val)
#print(TOKENIZER.convert_ids_to_tokens(preprocess(articles, features = features, labels=labels)[0][0]))

In [7]:
# Split train-validation to-comment !!
split_border = int(len(labels)*0.8)
articles_train, articles_validation = articles[:split_border], articles[split_border:]
features_train, features_validation = features[:split_border], features[split_border:]
labels_train, labels_validation = labels[:split_border], labels[split_border:]

In [8]:
input_ids, attention_mask, features_train, labels_train = preprocess(articles_train, features_train, labels_train)
# Combine the training inputs into a TensorDataset
train_dataset = TensorDataset(
    input_ids,
    attention_mask,
    features_train,
    labels_train)
    

input_ids, attention_mask, features_validation, labels_validation = preprocess(articles_validation, features_validation, labels_validation)
# Combine the validation inputs into a TensorDataset
validation_dataset = TensorDataset(
    input_ids,
    attention_mask,
    features_validation,
    labels_validation)