In [1]:
import os, gc, sys
import random

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

from simpletransformers.classification import ClassificationModel
import torch

SEED = 2020
BASE_PATH = '../data/'
TEXT_COL = "description"
TARGET = "jobflag"
NUM_CLASS = 4
N_FOLDS = 4
MODEL_TYPE = "bert"
MODEL_NAME = "bert-base-uncased"
augmentation = True
memo = "single"


def metric_f1(labels, preds):
    return f1_score(labels, preds, average='macro')


def seed_everything(seed):
    """for reproducibility.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(SEED)

train = pd.read_csv(BASE_PATH + "train.csv").drop(['id'], axis=1)
train_aug = pd.read_csv(BASE_PATH + "train_fr_en.csv").rename(columns={TEXT_COL: 'text', TARGET: 'label'})
train = train.rename(columns={TEXT_COL: 'text', TARGET: 'label'})
train['label'] -= 1
# train["text"] = train["text"].str.lower()

train_aug["label"] -= 1

length = len(train)
train.index = range(0, length * 2, 2)
train_aug.index = range(1, length * 2, 2)

weight = len(train) / train["label"].value_counts().sort_index().values
if augmentation:
    train = pd.concat([train, train_aug])
    train = train.sort_index()

test = pd.read_csv(BASE_PATH + "test.csv")
test = test.rename(columns={TEXT_COL: 'text'}).drop(['id'], axis=1)

kfold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
train['fold_id'] = -1
for fold, (train_idx, valid_idx) in enumerate(kfold.split(train.index, train['label'])):
    train.loc[train.iloc[valid_idx].index, 'fold_id'] = fold

X_train = train.loc[train['fold_id'] != 0]
X_valid = train.loc[train['fold_id'] == 0]

params = {
    # "output_dir": "outputs/",
    "max_seq_length": 128,
    "train_batch_size": 32,
    "eval_batch_size": 64,
    "num_train_epochs": 5,
    "learning_rate": 1e-4,
    "reprocess_input_data": True,
    "do_lower_case": True,
    "manual_seed": SEED,
    "verbose": False,
    "save_eval_checkpoints": False,
    "overwrite_output_dir": True,
}
print(weight)
print(type(weight))
model = ClassificationModel(model_type=MODEL_TYPE, model_name=MODEL_NAME, num_labels=4,
                            args=params, use_cuda=True, weight=weight.tolist())



[4.69711538 8.42241379 2.13008721 5.02744425]
<class 'numpy.ndarray'>


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [31]:
from torch.utils.data import SubsetRandomSampler, RandomSampler, WeightedRandomSampler, DataLoader

In [6]:
SubsetRandomSampler([0, 1, 2, 3], RandomSampler)

<torch.utils.data.sampler.SubsetRandomSampler at 0x7f5efd699070>

In [23]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets
import numpy as np
import matplotlib.pyplot as plt

n_classes = 5
n_samples = 8

mnist_train =  torchvision.datasets.MNIST(root="mnist/mnist_train", train=True, download=True, transform=transforms.Compose([transforms.ToTensor(),]))

balanced_batch_sampler = SubsetRandomSampler(range(len(mnist_train)))

dataloader = torch.utils.data.DataLoader(mnist_train, batch_sampler=balanced_batch_sampler)

my_testiter = iter(dataloader)
print(my_testiter, my_testiter)
images, target = my_testiter.next()


def imshow(img):
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))

imshow(torchvision.utils.make_grid(images))

<torch.utils.data.dataloader._SingleProcessDataLoaderIter object at 0x7f5e18510790> <torch.utils.data.dataloader._SingleProcessDataLoaderIter object at 0x7f5e18510790>


TypeError: 'int' object is not iterable

In [111]:
# Create dummy data with class imbalance 99 to 1
numDataPoints = 1000
data_dim = 5
bs = 100
data = torch.randn(numDataPoints, data_dim)
target = torch.cat((torch.zeros(int(numDataPoints * 0.99), dtype=torch.long),
                    torch.ones(int(numDataPoints * 0.01), dtype=torch.long)))

print('target train 0/1: {}/{}'.format(
    (target == 0).sum(), (target == 1).sum()))

memo = [[0, 1, 2, 3, -1, -2, -3, -4], [4, 5, 6, 7, -8, -5, -6, -7]]
memo = torch.tensor(memo)
print(memo)
# Create subset indices
# subset_idx = torch.cat(torch.Tensor((torch.arange(5), torch.arange(-5, 0)), (torch.arange(5), torch.arange(-5, 0))))
subset_idx = torch.cat((memo,))
# Compute samples weight (each sample should get its own weight)
class_sample_count = torch.tensor(
    [(target[subset_idx] == t).sum() for t in torch.unique(target, sorted=True)])
# weight = 1. / class_sample_count.float()
# samples_weight = torch.tensor([weight[t] for t in target[subset_idx]])

# Create sampler, dataset, loader
# sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
sampler = SubsetRandomSampler([0, 1])
train_dataset = torch.utils.data.TensorDataset(
    data[subset_idx], target[subset_idx])
train_loader = DataLoader(
    train_dataset, batch_size=2, num_workers=1, sampler=sampler)


target train 0/1: 990/10
[[0, 1, 2, 3, -1, -2, -3, -4], [4, 5, 6, 7, -8, -5, -6, -7]]


TypeError: expected Tensor as element 0 in argument 0, but got list

In [94]:
train_dataset[1]

(tensor([[-0.2628, -0.4125,  2.1010,  0.5633,  2.1434],
         [-1.2275,  1.9281, -1.2357,  2.2765,  0.3411],
         [ 1.0007,  1.8770,  1.7294,  0.8753,  0.4937],
         [ 0.7844,  1.0188,  1.8955,  0.6821,  0.5502],
         [ 0.3515, -1.0947, -0.7354, -1.2446, -0.5982],
         [-0.9117,  0.1048,  0.5835, -0.9186, -0.8285],
         [ 2.0915, -0.2989, -0.0187, -0.4907,  0.3076],
         [-0.6244, -0.4492, -0.6091,  1.2887,  1.0613]]),
 tensor([0, 0, 0, 0, 1, 1, 1, 1]))

In [110]:
iter(train_loader).next()

[tensor([[[-0.8487, -0.4098, -0.5212, -2.2310, -1.0961],
          [ 1.2471, -0.4425,  1.1788, -0.2204,  0.8089],
          [-1.0369,  0.2898, -1.0540, -0.6539, -0.1136],
          [ 0.5492,  0.1891,  0.3625,  0.5247,  1.5407],
          [-0.5279, -0.0289,  1.4816, -0.3087,  0.7798],
          [-0.9649, -0.5929, -0.3412, -0.3266, -0.0257],
          [-0.1272, -0.7001,  0.6646, -0.2318,  1.5080],
          [-0.8094,  1.5016,  0.9072, -1.3019, -0.3740]],
 
         [[ 0.7527, -0.3794,  0.5591, -1.8409, -0.5401],
          [-0.9071, -0.6531,  0.6041,  0.7742, -1.5845],
          [ 0.8196, -0.4186,  0.3631, -1.5911,  0.8281],
          [-0.7971,  0.1316,  0.0523,  1.0176,  1.8958],
          [-1.1283, -0.7768,  0.9692,  0.6532, -0.4897],
          [ 1.7120,  0.6407, -0.8223,  0.3404, -0.2856],
          [-0.8812, -0.7588,  0.9247,  0.6373, -0.7781],
          [ 0.5911, -0.5284, -1.2771, -1.4299,  0.4362]]]),
 tensor([[0, 0, 0, 0, 1, 1, 1, 1],
         [0, 0, 0, 0, 1, 1, 1, 1]])]

In [42]:
samples_weight

tensor([0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100,
        0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100,
        0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100,
        0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100,
        0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100,
        0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100,
        0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100,
        0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100,
        0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100,
        0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100,
        0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100, 0.0100,
        0.0100, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000])