<a href="https://colab.research.google.com/github/Chizuchizu/student_cup_chizuchizu/blob/master/notebooks/fujito_miss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pulp simpletransformers

In [None]:
import os, gc, sys
import random

import pandas as pd
import numpy as np
import pulp

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import MinMaxScaler

from scipy import stats

from simpletransformers.classification import ClassificationModel
import torch

In [None]:
# --- Reproducibility and data location -------------------------------------
SEED = 2020
BASE_PATH = '/content/drive/My Drive/data/'

# --- Dataset columns and task shape -----------------------------------------
TEXT_COL = "description"
TARGET = "jobflag"
NUM_CLASS = 4
N_FOLDS = 4

# --- Pretrained model selection ---------------------------------------------
MODEL_TYPE = "bert"
MODEL_NAME = "bert-base-uncased"

# --- Experiment switches ----------------------------------------------------
augmentation = False
memo = "hack_code_"
SET_NUM = 2  # amount of data per set

# Keyword arguments handed to the classification model as its `args`.
# "output_dir" is deliberately not set here (the "outputs/" entry is disabled).
params = dict(
    max_seq_length=64,
    train_batch_size=64,
    eval_batch_size=64,
    num_train_epochs=5,
    learning_rate=1e-4,
    reprocess_input_data=True,
    do_lower_case=True,
    manual_seed=SEED,
    verbose=False,
    save_eval_checkpoints=False,
    overwrite_output_dir=True,
)

In [None]:
def metric_f1(labels, preds):
    """Return the macro-averaged F1 score of ``preds`` against ``labels``.

    Parameters
    ----------
    labels : array-like
        Ground-truth class labels.
    preds : array-like
        Predicted class labels, aligned with ``labels``.
    """
    # Resolve the scorer through a function-local import so this metric does
    # not depend on whatever the global name `f1_score` is bound to when it
    # is called.
    from sklearn import metrics as sk_metrics

    return sk_metrics.f1_score(y_true=labels, y_pred=preds, average='macro')

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random``, NumPy's global generator, and PyTorch
    (CPU and the current CUDA device). Also exports ``PYTHONHASHSEED`` and
    switches cuDNN to deterministic mode.

    Parameters
    ----------
    seed : int
        Seed value shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed, same order as a sequence of explicit calls would use.
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True

In [None]:

# Number of rows expected for each class (index = class id), as estimated by
# @yCarbon (see the past forum discussion).
N_CLASSES = [404, 320, 345, 674]


def hack(prob, n_classes=None):
    """Pick one class per row by solving a count-constrained assignment problem.

    The scores are min-max scaled, turned into log values, and a binary
    program is solved that maximises the summed log value of the selected
    (row, class) pairs subject to two sets of constraints:

    * every row receives exactly one class;
    * class ``k`` receives exactly ``n_classes[k]`` rows.

    Parameters
    ----------
    prob : array-like of shape (N, K)
        Per-row, per-class scores (the caller passes raw model outputs).
    n_classes : sequence of int, optional
        Required number of rows per class; only the first ``K`` entries are
        used. Defaults to the module-level ``N_CLASSES``.

    Returns
    -------
    numpy.ndarray of shape (N,)
        Chosen class index for each row.

    Raises
    ------
    ValueError
        If the class counts cannot add up to the number of rows, which would
        make the problem infeasible.
    RuntimeError
        If the solver does not report an optimal solution.
    """
    if n_classes is None:
        n_classes = N_CLASSES

    # MinMaxScaler rescales each column (class) independently into [0, 1];
    # the small epsilon keeps log() finite for the column minimum.
    # NOTE(review): these are scaled scores rather than calibrated
    # probabilities, so the objective is only a log-likelihood surrogate.
    mm = MinMaxScaler()
    prob = mm.fit_transform(prob)
    logp = np.log(prob + 1e-4)
    N, K = prob.shape

    # Fail fast with a readable message instead of building and solving an
    # N*K-variable program that is infeasible by construction.
    counts = list(n_classes)[:K]
    if len(counts) != K or sum(counts) != N:
        raise ValueError(
            f"class counts {counts} must have {K} entries summing to {N} rows"
        )

    m = pulp.LpProblem('Problem', pulp.LpMaximize)  # maximisation problem

    # Decision variables (= submitted labels): x[(i, j)] == 1 iff row i gets class j.
    x = pulp.LpVariable.dicts(
        'x', [(i, j) for i in range(N) for j in range(K)],
        lowBound=0, upBound=1, cat=pulp.LpBinary,
    )

    # Objective: summed log value of the selected assignments.
    m += pulp.lpSum([x[(i, j)] * logp[i, j] for i in range(N) for j in range(K)])

    # Exactly one predicted class per row (i.e., SOS1).
    for i in range(N):
        m += pulp.lpSum([x[(i, k)] for k in range(K)]) == 1

    # Exactly the estimated number of rows per class.
    for k in range(K):
        m += pulp.lpSum([x[(i, k)] for i in range(N)]) == counts[k]

    m.solve()

    # Explicit raise rather than `assert`, so the check survives `python -O`.
    if m.status != pulp.LpStatusOptimal:
        raise RuntimeError(
            f"solver did not reach an optimal solution: {pulp.LpStatus[m.status]}"
        )

    # Round before casting: a solver may report a binary variable as
    # 0.9999999, which plain int() would truncate to 0 and thereby erase the
    # row's selected class.
    x_ast = np.array([[int(round(x[(i, j)].value())) for j in range(K)] for i in range(N)])
    return x_ast.argmax(axis=1)  # one-hot rows -> class indices {0, ..., K-1}

In [None]:
seed_everything(SEED)


def _clean_text(text):
    """Remove every literal '.' from a string Series and trim surrounding whitespace."""
    # regex=False pins '.' to a literal dot on every pandas version; as a
    # regular expression it would mean "any character".
    return text.str.replace(".", "", regex=False).str.strip()


# ---- Training data ----------------------------------------------------------
train = pd.read_csv(BASE_PATH + "train.csv").drop(['id'], axis=1)
train = train.rename(columns={TEXT_COL: 'text', TARGET: 'label'})
# Shift labels down by one (presumably 1..4 -> 0..3); the shift is undone
# with `+ 1` when the submission frame is built below.
train['label'] -= 1
train["text"] = _clean_text(train["text"])

length = len(train)
# Original rows occupy the even index slots; the odd slots are left free for
# augmented rows so that sort_index() places each pair next to each other.
train.index = range(0, length * 2, 2)

# Per-class weights (total rows / rows in class, ordered by label), computed
# from the original rows only.
weight = len(train) / train["label"].value_counts().sort_index().values

if augmentation:
    # Restored from the previously commented-out loading code: without it
    # this branch raised NameError because `train_aug` was never defined.
    # NOTE(review): assumes train_fr_en.csv has one row per training row, in
    # the same order -- confirm.
    train_aug = pd.read_csv(BASE_PATH + "train_fr_en.csv").rename(columns={"transrated": 'text', TARGET: 'label'})
    train_aug["label"] -= 1
    train_aug["text"] = _clean_text(train_aug["text"])
    train_aug.index = range(1, length * 2, 2)
    # Align columns with `train` so the text/label column order is unchanged.
    train = pd.concat([train, train_aug.reindex(columns=train.columns)])
    train = train.sort_index()

# ---- Test data --------------------------------------------------------------
# Read once and keep the full frame: `id` is needed for the submission.
test = pd.read_csv(BASE_PATH + "test.csv")
# Apply the same cleaning as for the training text so the model is not
# evaluated on differently formatted input.
test_texts = _clean_text(test[TEXT_COL]).tolist()

# ---- Cross-validation setup -------------------------------------------------
# Consecutive rows are bucketed SET_NUM at a time so that a bucket never
# straddles the train/validation split.
groups = [i // SET_NUM for i in range(train.shape[0])]

# One column of predicted labels per fold.
y_pred = np.zeros((test.shape[0], N_FOLDS))

group_kfold = GroupKFold(n_splits=N_FOLDS)
# Running mean of the per-fold validation F1. This rebinds the global name
# `f1_score`; the name is kept because the export cell reads it.
f1_score = 0.0

for fold, (train_idx, valid_idx) in enumerate(group_kfold.split(train.index, train['label'], groups)):
    X_train = train.iloc[train_idx]
    X_valid = train.iloc[valid_idx]

    model = ClassificationModel(model_type=MODEL_TYPE, model_name=MODEL_NAME, num_labels=NUM_CLASS,
                                args=params, use_cuda=True, weight=weight.tolist())

    model.train_model(X_train)

    result, model_outputs, wrong_predictions = model.eval_model(X_valid, f1=metric_f1)
    print(result)
    f1_score += result["f1"] / N_FOLDS

    fold_pred, raw_outputs = model.predict(test_texts)

    # Re-assign labels so that the per-class counts match the estimates.
    y_pred[:, fold] = hack(raw_outputs)

    # Release this fold's model before the next one is created.
    del model
    gc.collect()
    torch.cuda.empty_cache()

print(f1_score)
# Majority vote (mode) across folds.
y_pred = stats.mode(y_pred, axis=1)[0].flatten().astype(int)

submit = pd.DataFrame({'index': test['id'], 'pred': y_pred + 1})



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=2197.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=35.0, style=ProgressStyle(desc…






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=35.0, style=ProgressStyle(desc…






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=35.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=35.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=35.0, style=ProgressStyle(desc…





  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=734.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=12.0, style=ProgressStyle(descri…


{'mcc': 0.47491818394253005, 'f1': 0.5905299284680487, 'eval_loss': 1.2095270405213039}


HBox(children=(FloatProgress(value=0.0, max=1743.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=2198.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=35.0, style=ProgressStyle(desc…






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=35.0, style=ProgressStyle(desc…






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=35.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=35.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=35.0, style=ProgressStyle(desc…





  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=733.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=12.0, style=ProgressStyle(descri…


{'mcc': 0.47206344362232905, 'f1': 0.5814036781035661, 'eval_loss': 1.2821154942115147}


HBox(children=(FloatProgress(value=0.0, max=1743.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=2199.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=35.0, style=ProgressStyle(desc…






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=35.0, style=ProgressStyle(desc…






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=35.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=35.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=35.0, style=ProgressStyle(desc…





  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=732.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=12.0, style=ProgressStyle(descri…


{'mcc': 0.534181756436174, 'f1': 0.6300792095605299, 'eval_loss': 1.3264552354812622}


HBox(children=(FloatProgress(value=0.0, max=1743.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=2199.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=35.0, style=ProgressStyle(desc…






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=35.0, style=ProgressStyle(desc…






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=35.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=35.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=35.0, style=ProgressStyle(desc…





  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=732.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=12.0, style=ProgressStyle(descri…


{'mcc': 0.51789257266184, 'f1': 0.622903656195181, 'eval_loss': 0.9459997415542603}


HBox(children=(FloatProgress(value=0.0, max=1743.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))


0.6062291180818314


In [None]:
# Derive the file-name tag from the actual run configuration so the name can
# never contradict the `augmentation` flag.
aug = "aug" if augmentation else "non_aug"

# to_csv does not create missing directories; make sure the target exists.
os.makedirs("/content/outputs", exist_ok=True)

# File name encodes: augmentation tag, model name, mean CV F1 (3 decimals), memo.
submit.to_csv(f"/content/outputs/submit_{aug}_{MODEL_NAME}_{round(f1_score, 3)}_{memo}.csv", index=False, header=False)