In [1]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import AutoModel, AutoTokenizer
import numpy as np
import random
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
def set_seed(seed: int):
    """
    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if
    installed).

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # ^^ safe to call this function even if cuda is not available
    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)

set_seed(1)

## Model preparation

In [2]:
model_name = "bert-base-chinese"#bert-base-chinese bert-base-uncased
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Data Processing

In [38]:
#df = pd.read_csv('twitter_normal_data.csv', header = None)

import pickle
with open('comment_final.pickle', 'rb') as f:
    df = pickle.load(f)
    df.columns = [0,1]
emojis = df[1].unique()
for i in range(len(emojis)):
    if len(df[df[1] == emojis[i]]) <= 10:
        df = df.drop(df[df[1] == emojis[i]].index, axis = 0)
emoji_dict = {}
emojis = df[1].unique()
for i in range(len(emojis)):
    emoji_dict[emojis[i]] = i
tokenizer.add_tokens(list(emojis))
tokenizer.save_pretrained('chinese-tokenizer')

df[1] = df[1].replace(emoji_dict)
with open('emoji_dict.pickle', 'wb') as f:
     pickle.dump(emoji_dict,f)

In [2]:
df = pd.read_csv('all.csv')
df.columns = [0,1,2]

with open('emoji_dict.pickle', 'rb') as f:
     emoji_dict = pickle.load(f)
df[1] = df[1].replace(emoji_dict)
df  = df.dropna()

In [3]:
for i in range(len(emoji_dict)):
    if len(df[df[1] == i]) == 1:
        df = df.drop(df[df[1] == i].index, axis = 0)
    if len(df[df[1] == i]) == 0:
        df = df.append(pd.DataFrame([['',i,''],['',i,'']]), ignore_index=True)
df = df[df[1].str.isnumeric().fillna(True)]

In [4]:
model_name = 'customized/checkpoint-15000'
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=319).to("cuda")#len(df[1].unique())
tokenizer = AutoTokenizer.from_pretrained('chinese-tokenizer')
#model.resize_token_embeddings(len(tokenizer))

In [5]:
train_text, test_text, trainY, testY = train_test_split(df[0].values, df[1].values, test_size=0.2, shuffle = True, stratify = df[1])

In [6]:
trainX = tokenizer(train_text.tolist(), truncation=True, padding=True, max_length=512)
testX = tokenizer(test_text.tolist(), truncation=True, padding=True, max_length=512)

In [7]:
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset

train_dataset = ReviewDataset(trainX, trainY)
test_dataset = ReviewDataset(testX, testY)

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # calculate accuracy using sklearn's function
    acc = accuracy_score(labels, preds)
    macro = f1_score(labels, preds, average='macro')
    micro = f1_score(labels, preds, average='micro')
    weighted = f1_score(labels, preds, average='weighted')
    return {
      'accuracy': acc,
        'macro': macro,
        'micro': micro,
        'weighted': weighted,
    }

In [9]:
training_args = TrainingArguments(
    output_dir='D:/reviewsData/result/chinese/customized',          # output directory
    num_train_epochs=9,              # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=4,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=500,               # log & save weights each logging_steps
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

In [10]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 6811
  Num Epochs = 9
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 15327


Step,Training Loss,Validation Loss,Accuracy,Macro,Micro,Weighted
500,0.4363,2.791839,0.604815,0.417247,0.604815,0.592545
1000,0.7192,2.813612,0.59542,0.407451,0.59542,0.585283
1500,0.7532,2.830524,0.598943,0.410634,0.598943,0.584184
2000,0.7291,2.774474,0.593658,0.408865,0.593658,0.580663
2500,0.7317,2.80247,0.596007,0.403153,0.596007,0.584797
3000,0.6734,2.879263,0.597769,0.409481,0.597769,0.588098
3500,0.6577,2.879191,0.59953,0.4181,0.59953,0.588382
4000,0.5729,2.808121,0.603641,0.427583,0.603641,0.59895
4500,0.6494,2.896103,0.601292,0.406254,0.601292,0.588419
5000,0.6275,2.944963,0.596594,0.403492,0.596594,0.583616


***** Running Evaluation *****
  Num examples = 1703
  Batch size = 4
Saving model checkpoint to D:/reviewsData/result/chinese/customized\checkpoint-500
Configuration saved in D:/reviewsData/result/chinese/customized\checkpoint-500\config.json
Model weights saved in D:/reviewsData/result/chinese/customized\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1703
  Batch size = 4
Saving model checkpoint to D:/reviewsData/result/chinese/customized\checkpoint-1000
Configuration saved in D:/reviewsData/result/chinese/customized\checkpoint-1000\config.json
Model weights saved in D:/reviewsData/result/chinese/customized\checkpoint-1000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1703
  Batch size = 4
Saving model checkpoint to D:/reviewsData/result/chinese/customized\checkpoint-1500
Configuration saved in D:/reviewsData/result/chinese/customized\checkpoint-1500\config.json
Model weights saved in D:/reviewsData/result/chinese/customized\check

***** Running Evaluation *****
  Num examples = 1703
  Batch size = 4
Saving model checkpoint to D:/reviewsData/result/chinese/customized\checkpoint-12500
Configuration saved in D:/reviewsData/result/chinese/customized\checkpoint-12500\config.json
Model weights saved in D:/reviewsData/result/chinese/customized\checkpoint-12500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1703
  Batch size = 4
Saving model checkpoint to D:/reviewsData/result/chinese/customized\checkpoint-13000
Configuration saved in D:/reviewsData/result/chinese/customized\checkpoint-13000\config.json
Model weights saved in D:/reviewsData/result/chinese/customized\checkpoint-13000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1703
  Batch size = 4
Saving model checkpoint to D:/reviewsData/result/chinese/customized\checkpoint-13500
Configuration saved in D:/reviewsData/result/chinese/customized\checkpoint-13500\config.json
Model weights saved in D:/reviewsData/result/chinese/custo

In [38]:
from tqdm import tqdm
import transformers
test_preds = np.zeros((len(testX)))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4, shuffle=False)
tk0 = tqdm(test_loader)
for i, x_batch in enumerate(tk0):
    print(x_batch)
    pred = model(x_batch, labels=None)
    test_preds[i * 4:(i + 1) * 4] = pred[:, 0].detach().cpu().squeeze().numpy()

test_pred = torch.sigmoid(torch.tensor(test_preds)).numpy().ravel()

  0%|                                                                                         | 0/2422 [00:00<?, ?it/s]

{'input_ids': tensor([[  101, 10163, 10587,  2156,  3904,  1997,  8038,  3363,  1056, 28394,
          2102,  2010,  4581,  2043,  8038,  3363,  2001,  2655,  2378,  2032,
         11669,  2005,  2086,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  




AttributeError: 'dict' object has no attribute 'size'

In [21]:
training_args = torch.load('checkpoint-19500/training_args.bin')
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=True,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=./logs,
logging_first

In [32]:
test_dataset[0]

{'input_ids': tensor([  101, 10163, 10587,  2156,  3904,  1997,  8038,  3363,  1056, 28394,
          2102,  2010,  4581,  2043,  8038,  3363,  2001,  2655,  2378,  2032,
         11669,  2005,  2086,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [15]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(testY, y_pred, average='weighted')

ValueError: Found input variables with inconsistent numbers of samples: [9687, 6]

In [None]:
# saving the fine tuned model & tokenizer
model_path = "10000-reviews-bert-base-uncased"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
def get_prediction(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    return target_names[probs.argmax()]

In [None]:
# Example #1
text = """
The first thing is first. 
If you purchase a Macbook, you should not encounter performance issues that will prevent you from learning to code efficiently.
However, in the off chance that you have to deal with a slow computer, you will need to make some adjustments. 
Having too many background apps running in the background is one of the most common causes. 
The same can be said about a lack of drive storage. 
For that, it helps if you uninstall xcode and other unnecessary applications, as well as temporary system junk like caches and old backups.
"""
print(get_prediction(text))

In [16]:
import numpy as np
import torch
token_frequency = {'a':11, 'b':1}
token_feature = {'a':torch.tensor([1,2,3]), 'b':torch.tensor([3,2,1])}
token_feature_list = token_feature.values()
token_feature_list = np.array([item.cpu().detach().numpy() for item in token_feature_list])
token_feature_list[token_frequency > 10]

TypeError: '>' not supported between instances of 'dict' and 'int'