In [None]:
from google.colab import auth
# Authenticate the Colab user (presumably so the later gcloud/gsutil commands
# can reach the project's resources — confirm).
auth.authenticate_user()

# https://cloud.google.com/resource-manager/docs/creating-managing-projects
project_id = 'wikidata-319717'
# Make this the active project for the gcloud CLI; IPython substitutes
# {project_id} with the Python variable above before running the command.
!gcloud config set project {project_id}

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the dataset and checkpoint paths used
# in later cells all live under /content/gdrive/MyDrive/psychiq/model3.
drive.mount('/content/gdrive')

In [None]:
!pip install transformers[torch] datasets
!pip install evaluate 
!pip install huggingface_hub scikit-learn

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login for this notebook session.
# NOTE(review): presumably needed for the hub_model_id configured in the
# training arguments further down — confirm whether a push is intended.
notebook_login()

In [None]:
# Copy relation_ids.parquet from the wikidata-de GCS bucket into the current
# working directory; it is read back by relative path when building relation_map.
!gsutil cp gs://wikidata-de/relation_ids.parquet .

In [None]:
import pandas as pd
import glob
import transformers
import numpy as np

In [None]:
# Load the dataframe stored in the model3 folder on the mounted Drive.
# NOTE(review): `df` is not referenced by any later cell visible here —
# confirm it is still needed before paying the load cost.
df = pd.read_parquet("/content/gdrive/MyDrive/psychiq/model3/df.parquet")

In [None]:
# Relation table copied from GCS into the working directory.
relations = pd.read_parquet('relation_ids.parquet')

# Shift the ids down by one so they run from 0 to 999.
relations['relation_id'] = relations['relation_id'] - 1

# class id -> (relation, target), one entry per row of the table.
relation_map = {
    rel_id: (relation, target)
    for rel_id, relation, target in zip(
        relations['relation_id'], relations['relation'], relations['target']
    )
}

# Extra catch-all class placed after the last real relation id.
relation_map[1000] = ('unknown', 'unknown')

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from datasets import Dataset

# Tokenizer for the same "distilbert-base-uncased" checkpoint the classifier is built from.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with truncation enabled.

    Args:
        examples: mapping-like object exposing a ``"text"`` key
            (presumably a batch from ``Dataset.map`` — confirm against the caller).

    Returns:
        The result of calling the module-level ``tokenizer`` on that text.
    """
    text = examples["text"]
    return tokenizer(text, truncation=True)


# Padding collator built on the same tokenizer; handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from datasets import Features, ClassLabel, Value
# One "<relation>-<target>" label string per class, ordered by ascending class id.
names = [
    f"{relation}-{target}"
    for relation, target in (relation_map[class_id] for class_id in sorted(relation_map))
]

# Position in `names` is the class index; build both lookup directions.
id2name = dict(enumerate(names))
name2id = {label: index for index, label in id2name.items()}

# Dataset schema: raw text plus a 1001-way class label (ids 0..999 and 'unknown').
features = Features({
    'text': Value('string'),
    'labels': ClassLabel(num_classes=1001, names=names),
})

In [None]:
from datasets import load_from_disk
# Load the dataset saved at split_dataset on Drive; later cells index it with
# 'train' and 'test'.
# presumably already tokenized, going by the variable name — TODO confirm
tokenized_training = load_from_disk('/content/gdrive/MyDrive/psychiq/model3/split_dataset')

In [None]:
# Start from the stock DistilBERT config and attach the label <-> id mappings
# so the classification head is built with one output per label.
config = AutoConfig.from_pretrained(
    "distilbert-base-uncased",
    id2label=id2name,
    label2id=name2id,
)

# Pretrained weights from the same checkpoint, wrapped for sequence classification.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    config=config,
)

# Hyper-parameters for a single-epoch fine-tune; checkpoints go to Drive.
# NOTE(review): hub_model_id is set but push_to_hub is not, so the Hub id
# presumably has no effect — confirm whether pushing was intended.
training_args = TrainingArguments(
    # where checkpoints are written and how many are kept
    output_dir="/content/gdrive/MyDrive/psychiq/model3",
    save_steps=10000,
    save_total_limit=2,
    # optimisation
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Hugging Face Hub repository id
    hub_model_id="derenrich/psychiq2",
)

# Train on the 'train' split of the combined dataset, evaluating on its 'test' split.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_training['train'],
    eval_dataset=tokenized_training['test'],
)

# Always start a fresh run; never pick up an existing checkpoint from output_dir.
trainer.train(resume_from_checkpoint=False)


In [None]:
from datasets import load_from_disk


# Held-out split used as the evaluation set for every shard below.
test = load_from_disk("/content/gdrive/MyDrive/psychiq/model3/split_test")

# Same hyper-parameters as the single-dataset run, redefined so this cell can
# be executed on its own.
# NOTE(review): hub_model_id is set but push_to_hub is not, so the Hub id
# presumably has no effect — confirm whether pushing was intended.
training_args = TrainingArguments(
    # where checkpoints are written and how many are kept
    output_dir="/content/gdrive/MyDrive/psychiq/model3",
    save_steps=10000,
    save_total_limit=2,
    # optimisation
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Hugging Face Hub repository id
    hub_model_id="derenrich/psychiq2",
)

# Number of training shards saved on Drive as split_train_0 .. split_train_{NSHARDS-1}.
NSHARDS = 10
for i in range(NSHARDS):
  # Load one shard at a time so only the current shard is held by this loop.
  train = load_from_disk(f"/content/gdrive/MyDrive/psychiq/model3/split_train_{i}")
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train,
      eval_dataset=test,
      tokenizer=tokenizer,
      data_collator=data_collator,
  )
  # Do NOT resume from a checkpoint between shards. The same in-memory `model`
  # is handed to every Trainer, so its weights already carry over from shard to
  # shard. Resuming would reload the last checkpoint in output_dir, which
  #   * restores the previous shard's step counter, making the Trainer skip
  #     that many batches of the new shard (possibly the entire shard),
  #   * replaces the current weights with the older checkpointed ones, and
  #   * raises if no checkpoint happens to exist in output_dir.
  # Trade-off: each shard starts with a fresh optimizer and learning-rate schedule.
  trainer.train(resume_from_checkpoint=False)
  # Drop this loop's reference to the shard before loading the next one.
  del train

In [None]:
# Evaluate the most recently constructed `trainer` on the 'test' split of
# tokenized_training; the returned metrics are shown as the cell output.
# NOTE(review): no compute_metrics is passed to any Trainer in this notebook,
# so this presumably reports loss only — confirm.
trainer.evaluate(tokenized_training['test'])

In [None]:
# Run prediction over the 'test' split of tokenized_training with the most
# recently constructed `trainer` and keep the result in `preds`.
preds = trainer.predict(tokenized_training['test'])