# Imports

In [None]:
import os

import pandas as pd

import chromadb
from google.colab import drive, files
from chromadb import Documents, EmbeddingFunction, Embeddings
import chromadb.utils.embedding_functions as embedding_functions

from chromadb import Documents, EmbeddingFunction, Embeddings

from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BertForMaskedLM
)
from datasets import (
    load_dataset,
    DatasetDict,
    ClassLabel,
    Value,
    Dataset,
)


# Dataset

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NLP/Project/Dataset/final_dataset.csv')

In [None]:
df['total_summary'] = df.apply(lambda row: '\n'.join([item for item in [row['summary_digi'],row['summary_kio'],row['summary_tiny'],row['summary_uptv'],row['story']] if str(item) != 'nan']), axis=1)

In [None]:
config = AutoConfig.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")
model = BertForMaskedLM.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")

Some weights of the model checkpoint at HooshvareLab/bert-base-parsbert-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
train_dataset = Dataset.from_pandas(df[['total_summary']])

In [None]:
train_dataset

Dataset({
    features: ['total_summary'],
    num_rows: 9447
})

In [None]:
def tokenize_function(row):
    return tokenizer(row["total_summary"], padding="max_length", truncation=True, max_length=512)

In [None]:
tokenized_dataset = train_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/9447 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset = tokenized_dataset.remove_columns(["total_summary"])

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

In [40]:
trainer.train()

Step,Training Loss
10,1.9334
20,1.6961
30,1.7895
40,1.6506
50,1.9138
60,1.9506
70,1.8237
80,1.761
90,1.7479
100,1.8069


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.53 GiB. GPU 0 has a total capacty of 15.77 GiB of which 560.38 MiB is free. Process 115074 has 15.22 GiB memory in use. Of the allocated memory 13.25 GiB is allocated by PyTorch, and 1.60 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

Save the fine-tuned model:

In [42]:
from huggingface_hub import login

login(token='#######################') # HIDDEN!

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [43]:
model.push_to_hub('parsbert-movie-finetuned')
tokenizer.push_to_hub('parsbert-movie-finetuned')

model.safetensors:   0%|          | 0.00/652M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/drippypale/parsbert-movie-finetuned/commit/4b997ce7d5c68c6d95480efd05203bac94c76778', commit_message='Upload tokenizer', commit_description='', oid='4b997ce7d5c68c6d95480efd05203bac94c76778', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
chroma_client = chromadb.PersistentClient(path="/content/drive/MyDrive/Colab Notebooks/NLP/Project/chromadb")

In [None]:
class MyEmbeddingParsBert(EmbeddingFunction):
    def __init__(self, model,tokenizer):
      self.model = model
      self.tokenizer =  tokenizer
    def get_embedding(self,text):
      inputs = self.tokenizer(text, return_tensors='pt')
      outputs = self.model(**inputs, output_hidden_states=True)
      embedding = outputs.hidden_states[-1][0,0,:]
      return embedding.tolist()

    def __call__(self, input: Documents) -> Embeddings:
        embeddings = [self.get_embedding(doc) for doc in input]
        return embeddings

In [None]:
embedding_function = MyEmbeddingParsBert(model,tokenizer)

In [None]:
collection = chroma_client.get_or_create_collection(name = "bertpersion",embedding_function=embedding_function)


In [None]:
text = "love with you for ever"

collection.add(
    documents=[text],
    metadatas=[{"actor": "ali","director":"amir"}],
    ids=['4']
    )



In [None]:
text = "mohmmad asadi love to be gay,G spot is his favorite"

collection.add(
    documents=[text],
    metadatas=[{"actor": "asadi","director":"taha"}],
    ids=['3']
    )


In [None]:
text = "a beautiful eye of a random animal"

collection.add(
    documents=[text],
    metadatas=[{"actor": "taha","director":"taha"}],
    ids=['2']
    )


In [None]:
text = "where is my treats or I will pull a gun"

collection.add(
    documents=[text],
    metadatas=[{"actor": "people","director":"ramin"}],
    ids=['1']
    )


In [None]:
text = "50thy shade of brown"

collection.add(
    documents=[text],
    metadatas=[{"actor": "mehrab","director":"mehrab"}],
    ids=['0']
    )


In [None]:
collection.query(
    query_texts=["my honey"],
    n_results=2,
    # where = {"actor": "mehrab"},
    # where_document={"$contains": "love"}
)

{'ids': [['4', '2']],
 'distances': [[170.8740095213921, 219.07338079256547]],
 'metadatas': [[{'actor': 'ali', 'director': 'amir'},
   {'actor': 'taha', 'director': 'taha'}]],
 'embeddings': None,
 'documents': [['love with you for ever',
   'a beautiful eye of a random animal']],
 'uris': None,
 'data': None}