In [None]:
# Install the notebook's dependencies. `%pip` installs into the environment of
# the running kernel, which a bare `!pip` shell call does not guarantee.
%pip install datasets
%pip install torch
# One command replaces the two separate transformers installs. The extras spec
# is quoted so the shell cannot interpret the square brackets as a glob pattern.
%pip install -q -U "transformers[torch]" accelerate
%pip install pandas
%pip install langdetect

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from datasets import Dataset
import pandas as pd
import torch
from langdetect import detect, LangDetectException
from google.colab import drive

def filter_english(df, text_column):
    """Return the rows of ``df`` whose ``text_column`` is detected as English.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    text_column : str
        Name of the column holding the text passed to ``langdetect.detect``.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` for which detection returned ``'en'``. Row labels
        are kept as they were in ``df`` (the index is not reset).
    """
    # langdetect samples randomly unless its seed is pinned, so the same text
    # can be classified differently between runs. Fix the seed so that the
    # filtered dataset is reproducible.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    def is_english(text):
        # Non-string cells (NaN, numbers, ...) cannot be classified; treat them
        # as non-English instead of letting detect() fail on them.
        if not isinstance(text, str):
            return False
        try:
            return detect(text) == 'en'
        except LangDetectException:
            # Detection failed for this text: drop the row.
            return False

    # astype(bool) keeps the mask boolean even when the frame has no rows.
    english_mask = df[text_column].apply(is_english).astype(bool)
    return df[english_mask]

# Mount Google Drive so the CSV files under "My Drive" can be read.
drive.mount('/content/drive/')

# First movie dataset: keep rows where title/overview/vote fields are all
# present and the overview is detected as English, then keep only those columns.
movie_one_columns = ['title', 'overview', 'vote_average', 'vote_count']
datasetMovieOne = (
    pd.read_csv('/content/drive/My Drive/Fine-tuning/movies_metadata.csv', low_memory=False)
    .dropna(subset=movie_one_columns)
    .reset_index(drop=True)
    .pipe(filter_english, 'overview')
    .loc[:, movie_one_columns]
)

# Second movie dataset: the movies table joined to its credits table.
df1 = pd.read_csv('/content/drive/My Drive/Fine-tuning/tmdb_5000_credits.csv')
df2 = pd.read_csv('/content/drive/My Drive/Fine-tuning/tmdb_5000_movies.csv')
# Relabel the credits columns by position so the join key is named 'id'.
df1.columns = ['id', 'title', 'cast', 'crew']
movie_two_columns = ['original_title', 'overview', 'vote_average', 'vote_count']
# Same cleaning steps as the first dataset: complete rows, English overviews,
# then only the columns of interest.
datasetMovieTwo = (
    pd.merge(df2, df1, on='id')
    .dropna(subset=movie_two_columns)
    .reset_index(drop=True)
    .pipe(filter_english, 'overview')
    .loc[:, movie_two_columns]
)

# Load the book dataset.
# The eleven shards (book_data1.csv ... book_data11.csv) share one naming
# pattern, so they are read in a loop instead of eleven copy-pasted statements.
book_frames = [
    pd.read_csv(f'/content/drive/My Drive/Fine-tuning/book_data{shard}.csv')
    for shard in range(1, 12)
]
book_columns = ['Name', 'Description', 'Rating', 'CountsOfReview']
datasetBooks = pd.concat(book_frames, ignore_index=True)
# The individual shards are no longer needed once concatenated.
del book_frames
datasetBooks = datasetBooks.dropna(subset=book_columns).reset_index(drop=True)
# Work on a fixed-seed sample of at most 50000 rows (presumably to bound the
# cost of language detection). The size is capped at the number of available
# rows because sample() without replacement raises ValueError when n exceeds it.
book_sample_size = min(50000, len(datasetBooks))
datasetBooks = datasetBooks.sample(n=book_sample_size, random_state=42).reset_index(drop=True)
datasetBooks = filter_english(datasetBooks, 'Description')
datasetBooks = datasetBooks[book_columns]
print("grabbed datasets")

In [None]:
def extract_text(df, text_columns):
    """Build one space-joined string per row from the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    text_columns : list of str
        Columns whose values are concatenated, in this order.

    Returns
    -------
    list of str
        One entry per row of ``df``. Missing values are skipped, so a row with
        no usable value yields an empty string. Non-string values are converted
        with ``str`` rather than making the join fail.
    """
    # Plain tuples avoid building a Series per row, which is what
    # DataFrame.apply(axis=1) does.
    rows = df[text_columns].itertuples(index=False, name=None)
    return [
        ' '.join(str(value) for value in row if pd.notna(value))
        for row in rows
    ]

# Each corpus is paired with the columns that make up its text
# (title-like column first, description-like column second).
text_sources = (
    (datasetMovieOne, ['title', 'overview']),
    (datasetMovieTwo, ['original_title', 'overview']),
    (datasetBooks, ['Name', 'Description']),
)
movie_one_texts, movie_two_texts, book_texts = (
    extract_text(frame, columns) for frame, columns in text_sources
)
# Pool the three corpora into one flat list, movies first and books last.
all_texts = [
    text
    for corpus in (movie_one_texts, movie_two_texts, book_texts)
    for text in corpus
]
print("finished extracting and combining text")

# Tokenize the text data
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")


def tokenize_function(examples):
    """Tokenize one string or a list of strings.

    Sequences are truncated and padded to exactly 512 tokens.
    NOTE(review): padding every text to 512 tokens makes each training batch
    full length; dynamic padding in the collator may be cheaper. Left unchanged
    here so the function's output stays the same.
    """
    return tokenizer(examples, truncation=True, padding="max_length", max_length=512)


def _tokenize_batch(batch):
    """Adapter for ``Dataset.map``: tokenize a batch of texts.

    Only ``input_ids`` and ``attention_mask`` are returned, so the dataset
    holds the same two columns as before.
    """
    encoded = tokenize_function(batch["text"])
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
    }


# Tokenize in batches through Dataset.map instead of one text at a time in a
# Python loop. This avoids holding every per-text encoding plus two dense
# tensors in memory before the dataset is built.
tokenized_dataset = Dataset.from_dict({"text": all_texts}).map(
    _tokenize_batch,
    batched=True,
    remove_columns=["text"],
)
print("finished tokenizing")

In [None]:
# Fine-tune the pretrained BERT checkpoint with masked-language modelling.
# The collator is configured for MLM with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Training outputs are written to the mounted Drive folder given by output_dir.
training_args = TrainingArguments(
    output_dir='/content/drive/My Drive/Fine-tuning/model_results',
    per_device_train_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=500,
)

model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased")

trainer = Trainer(
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    args=training_args,
    model=model,
)
trainer.train()
print("finished training")

# Write the fine-tuned model to the "bert_fine-tuned" directory
# (a relative path, so it lands in the current working directory).
trainer.save_model("bert_fine-tuned")

In [None]:
!pip install huggingface_hub
from google.colab import userdata

# The Hugging Face access token is read from the Colab secret named 'HF_TOKEN',
# so it never appears in the notebook itself.
HF_TOKEN = userdata.get('HF_TOKEN')

# Upload the model first and then the tokenizer to the same Hub repository.
hub_repo = "emma7897/CSI4999"
for artifact in (model, tokenizer):
    artifact.push_to_hub(hub_repo, token=HF_TOKEN)