In [None]:
import os
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")
# TensorFlow native-log verbosity; set here, before the `import tensorflow`
# that happens in a later cell.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# Set `PATH` to include the directory containing saved_model_cli
# (IPython magics: read the current PATH, then prepend the user-local bin directory).
PATH = %env PATH
%env PATH=/home/jupyter/.local/bin:{PATH}

In [None]:
!pip install datasets

In [None]:
import datetime
import shutil

import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_text as text


# NOTE(review): the import below is left commented out; nothing in the visible
# cells uses `optimization`.
#from official.nlp import optimization
# Only let TensorFlow's Python logger emit messages at ERROR level.
tf.get_logger().setLevel("ERROR")

In [None]:
!git clone https://github.com/eminedemirbas/Hackathon-YZTA

In [None]:
# Location of the cloned repository and of its `data` sub-folder.
repo_path = "/content/Hackathon-YZTA"
data_path = os.path.join(repo_path, 'data')

# Report whether the data folder is present (user-facing messages are in Turkish).
if not os.path.exists(data_path):
    print("Data klasörü bulunamadı.")
else:
    print(f"Data klasörü bulundu: {data_path}")

In [None]:
# Build the train/test folder paths that the loader will read from.
train_dir, test_dir = (
    os.path.join(data_path, split_name) for split_name in ("train", "test")
)

In [None]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict

def load_data_from_directories(train_dir, test_dir):
    """Load a pos/neg text-classification corpus from two directory trees.

    Each of ``train_dir`` and ``test_dir`` must contain a ``pos`` and a ``neg``
    sub-folder holding UTF-8 ``.txt`` files, one document per file. Files with
    any other extension are ignored.

    Args:
        train_dir: Path of the folder with the training ``pos``/``neg`` folders.
        test_dir: Path of the folder with the test ``pos``/``neg`` folders.

    Returns:
        A ``DatasetDict`` with ``"train"`` and ``"test"`` splits. Every row has
        a ``text`` column (file contents) and a ``label`` column
        (1 for ``pos``, 0 for ``neg``). Rows of each split are shuffled with a
        fixed seed.

    Raises:
        FileNotFoundError: If a ``pos`` or ``neg`` sub-folder is missing.
    """

    def read_from_category(base_path):
        """Read every ``.txt`` file under ``base_path``/{pos,neg} into a DataFrame."""
        texts, labels = [], []
        for label_name, label_value in (("pos", 1), ("neg", 0)):
            folder = os.path.join(base_path, label_name)
            # os.listdir returns entries in arbitrary, filesystem-dependent
            # order; sorting makes the seeded shuffle below reproducible.
            for filename in sorted(os.listdir(folder)):
                if not filename.endswith(".txt"):
                    continue
                file_path = os.path.join(folder, filename)
                with open(file_path, "r", encoding="utf-8") as file:
                    texts.append(file.read())
                labels.append(label_value)
        return pd.DataFrame({"text": texts, "label": labels})

    train_df = read_from_category(train_dir)
    test_df = read_from_category(test_dir)

    # Shuffle so positive and negative examples are interleaved; the fixed
    # random_state keeps the order stable between runs.
    train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
    test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

    dataset = DatasetDict({
        "train": Dataset.from_pandas(train_df),
        "test": Dataset.from_pandas(test_df)
    })

    return dataset

# Read both splits from disk into a single DatasetDict.
dataset = load_data_from_directories(train_dir=train_dir, test_dir=test_dir)

In [None]:
from transformers import AutoTokenizer

# Hugging Face checkpoint identifier; reused later when the classification
# model is loaded so tokenizer and model come from the same checkpoint.
model_ckpt = "dbmdz/bert-base-turkish-cased"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
def tokenize(example):
    """Tokenize the ``text`` field of ``example`` with the notebook's tokenizer.

    Inputs are truncated and padded so every sequence is exactly 128 tokens.
    Returns whatever the tokenizer call returns.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(example["text"], **encoding_options)

# Apply `tokenize` to every split; with batched=True the function receives a
# batch of examples per call rather than a single example.
tokenized_dataset = dataset.map(tokenize, batched=True)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Load the pretrained checkpoint with a two-class sequence-classification head
# (labels: 0 = neg, 1 = pos, matching the data loader).
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=2)

from transformers import TrainingArguments
# Also switch Weights & Biases off through its environment variable.
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./sentiment_results",
    num_train_epochs=3,  # Number of training epochs; change as needed
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    # The string "none" disables every reporting integration. Passing the
    # Python value None does NOT: in transformers 4.x it falls back to "all"
    # installed integrations.
    report_to="none"
)


def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and F1 scores.

    The predicted class of each row is the index of its largest logit along
    the last axis. Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)
    accuracy = accuracy_score(gold, predicted)
    f1 = f1_score(gold, predicted)
    return dict(accuracy=accuracy, f1=f1)

# Wire the model, training arguments, tokenized splits and metric function
# into a Trainer.
# NOTE(review): recent transformers releases deprecate the `tokenizer=`
# argument in favour of `processing_class=` — confirm against the installed
# version before changing it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Run fine-tuning. No evaluation schedule is set in `training_args` above, so
# `compute_metrics` is presumably only used by explicit `trainer.evaluate()`
# calls — TODO confirm.
trainer.train()

In [None]:
# Save the fine-tuned model and its tokenizer into the same directory so they
# can be reloaded together.
# NOTE(review): no Google Drive mount call is visible in this notebook; if
# Drive is not mounted this path is presumably just a local folder on the
# runtime's disk — confirm.
model.save_pretrained('/content/drive/MyDrive/saved_model')
tokenizer.save_pretrained('/content/drive/MyDrive/saved_model')

In [None]:
# Score the fine-tuned model on the held-out test split.
results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])

# Print the test accuracy (the message is user-facing, in Turkish).
print(f"Test doğruluğu: {results['eval_accuracy']}")

In [None]:
import torch

# Pick the device automatically (GPU when available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to that device and put it in evaluation mode. Without
# `model.eval()` dropout can still be active after `trainer.train()` (for
# example when the evaluation cell was skipped), which makes predictions noisy.
model.to(device)
model.eval()

text = "Bugün karbon izin az çıktı, bilinçlisin!"

# Tokenize the sentence and move every input tensor to the same device
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
inputs = {key: val.to(device) for key, val in inputs.items()}

# Run the forward pass without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Map the highest-scoring class index to its label name.
# Index 0 = "neg", 1 = "pos", matching the label values assigned by the loader.
predicted_class_id = logits.argmax(dim=-1).item()
labels = ["neg", "pos"]

predicted_label = labels[predicted_class_id]
print(f"Metin: {text}")
print(f"Modelin tahmini: {predicted_label}")


In [None]:
# Recursively archive the saved model directory into /content/saved_model.zip
# (shell command run through IPython's `!` escape).
!zip -r /content/saved_model.zip /content/drive/MyDrive/saved_model

In [None]:
from google.colab import files
# Hand the archive created by the zip cell to the Colab file-download helper.
files.download("/content/saved_model.zip")