In [3]:
import pandas as pd
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer
from tensorflow.keras.optimizers.schedules import PolynomialDecay

# Tokenizer ve modeli yükle
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')
model = TFBertForSequenceClassification.from_pretrained('dbmdz/bert-base-turkish-cased', num_labels=2)

# CSV dosyasını oku
df = pd.read_csv('DF.csv')

# Verileri temizleyin ve string formatına dönüştürün
df['yorum'] = df['yorum'].astype(str).fillna('')

# Verileri ve etiketleri hazırlayın
texts = df['yorum'].tolist()
# Burada etiketleri belirlemeniz gerekebilir. Örneğin, etiketlerinizi belirlemek için bir etiketleme yapılmalı
# Aşağıdaki örnekte, sadece sıfırları kullanıyoruz; gerçek etiketleriniz olmalı.
labels = [0] * len(texts)  # Tüm etiketleri sıfır olarak belirledik, gerçek etiketlerinizi ekleyin

# Tokenize etme
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors='tf')

# TensorFlow veri kümesi oluşturma
dataset = tf.data.Dataset.from_tensor_slices((
    dict(encodings),
    labels
))

# Veriyi batch'lere ayırma
batch_size = 8
train_dataset = dataset.batch(batch_size)

# Öğrenme oranı zamanlayıcısı
learning_rate_schedule = PolynomialDecay(
    initial_learning_rate=2e-5,
    decay_steps=len(train_dataset) * 3,  # Toplam adım sayısı: epoch sayısı * adım başına batch sayısı
    end_learning_rate=0
)

# TensorFlow optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate_schedule)

# Modeli custom training loop ile eğitin
@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        outputs = model(**inputs, labels=labels, training=True)  # labels'ı model girişine ekleyin
        loss = outputs.loss
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Eğitim döngüsü
epochs = 3
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    for batch in train_dataset:
        batch_inputs, batch_labels = batch
        inputs = {'input_ids': batch_inputs['input_ids'], 'attention_mask': batch_inputs['attention_mask']}
        labels = batch_labels
        loss = train_step(inputs, labels)
        print(f'Loss: {loss.numpy()}')

# Modeli kaydedin
model.save_pretrained('sentiment_model')
tokenizer.save_pretrained('sentiment_model')


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Loss: [0.883518   0.4488181  0.86916566 0.93754613 0.65265006 0.6143278
 0.63691133 0.6938638 ]
Loss: [0.5620221  0.5071214  0.5181474  0.50915486 0.5474932  0.6759523
 0.45341557 0.67853785]
Loss: [0.33351701 0.39570975 0.5481652  0.40283045 0.47609505 0.47027645
 0.47110444 0.42053837]
Loss: [0.38871628 0.37787268 0.28589764 0.39707494 0.2945529  0.3910688
 0.3164054  0.3087364 ]
Loss: [0.23755452 0.29222232 0.24023171 0.26557368 0.21945299 0.29237744
 0.26795936 0.21334909]
Loss: [0.19019745 0.20617734 0.25497064 0.24153364 0.14291868 0.23285075
 0.18121363 0.15421982]
Loss: [0.14097339 0.13418524 0.13488798 0.13859029 0.15688276 0.1419569
 0.18504846 0.1044803 ]
Loss: [0.10264259 0.11587928 0.13240491 0.12473021 0.1235799  0.09002569
 0.09225693 0.08596903]
Loss: [0.08226153 0.08915723 0.07415994 0.06414566 0.05321603 0.08331864
 0.0594781  0.08491977]
Loss: [0.03316258 0.03821744 0.05687414 0.06886766 0.04402426 0.04547345
 0.04623658 0.03408737]
Loss: [0.06694419 0.0557

('sentiment_model\\tokenizer_config.json',
 'sentiment_model\\special_tokens_map.json',
 'sentiment_model\\vocab.txt',
 'sentiment_model\\added_tokens.json')

In [7]:
import pandas as pd
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer, TFBertForTokenClassification, create_optimizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Tokenizer ve modeli yükle
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')
model = TFBertForSequenceClassification.from_pretrained('dbmdz/bert-base-turkish-cased', num_labels=2)

# CSV dosyasını oku
df = pd.read_csv('DF.csv')

# Verileri temizleyin ve string formatına dönüştürün
df['yorum'] = df['yorum'].astype(str).fillna('')

# Örnek etiketleri belirleyin
# Gerçek etiketlerinizi ekleyin
# Pozitif: 1, Negatif: 0 olarak örneklendirildi
# Etiketleme işlemi gerekecektir
labels = [1 if 'iyi' in yorum else 0 for yorum in df['yorum']]  # Örnek bir etiketleme, kendi etiketleme yönteminizi kullanın

# Verileri ve etiketleri hazırlayın
texts = df['yorum'].tolist()

# Tokenize etme
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors='tf')

# TensorFlow veri kümesi oluşturma
dataset = tf.data.Dataset.from_tensor_slices((
    dict(encodings),
    labels
))

# Veriyi batch'lere ayırma
batch_size = 8
dataset = dataset.shuffle(1000).batch(batch_size)

# Eğitim ve doğrulama veri setlerini ayırma
train_size = int(0.8 * len(dataset))
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

# Optimizer ve loss fonksiyonunu ayarlama
num_train_steps = len(train_dataset) * 3
optimizer, lr_schedule = create_optimizer(init_lr=2e-5, num_train_steps=num_train_steps, num_warmup_steps=0)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Modeli compile etme
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# Modeli eğitme
model.fit(train_dataset, validation_data=val_dataset, epochs=3)

# Skorları ve sonuçları tahmin etme
predictions = model.predict(val_dataset)
pred_labels = tf.argmax(predictions.logits, axis=1).numpy()
true_labels = [label for _, label in val_dataset.unbatch()]

# Sınıflandırma raporu
print(classification_report(true_labels, pred_labels))

# Modeli kaydetme
model.save_pretrained('sentiment_model')
tokenizer.save_pretrained('sentiment_model')


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Epoch 2/3
Epoch 3/3
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       736
           1       0.13      0.11      0.12       118

    accuracy                           0.78       854
   macro avg       0.50      0.50      0.50       854
weighted avg       0.76      0.78      0.77       854



('sentiment_model\\tokenizer_config.json',
 'sentiment_model\\special_tokens_map.json',
 'sentiment_model\\vocab.txt',
 'sentiment_model\\added_tokens.json')

In [1]:
from transformers import TFBertForTokenClassification, BertTokenizer
import numpy as np

# Model ve tokenizer'ı yükleyin
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')
model = TFBertForTokenClassification.from_pretrained('dbmdz/bert-base-turkish-cased', num_labels=17)  # Etiket sayısını uygun şekilde ayarlayın

# Örnek cümle
sentence = "Fiber 100mb SuperOnline kullanıcısıyım yaklaşık 2 haftadır @Twitch @Kick_Turkey gibi canlı yayın platformlarında 360p yayın izlerken donmalar yaşıyoruz. Başka hiç bir operatörler bu sorunu yaşamazken ben parasını verip alamadığım hizmeti neden ödeyeyim"

# Tokenize etme
inputs = tokenizer(sentence, return_tensors='tf', truncation=True, padding=True)

# Tahmin yapma
outputs = model(**inputs).logits
predictions = tf.argmax(outputs, axis=2)

# Etiketler
labels = [model.config.id2label[id] for id in predictions[0].numpy()]

# Sonuçları gösterme
for token, label in zip(tokenizer.tokenize(sentence), labels):
    print(f"{token}: {label}")


  from .autonotebook import tqdm as notebook_tqdm






All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'tf' is not defined

In [3]:
import pandas as pd
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, AutoModelForSequenceClassification
import stanza
import joblib

# Yorumlarınızın olduğu DataFrame'i yükleyin
df = pd.read_csv('DF.csv')  # 'comments.csv' dosyanızın adı

# Stanza modelini indirin
stanza.download('tr')
nlp = stanza.Pipeline('tr')

# Varlık çıkarımı için model ve tokenizer
ner_tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
ner_model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-base-turkish-cased")
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)

# Duygu analizi için model ve tokenizer
sentiment_tokenizer = AutoTokenizer.from_pretrained("savasy/bert-base-turkish-sentiment-cased")
sentiment_model = AutoModelForSequenceClassification.from_pretrained("savasy/bert-base-turkish-sentiment-cased")
sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model, tokenizer=sentiment_tokenizer)

# Varlık çıkarımı ve duyarlılık analizi fonksiyonu
def analyze_comment(comment):
    doc = nlp(comment)
    entities = [ent.text for sent in doc.sentences for ent in sent.ents]
    
    ner_results = ner_pipeline(comment)
    unique_entities = list(set([result['word'] for result in ner_results]))
    
    sentiment_results = []
    for entity in unique_entities:
        sentiment = sentiment_pipeline(entity)
        sentiment_label = sentiment[0]['label']
        if sentiment_label == 'LABEL_2':  # Positive
            sentiment_results.append({"entity": entity, "sentiment": "olumlu"})
        elif sentiment_label == 'LABEL_0':  # Negative
            sentiment_results.append({"entity": entity, "sentiment": "olumsuz"})
        else:  # Neutral
            sentiment_results.append({"entity": entity, "sentiment": "nötr"})
    
    return {"entity_list": unique_entities, "results": sentiment_results}

# Her yorum için analiz yapın ve sonuçları bir listeye kaydedin
results = []
for comment in df['yorum']:
    results.append(analyze_comment(comment))

# Modeli kaydedin
joblib.dump(analyze_comment, 'entity_sentiment_model.pkl')


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 387kB [00:00, 6.79MB/s]                    
2024-07-29 13:47:24 INFO: Downloaded file to C:\Users\asus\stanza_resources\resources.json
2024-07-29 13:47:24 INFO: Downloading default packages for language: tr (Turkish) ...
2024-07-29 13:47:27 INFO: File exists: C:\Users\asus\stanza_resources\tr\default.zip
2024-07-29 13:47:52 INFO: Finished downloading models and saved to C:\Users\asus\stanza_resources
2024-07-29 13:47:52 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 387kB [00:00, 8.32MB/s]                    
2024-07-29 13:47:52 INFO: Downloaded file to C:\Users\asus\stanza_resources\resources.json
2024-07-29 13:47:55 INFO: Loading these mod

AssertionError: input should be either str, list or Document

In [None]:
import nltk
from transformers import pipeline
import stanza

# Stanza Türkçe modelini indir ve pipeline'ı oluştur
stanza.download('tr')
nlp = stanza.Pipeline('tr', processors='tokenize,mwt,pos,lemma,depparse,ner')

# Sentiment analizciyi yükle
sentiment_analyzer = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment", framework="tf")

# Örnek müşteri yorumları
comments = [
    "Fiber 100mb SuperOnline kullanıcısıyım yaklaşık 2 haftadır Twitch Kick_Turkey gibi canlı yayın platformlarında 360p yayın izlerken donmalar yaşıyoruz.  Başka hiç bir operatörler bu sorunu yaşamazken ben parasını verip alamadığım hizmeti neden ödeyeyim ? Turkcell"
]

def analyze_sentiment(sentence):
    result = sentiment_analyzer(sentence)[0]
    label = result['label']
    score = result['score']
    
    if label == '1 star' and score >= 0.5:
        return 'olumsuz'
    elif label == '2 stars' and score >= 0.5:
        return 'olumsuz'
    elif label == '4 stars' and score >= 0.5:
        return 'olumlu'
    elif label == '5 stars' and score >= 0.5:
        return 'olumlu'
    else:
        return 'nötr'

def extract_entities(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents]
    return entities

def process_comments(comments):
    results = []
    entity_list = set()
    for comment in comments:
        # Cümleleri ayırma
        sentences = nltk.sent_tokenize(comment, language='turkish')
        for sentence in sentences:
            sentiment = analyze_sentiment(sentence.strip())
            entities = extract_entities(sentence)
            entity_list.update(entities)
            if entities:
                for entity in entities:
                    results.append({"entity": entity, "sentiment": sentiment})
            else:
                results.append({"entity": "N/A", "sentiment": sentiment})  # Varlık belirtmiyoruz

    return list(entity_list), results

# Yorumları işleyip sonuçları alın
entity_list, results = process_comments(comments)

# Sonuçları istenilen formatta yazdır
output = {
    "entity_list": list(entity_list),
    "results": results
}

print(output)
