In [4]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification
import torch.nn.functional as F
from transformers import pipeline
import pandas as pd

# Mount Google Drive so the dataset paths under /content/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/MyDrive')

# Select the device: use CUDA when it is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Entity recognition model (token classification) and its tokenizer,
# both loaded from the same checkpoint; the model is moved to `device`.
ner_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english")
ner_model = AutoModelForTokenClassification.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english").to(device)

# Sentiment analysis model (sequence classification) and its tokenizer,
# both loaded from the same checkpoint; the model is moved to `device`.
sa_model = AutoModelForSequenceClassification.from_pretrained("savasy/bert-base-turkish-sentiment-cased").to(device)
sa_tokenizer = AutoTokenizer.from_pretrained("savasy/bert-base-turkish-sentiment-cased")

# Sentiment analysis pipeline built from the model/tokenizer above
# (device index 0 when CUDA is available, -1 otherwise).
# NOTE(review): `sa` is not referenced by the functions in this cell - confirm it is used elsewhere.
sa = pipeline("sentiment-analysis", tokenizer=sa_tokenizer, model=sa_model, device=0 if torch.cuda.is_available() else -1)

def analyze_sentiment(text):
    """Classify ``text`` with the sentiment model.

    Returns:
        A ``(label, confidence)`` tuple. ``label`` is "olumlu" when the
        highest-probability class index is 1 and "olumsuz" otherwise;
        ``confidence`` is the softmax probability of that class as a float.
    """
    encoded = sa_tokenizer(
        text, return_tensors="pt", truncation=True, padding=True
    )
    # The tokenized tensors are moved to the same device as the model.
    encoded = encoded.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        class_probs = torch.nn.functional.softmax(
            sa_model(**encoded).logits, dim=-1
        )

    top_prob, top_class = torch.max(class_probs, dim=-1)
    if top_class.item() == 1:
        label = "olumlu"
    else:
        label = "olumsuz"
    return label, top_prob.item()

def extract_entities(text):
    """Run the NER model on ``text`` and group the recognised entities by type.

    Consecutive tokens tagged ``B-<type>`` / ``I-<type>`` are joined into one
    entity string. A ``B-`` tag, a change of entity type, or a non-entity tag
    ends the entity that is currently being built. Words inside an entity are
    separated by single spaces, while sub-word pieces of one word are joined
    directly.

    Args:
        text: The string to analyse. It is tokenized with ``truncation=True``.

    Returns:
        A dict with the keys 'PER', 'ORG', 'LOC' and 'MISC', each mapping to
        a list of unique entity strings in first-seen order.
    """
    inputs = ner_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = ner_model(**inputs)
    # Take the single batch row explicitly; unlike squeeze() this keeps a
    # list even if the sequence happened to contain just one token.
    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
    tokens = ner_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())

    entities = {'PER': [], 'ORG': [], 'LOC': [], 'MISC': []}
    current_entity = ""
    current_type = ""

    def flush():
        """Store the entity built so far (if new and non-empty) and reset the state."""
        nonlocal current_entity, current_type
        name = current_entity.strip()
        # Skip empty names and avoid adding the same entity twice.
        if name and name not in entities[current_type]:
            entities[current_type].append(name)
        current_entity = ""
        current_type = ""

    for token, pred in zip(tokens, predictions):
        # Filter out special tokens and bare domain-suffix tokens.
        if token in ('</s>', '<pad>', '<s>', '.com', '.net', '.org'):
            continue

        label = ner_model.config.id2label[pred]

        if label.startswith(('B-', 'I-')):
            entity_type = label.split('-', 1)[1]
            # A B- tag or a different entity type starts a new entity, so the
            # previous one must be stored first instead of being merged into
            # (and relabelled by) the new one.
            if current_entity and (label.startswith('B-') or entity_type != current_type):
                flush()
            # The tokenizer prefixes the first piece of each word with "▁".
            # Turning it into a space keeps word boundaries ("New York"
            # rather than "NewYork"); continuation pieces carry no marker
            # and are therefore glued to the preceding piece.
            current_entity += token.replace('▁', ' ')
            current_type = entity_type
        elif current_entity:
            flush()

    if current_entity:
        flush()

    # Re-join an entity with a following bare "com" / "net" / "org" entry of
    # the same type into "<entity>.<suffix>".
    for key, names in entities.items():
        merged_entities = []
        i = 0
        while i < len(names):
            if i + 1 < len(names) and names[i + 1] in ('com', 'net', 'org'):
                merged_entities.append(names[i] + '.' + names[i + 1])
                i += 2
            else:
                merged_entities.append(names[i])
                i += 1
        # dict.fromkeys removes duplicates like set() but keeps a stable,
        # first-seen order so repeated runs produce identical output.
        entities[key] = list(dict.fromkeys(merged_entities))

    return entities


def get_window(text, entity, window_size=50):
    """Return the words surrounding the first occurrence of ``entity`` in ``text``.

    Both strings are split on whitespace and ``entity`` is matched as a
    contiguous run of whole words. The window spans up to ``window_size``
    words on each side of the entity's last word, joined by single spaces.
    When the entity is not found, ``text`` is returned unchanged.
    """
    tokens = text.split()
    needle = entity.split()
    span = len(needle)

    # Position of the first place where the entity's words line up, if any.
    match_start = next(
        (
            pos
            for pos in range(len(tokens) - span + 1)
            if tokens[pos:pos + span] == needle
        ),
        None,
    )
    if match_start is None:
        return text

    # The window is anchored on the last word of the matched entity.
    anchor = match_start + span - 1
    lower = max(0, anchor - window_size)
    upper = min(len(tokens), anchor + window_size + 1)
    return ' '.join(tokens[lower:upper])

def analyze_text_method1(text, actual_label):
    """Label every entity in ``text`` using the whole-text sentiment score.

    Each entity's "sentiment" is derived from ``actual_label`` ("olumlu" when
    it equals 'olumlu', otherwise "olumsuz"); the sentiment model contributes
    only the "score", which is its confidence for the full text.

    Returns:
        A dict with "entity_list" (the output of ``extract_entities``) and
        "results" (one dict per entity with "entity", "type", "sentiment"
        and "score").
    """
    entities = extract_entities(text)
    # Only the confidence of the whole-text prediction is used below.
    _, overall_score = analyze_sentiment(text)

    # The label-derived sentiment is identical for every entity.
    if actual_label == 'olumlu':
        label_sentiment = "olumlu"
    else:
        label_sentiment = "olumsuz"

    results = [
        {
            "entity": entity,
            "type": entity_type,
            "sentiment": label_sentiment,
            "score": overall_score,
        }
        for entity_type, entity_list in entities.items()
        for entity in entity_list
    ]

    return {"entity_list": entities, "results": results}

def analyze_text_method2(text, actual_label):
    """Label every entity in ``text`` using a score from its surrounding words.

    For each entity the sentiment model is run on the window returned by
    ``get_window`` and its confidence becomes the "score". The "sentiment"
    itself is derived from ``actual_label`` ("olumlu" when it equals
    'olumlu', otherwise "olumsuz").

    Returns:
        A dict with "entity_list" (the output of ``extract_entities``) and
        "results" (one dict per entity with "entity", "type", "sentiment"
        and "score").
    """
    entities = extract_entities(text)

    # The label-derived sentiment is identical for every entity.
    if actual_label == 'olumlu':
        label_sentiment = "olumlu"
    else:
        label_sentiment = "olumsuz"

    results = []
    for entity_type, entity_list in entities.items():
        for entity in entity_list:
            # Only the confidence of the window prediction is kept.
            _, window_score = analyze_sentiment(get_window(text, entity))
            results.append({
                "entity": entity,
                "type": entity_type,
                "sentiment": label_sentiment,
                "score": window_score,
            })

    return {"entity_list": entities, "results": results}

def analyze_text_method3(text, actual_label):
    """Label every entity in ``text`` with a sentence/whole-text blended score.

    For each entity, the first '.'-separated sentence containing it (or the
    whole text when none does) is scored by the sentiment model. The reported
    "score" is ``0.7 * sentence_score + 0.3 * overall_score``. The
    "sentiment" is derived from ``actual_label`` ("olumlu" when it equals
    'olumlu', otherwise "olumsuz").

    Returns:
        A dict with "entity_list" (the output of ``extract_entities``) and
        "results" (one dict per entity with "entity", "type", "sentiment",
        "score", "sentence_sentiment" and "overall_sentiment").
    """
    entities = extract_entities(text)
    overall_sentiment, overall_score = analyze_sentiment(text)

    # Both of these are independent of the entity, so compute them once
    # instead of once per entity.
    predicted_sentiment = "olumlu" if actual_label == 'olumlu' else "olumsuz"
    sentences = text.split('.')

    results = []
    for entity_type, entity_list in entities.items():
        for entity in entity_list:
            entity_sentence = next((s for s in sentences if entity in s), text)
            sentence_sentiment, sentence_score = analyze_sentiment(entity_sentence)

            # NOTE(review): both scores are the confidence of whichever class
            # analyze_sentiment picked, so this blend ignores whether the two
            # predictions agree in polarity - confirm this is intended.
            weighted_score = 0.7 * sentence_score + 0.3 * overall_score

            results.append({
                "entity": entity,
                "type": entity_type,
                "sentiment": predicted_sentiment,
                "score": weighted_score,
                "sentence_sentiment": sentence_sentiment,
                "overall_sentiment": overall_sentiment
            })

    return {
        "entity_list": entities,
        "results": results
    }


def process_csv(input_csv, output_csv):
    """Analyse every row of ``input_csv`` and write the results to ``output_csv``.

    The input must provide a 'text' and a 'label' column. Each text is
    lower-cased and passed to ``analyze_text_method1``. Rows whose text is
    missing, or in which no entity is found, are skipped.

    The output always has the columns "input_text", "entity_list" and
    "results", even when no row qualifies. The last two hold the ``str()``
    form of a Python list.

    Args:
        input_csv: Path of the CSV file to read.
        output_csv: Path of the CSV file to write (without the index column).
    """
    output_columns = ["input_text", "entity_list", "results"]
    df = pd.read_csv(input_csv)

    results = []

    for _, row in df.iterrows():
        raw_text = row['text']
        # Empty cells are read back as NaN (a float), which has no .lower();
        # there is nothing to analyse in such a row, so skip it.
        if not isinstance(raw_text, str):
            continue

        # NOTE(review): str.lower() is not Turkish-aware ('I' becomes 'i',
        # not 'ı') and the sentiment checkpoint name ends in "-cased" -
        # confirm that lower-casing the input is intended.
        text = raw_text.lower()
        actual_label = row['label']

        result1 = analyze_text_method1(text, actual_label)

        entity_groups = result1['entity_list'].values()
        if not any(entity_groups):
            continue

        # Format the entity list and the per-entity results as strings.
        entity_list_str = str([entity for entities in entity_groups for entity in entities])
        results_str = str([{"entity": result['entity'], "sentiment": result['sentiment']} for result in result1['results']])

        results.append({
            "input_text": text,
            "entity_list": entity_list_str,
            "results": results_str
        })

    # Naming the columns explicitly keeps the header in the output file even
    # when `results` is empty.
    results_df = pd.DataFrame(results, columns=output_columns)
    results_df.to_csv(output_csv, index=False)

# Script entry point: run the CSV pipeline on one dataset stored on the mounted Drive.
if __name__ == "__main__":
    # Base name of the dataset file, without the ".csv" extension.
    dosya = "veri400klabelkisa"
    # Input file and the "<name>_results.csv" output written next to it.
    input_csv = f'/content/MyDrive/MyDrive/dilsileme/{dosya}.csv'
    output_csv = f'/content/MyDrive/MyDrive/dilsileme/{dosya}_results.csv'
    process_csv(input_csv, output_csv)



Drive already mounted at /content/MyDrive; to attempt to forcibly remount, call drive.mount("/content/MyDrive", force_remount=True).


Some weights of the model checkpoint at FacebookAI/xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
