In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import ast # Metin verilerini uygun formata dönüştürmek için

# Load the two preprocessed corpora (lemmatized and stemmed) from disk.
# NOTE: these are absolute, machine-specific paths; they must exist on the machine running the notebook.
df_lemmatized = pd.read_csv("C:/Users/yunus/Desktop/emine_proje/data/preprocessed_data_lemmatized_only.csv")
df_stemmed = pd.read_csv("C:/Users/yunus/Desktop/emine_proje/data/preprocessed_data_stemmed_only.csv")


def _flatten_sentences(sentences):
    """Join a document given as a list of token lists into one space-separated string."""
    return ' '.join(' '.join(tokens) for tokens in sentences)


# The token columns are read back as strings holding a Python literal; parse them into real lists.
df_lemmatized['description_processed'] = df_lemmatized['description_processed'].apply(ast.literal_eval)
df_stemmed['description_stemmed'] = df_stemmed['description_stemmed'].apply(ast.literal_eval)

# TF-IDF vectorization needs one plain string per document, so flatten the nested token lists.
df_lemmatized['description_processed_flat'] = df_lemmatized['description_processed'].apply(_flatten_sentences)
df_stemmed['description_stemmed_flat'] = df_stemmed['description_stemmed'].apply(_flatten_sentences)

print("Metin veri setleri başarıyla yüklendi ve işlendi.")

# Show the first two rows of each corpus: raw description next to its flattened form.
print("Lemmatize edilmiş veri örneği (ilk 2 satır):")
print(df_lemmatized[['description', 'description_processed_flat']].head(2))
print("\nStemlenmiş veri örneği (ilk 2 satır):")
print(df_stemmed[['description', 'description_stemmed_flat']].head(2))

Metin veri setleri başarıyla yüklendi ve işlendi.
Lemmatize edilmiş veri örneği (ilk 2 satır):
                                         description  \
0  "I wrote it. I didn't fail. It was straight,"�...   
1  "It's the riff heard round the world," says St...   

                          description_processed_flat  
0  i wrote it i didnt fail it wa straightbob dyla...  
1  it the riff heard round the world say steve va...  

Stemlenmiş veri örneği (ilk 2 satır):
                                         description  \
0  "I wrote it. I didn't fail. It was straight,"�...   
1  "It's the riff heard round the world," says St...   

                            description_stemmed_flat  
0  wrote didnt fail straightbob dylansaid greates...  
1  riff heard round world say steve van zandt gui...  


In [2]:
print("\n--- TF-IDF Benzerlik Hesaplamaları Başlatılıyor ---")

# Fit a separate TF-IDF vectorizer on each corpus variant (lemmatized / stemmed),
# using the flattened one-string-per-document column as input.
tfidf_vectorizer_lemma = TfidfVectorizer()
tfidf_matrix_lemma = tfidf_vectorizer_lemma.fit_transform(df_lemmatized['description_processed_flat'])

tfidf_vectorizer_stem = TfidfVectorizer()
tfidf_matrix_stem = tfidf_vectorizer_stem.fit_transform(df_stemmed['description_stemmed_flat'])

# Pick one document of the dataset as the query text for all comparisons (here: the first one).
# The index is positional: it is used with .iloc below.
input_text_index = 0
input_original_text = df_lemmatized['description'].iloc[input_text_index] # original text

print(f"\nSeçilen Giriş Metni (Index {input_text_index}):\n\"{input_original_text}\"")

def calculate_top_similar_tfidf(input_vector, tfidf_matrix, df_source, top_n=5, exclude_idx=None):
    """
    Return the ``top_n`` documents most similar to a query, ranked by cosine similarity.

    Args:
        input_vector: single-row TF-IDF vector of the query document.
        tfidf_matrix: TF-IDF matrix of the corpus; row positions are used to
            look up ``df_source`` positionally.
        df_source: DataFrame providing the index labels and the 'description' column.
        top_n: maximum number of results to return.
        exclude_idx: optional row position to drop from the results
            (typically the query document itself).

    Returns:
        A list of dicts with keys 'document_id', 'content' and 'similarity_score',
        ordered from most to least similar.
    """
    scores = cosine_similarity(input_vector, tfidf_matrix).flatten()
    ascending_order = scores.argsort()

    if exclude_idx is None:
        # Walk the ascending order backwards to get the top_n highest scores.
        ranked_positions = ascending_order[:-top_n-1:-1]
    else:
        # Fetch one extra candidate so that dropping the excluded row still leaves top_n results.
        candidates = ascending_order[:-top_n-2:-1]
        ranked_positions = [pos for pos in candidates if pos != exclude_idx][:top_n]

    return [
        {
            'document_id': df_source.index[pos],
            'content': df_source['description'].iloc[pos],
            'similarity_score': scores[pos],
        }
        for pos in ranked_positions
    ]

def _print_ranked_documents(ranked_documents):
    """Print one numbered line (document id, score, text) for every result dict."""
    for rank, doc in enumerate(ranked_documents, start=1):
        print(f" {rank}. Doküman ID: {doc['document_id']}, Skor: {doc['similarity_score']:.4f}, Metin: {doc['content']}")


# Lemmatized corpus: slice the query row (kept 2-D) and rank every other document against it.
input_vector_lemma = tfidf_matrix_lemma[input_text_index:input_text_index+1]
top_similar_tfidf_lemma = calculate_top_similar_tfidf(
    input_vector_lemma,
    tfidf_matrix_lemma,
    df_lemmatized,
    exclude_idx=input_text_index,
)

print("\n--- TF-IDF Lemmatize Edilmiş Sonuçlar ---")
_print_ranked_documents(top_similar_tfidf_lemma)

# Stemmed corpus: same procedure with the stemmed TF-IDF matrix.
input_vector_stem = tfidf_matrix_stem[input_text_index:input_text_index+1]
top_similar_tfidf_stem = calculate_top_similar_tfidf(
    input_vector_stem,
    tfidf_matrix_stem,
    df_stemmed,
    exclude_idx=input_text_index,
)

print("\n--- TF-IDF Stemlenmiş Sonuçlar ---")
_print_ranked_documents(top_similar_tfidf_stem)


--- TF-IDF Benzerlik Hesaplamaları Başlatılıyor ---

Seçilen Giriş Metni (Index 0):
""I wrote it. I didn't fail. It was straight,"�Bob Dylan�said of his greatest song shortly after he recorded it in June 1965. There is no better description of "Like a Rolling Stone" � of its revolutionary design and execution � or of the young man, just turned 24, who created it."

--- TF-IDF Lemmatize Edilmiş Sonuçlar ---
 1. Doküman ID: 234, Skor: 0.1581, Metin: "Don't feel like Satan/But I am to them," Young spat in this raucously ambivalent song about the pride and guilt of being an American. It was inspired by a remark from a member of Crazy Horse, who said gigs were safer in Europe than in the Middle East: "It's better to keep rockin' in the free world." "It was such a cliché," Young said. "I knew I had to use it."
 2. Doküman ID: 28, Skor: 0.1551, Metin: "Most people think it's just a fast rock & roll song," Lennon said. "Subconsciously, I was crying out for help. I didn't realize it at the tim

In [3]:
print("\n--- Word2Vec Benzerlik Hesaplamaları Başlatılıyor ---")

def load_word2vec_model(model_path):
    """Load a saved Word2Vec model from ``model_path``.

    Loading is best-effort: any exception raised by ``Word2Vec.load`` is
    reported on stdout and swallowed.

    Returns:
        The loaded model, or None when loading failed.
    """
    try:
        return Word2Vec.load(model_path)
    except Exception as e:
        print(f"Model yüklenirken hata oluştu: {model_path} - {e}")
    return None

def get_document_vector(document_sentences, model):
    """Compute the mean Word2Vec vector of a document.

    Args:
        document_sentences: the document as an iterable of sentences, each
            sentence being an iterable of tokens.
        model: object exposing ``model.wv``, which is queried with
            ``token in model.wv`` and ``model.wv[token]``.

    Returns:
        The element-wise mean of the vectors of all in-vocabulary tokens, or
        None when no token of the document is in the vocabulary.
        Out-of-vocabulary tokens are skipped.
    """
    known_vectors = [
        model.wv[token]
        for tokens in document_sentences
        for token in tokens
        if token in model.wv
    ]
    return np.mean(known_vectors, axis=0) if known_vectors else None

def calculate_top_similar_word2vec(input_document_vector, corpus_vectors, df_source, top_n=5, exclude_idx=None):
    """
    Rank corpus documents by cosine similarity to an input document vector.

    Args:
        input_document_vector: 1-D numpy vector of the query document, or None
            when no vector could be built for it.
        corpus_vectors: iterable of per-document numpy vectors; entry ``i`` is
            matched positionally with row ``i`` of ``df_source``. ``None``
            entries (documents without a vector) are skipped.
        df_source: DataFrame providing the index labels and the 'description' column.
        top_n: maximum number of results. ``None`` returns every scored
            document; zero or a negative value returns an empty list.
        exclude_idx: optional position to leave out (typically the query itself).

    Returns:
        A list of dicts with keys 'document_id', 'content' and
        'similarity_score', ordered from most to least similar. Documents with
        equal scores keep their corpus order.
    """
    if input_document_vector is None:
        return []
    # Previously top_n <= 0 never hit the "enough results" break and returned the whole corpus.
    if top_n is not None and top_n <= 0:
        return []

    # Loop-invariant: the query only needs to be reshaped to a single row once.
    query_row = input_document_vector.reshape(1, -1)

    scored = []
    for idx, doc_vector in enumerate(corpus_vectors):
        # Skip unusable documents outright instead of tagging them with a -1 sentinel:
        # -1 is a legitimate cosine similarity (exactly opposite vectors) and such
        # documents used to be silently dropped together with the vector-less ones.
        if doc_vector is None or idx == exclude_idx:
            continue
        score = cosine_similarity(query_row, doc_vector.reshape(1, -1))[0][0]
        scored.append((idx, score))

    # list.sort is stable, so ties stay in corpus order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return [
        {
            'document_id': df_source.index[idx],
            'content': df_source['description'].iloc[idx],
            'similarity_score': score,
        }
        for idx, score in scored[:top_n]
    ]

# Word2Vec hyper-parameter grid. Every combination corresponds to one saved model file.
# The iteration order (vector_size, then window, then model_type) fixes the insertion
# order of all_model_similarity_results and therefore the row/column order of any
# table built from it later.
model_parameters = [
    {'model_type': model_type, 'window': window, 'vector_size': vector_size}
    for vector_size in (100, 300)
    for window in (2, 4)
    for model_type in ('cbow', 'skipgram')
]

# Directory holding the saved "<prefix>_model_<type>_vs<size>_w<window>.model" files.
# NOTE: absolute, machine-specific path; change it here (single place) when running elsewhere.
WORD2VEC_MODEL_DIR = "C:/Users/yunus/Desktop/emine_proje/data"

# Maps model name -> list of result dicts for the selected query document.
all_model_similarity_results = {}

# One entry per corpus variant: (model-file prefix, label used in the printed header,
# source DataFrame, column holding the tokenized sentences).
# Lemmatized models are processed first, then stemmed ones.
corpus_variants = [
    ("lemmatized", "Lemmatize Edilmiş", df_lemmatized, 'description_processed'),
    ("stemmed", "Stemlenmiş", df_stemmed, 'description_stemmed'),
]

for model_prefix, variant_label, df_variant, token_column in corpus_variants:
    # The query document's tokens do not depend on the model, so read them once per variant.
    input_document_tokens = df_variant[token_column].iloc[input_text_index]

    for param in model_parameters:
        model_name = f"{model_prefix}_model_{param['model_type']}_vs{param['vector_size']}_w{param['window']}"
        model_path = f"{WORD2VEC_MODEL_DIR}/{model_name}.model"
        current_model = load_word2vec_model(model_path)

        # load_word2vec_model signals failure with None (and has already printed the error),
        # so test for None explicitly rather than relying on the model object's truthiness.
        if current_model is None:
            continue

        input_document_vector = get_document_vector(input_document_tokens, current_model)
        if input_document_vector is None:
            print(f"\n{model_name}: Giriş belgesinde modelin kelime haznesinde kelime bulunamadı. Benzerlik hesaplaması atlandı.")
            continue

        # One mean vector per corpus document (None where no token is in the vocabulary).
        corpus_vectors = [get_document_vector(doc_tokens, current_model) for doc_tokens in df_variant[token_column]]
        top_similar_docs = calculate_top_similar_word2vec(
            input_document_vector, corpus_vectors, df_variant, exclude_idx=input_text_index
        )
        all_model_similarity_results[model_name] = top_similar_docs

        print(f"\n--- Word2Vec {variant_label} Model: {model_name} Sonuçları ---")
        for i, text_info in enumerate(top_similar_docs):
            print(f" {i+1}. Doküman ID: {text_info['document_id']}, Skor: {text_info['similarity_score']:.4f}, Metin: {text_info['content']}")


--- Word2Vec Benzerlik Hesaplamaları Başlatılıyor ---

--- Word2Vec Lemmatize Edilmiş Model: lemmatized_model_cbow_vs100_w2 Sonuçları ---
 1. Doküman ID: 234, Skor: 1.0000, Metin: "Don't feel like Satan/But I am to them," Young spat in this raucously ambivalent song about the pride and guilt of being an American. It was inspired by a remark from a member of Crazy Horse, who said gigs were safer in Europe than in the Middle East: "It's better to keep rockin' in the free world." "It was such a cliché," Young said. "I knew I had to use it."
 2. Doküman ID: 254, Skor: 1.0000, Metin: "Stan" was Eminem’s scariest song, because for once the horror seemed real. Anchored by a sample from Dido’s "Thank You" (which became a hit itself), it followed an obsessed fan who acts out Em's fantasies. "He’s crazy for real, and he thinks I’m crazy, but I try to help him at the end of the song," said Eminem. "It kinda shows the real side of me."
 3. Doküman ID: 57, Skor: 1.0000, Metin: "The music that real

In [9]:
# --- Jaccard Benzerlik Hesaplamaları ---

def jaccard_similarity(list_a, list_b):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Both inputs are converted to sets first, so duplicates and ordering are
    ignored. When both inputs are empty the result is 0.0.
    """
    items_a, items_b = set(list_a), set(list_b)
    combined = items_a | items_b
    if not combined:
        return 0.0
    return len(items_a & items_b) / len(combined)

def _extract_document_ids(results):
    """Normalize one model's results to a plain list of document ids.

    Word2Vec results are stored as a list of dicts (with a 'document_id' key),
    TF-IDF results as a list of ids; both shapes are accepted.
    """
    if results and isinstance(results[0], dict):
        return [item['document_id'] for item in results]
    return list(results)


# Register the TF-IDF rankings next to the Word2Vec ones (document ids only).
# Plain assignment is already idempotent - a dict key cannot be duplicated - and, unlike
# an "only if missing" guard, it refreshes the entries when the TF-IDF cell has been
# re-run with a different query document.
all_model_similarity_results["TF-IDF Lemmatized"] = [d['document_id'] for d in top_similar_tfidf_lemma]
all_model_similarity_results["TF-IDF Stemmed"] = [d['document_id'] for d in top_similar_tfidf_stem]

# Names of all models being compared, in insertion order.
model_names_for_jaccard = list(all_model_similarity_results.keys())

# Extract each model's id list once instead of once per matrix cell.
document_ids_by_model = {
    name: _extract_document_ids(all_model_similarity_results[name])
    for name in model_names_for_jaccard
}

# Pairwise Jaccard similarity between the result sets of all models.
jaccard_matrix = pd.DataFrame(0.0, index=model_names_for_jaccard, columns=model_names_for_jaccard)

for model1_name in model_names_for_jaccard:
    for model2_name in model_names_for_jaccard:
        jaccard_matrix.loc[model1_name, model2_name] = jaccard_similarity(
            document_ids_by_model[model1_name],
            document_ids_by_model[model2_name],
        )

# Report the real matrix size: it is smaller than 18x18 whenever a model failed to load.
matrix_size = len(model_names_for_jaccard)
print(f"\n--- Jaccard Benzerlik Matrisi ({matrix_size}x{matrix_size}) ---")
print(jaccard_matrix)


--- Jaccard Benzerlik Matrisi (18x18) ---
                                    lemmatized_model_cbow_vs100_w2  \
lemmatized_model_cbow_vs100_w2                            1.000000   
lemmatized_model_skipgram_vs100_w2                        0.666667   
lemmatized_model_cbow_vs100_w4                            0.666667   
lemmatized_model_skipgram_vs100_w4                        0.111111   
lemmatized_model_cbow_vs300_w2                            0.428571   
lemmatized_model_skipgram_vs300_w2                        0.250000   
lemmatized_model_cbow_vs300_w4                            0.428571   
lemmatized_model_skipgram_vs300_w4                        0.428571   
stemmed_model_cbow_vs100_w2                               0.250000   
stemmed_model_skipgram_vs100_w2                           0.111111   
stemmed_model_cbow_vs100_w4                               0.250000   
stemmed_model_skipgram_vs100_w4                           0.000000   
stemmed_model_cbow_vs300_w2                    