In [3]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from scipy.sparse import load_npz
from itertools import combinations
import pickle

# NLTK verilerini indir
nltk.download('punkt')
nltk.download('stopwords')

# Tokenizasyon fonksiyonu
def proper_tokenize(text):
    text = re.sub(r'[^a-zA-ZğüşıöçĞÜŞİÖÇ\s]', '', text.lower())
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words and len(word) > 1]

# Veri setlerini yükle
df_lemmatized = pd.read_csv("C:/Users/eren/Desktop/sıkıldım1/data/lemmatized_sentences.csv")
df_stemmed = pd.read_csv("C:/Users/eren/Desktop/sıkıldım1/data/stemmed_sentences.csv")

# Sütun adlarını düzelt ve temizle
df_lemmatized.columns = ["document_id", "content"]
df_lemmatized = df_lemmatized.dropna()
df_lemmatized = df_lemmatized[df_lemmatized["content"].str.strip() != ""]
df_lemmatized['tokens'] = df_lemmatized['content'].apply(proper_tokenize)

df_stemmed.columns = ["document_id", "content"]
df_stemmed = df_stemmed.dropna()
df_stemmed = df_stemmed[df_stemmed["content"].str.strip() != ""]
df_stemmed['tokens'] = df_stemmed['content'].apply(proper_tokenize)

# Giriş metnini seç (veri setinden rastgele bir yorum)
input_text = df_lemmatized['content'].iloc[0]  # Ör. "This game is so fun and engaging for solo players"
input_tokens = proper_tokenize(input_text)

# TF-IDF modellerini ve vektörlerini yükle
tfidf_lemmatized_matrix = load_npz("tfidf_lemmatized.npz")  # TF-IDF vektör matrisi
tfidf_stemmed_matrix = load_npz("tfidf_stemmed.npz")
with open("tfidf_lemmatized_model.pkl", "rb") as f:
    tfidf_lemmatized = pickle.load(f)
with open("tfidf_stemmed_model.pkl", "rb") as f:
    tfidf_stemmed = pickle.load(f)

# Word2Vec modellerini yükle
word2vec_models = {
    "lemmatized_cbow_vs100_w2": Word2Vec.load("lemmatized_model_cbow_vs100_w2.model"),
    "lemmatized_cbow_vs100_w4": Word2Vec.load("lemmatized_model_cbow_vs100_w4.model"),
    "lemmatized_cbow_vs300_w2": Word2Vec.load("lemmatized_model_cbow_vs300_w2.model"),
    "lemmatized_cbow_vs300_w4": Word2Vec.load("lemmatized_model_cbow_vs300_w4.model"),
    "lemmatized_skipgram_vs100_w2": Word2Vec.load("lemmatized_model_skipgram_vs100_w2.model"),
    "lemmatized_skipgram_vs100_w4": Word2Vec.load("lemmatized_model_skipgram_vs100_w4.model"),
    "lemmatized_skipgram_vs300_w2": Word2Vec.load("lemmatized_model_skipgram_vs300_w2.model"),
    "lemmatized_skipgram_vs300_w4": Word2Vec.load("lemmatized_model_skipgram_vs300_w4.model"),
    "stemmed_cbow_vs100_w2": Word2Vec.load("stemmed_model_cbow_vs100_w2.model"),
    "stemmed_cbow_vs100_w4": Word2Vec.load("stemmed_model_cbow_vs100_w4.model"),
    "stemmed_cbow_vs300_w2": Word2Vec.load("stemmed_model_cbow_vs300_w2.model"),
    "stemmed_cbow_vs300_w4": Word2Vec.load("stemmed_model_cbow_vs300_w4.model"),
    "stemmed_skipgram_vs100_w2": Word2Vec.load("stemmed_model_skipgram_vs100_w2.model"),
    "stemmed_skipgram_vs100_w4": Word2Vec.load("stemmed_model_skipgram_vs100_w4.model"),
    "stemmed_skipgram_vs300_w2": Word2Vec.load("stemmed_model_skipgram_vs300_w2.model"),
    "stemmed_skipgram_vs300_w4": Word2Vec.load("stemmed_model_skipgram_vs300_w4.model")
}

# Cümle vektörü oluştur
def get_sentence_vector(sentence, model):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# TF-IDF için giriş metni vektörü
def get_tfidf_vector(text, tfidf_model, tfidf_vectorizer):
    return tfidf_vectorizer.transform([text]).toarray()

# Benzerlik hesaplama fonksiyonu
def find_top_5_similar(model_name, input_vector, vectors, df, is_tfidf=False):
    if is_tfidf:
        similarities = cosine_similarity(input_vector, vectors)[0]
    else:
        similarities = cosine_similarity([input_vector], vectors)[0]
    top_5_indices = np.argsort(similarities)[-5:][::-1]
    results = []
    for idx in top_5_indices:
        results.append({
            "document_id": df['document_id'].iloc[idx],
            "content": df['content'].iloc[idx],
            "similarity_score": similarities[idx]
        })
    return results

# Tüm modeller için sonuçları topla
all_results = {}

# TF-IDF Benzerlik
input_tfidf_lemmatized = get_tfidf_vector(input_text, tfidf_lemmatized, tfidf_lemmatized)
all_results["tfidf_lemmatized"] = find_top_5_similar("tfidf_lemmatized", input_tfidf_lemmatized, tfidf_lemmatized_matrix, df_lemmatized, is_tfidf=True)

input_tfidf_stemmed = get_tfidf_vector(input_text, tfidf_stemmed, tfidf_stemmed)
all_results["tfidf_stemmed"] = find_top_5_similar("tfidf_stemmed", input_tfidf_stemmed, tfidf_stemmed_matrix, df_stemmed, is_tfidf=True)

# Word2Vec Benzerlik
for model_name, model in word2vec_models.items():
    # Veri setine göre doğru df seç
    df = df_lemmatized if "lemmatized" in model_name else df_stemmed
    # Cümle vektörlerini hazırla
    sentence_vectors = [get_sentence_vector(tokens, model) for tokens in df['tokens']]
    # Giriş metni vektörü
    input_vector = get_sentence_vector(input_tokens, model)
    all_results[model_name] = find_top_5_similar(model_name, input_vector, sentence_vectors, df)

# Anlamsal Değerlendirme (Örnek puanlar, senin incelemen gerekecek)
semantic_scores = {}
for model_name, results in all_results.items():
    # Örnek puanlar (1-5), gerçekte senin değerlendirmen lazım
    scores = [4, 3, 3, 2, 1]  # Placeholder, her metni okuyup puanla
    semantic_scores[model_name] = {
        "documents": [r["document_id"] for r in results],
        "scores": scores,
        "average_score": np.mean(scores)
    }

# Jaccard Benzerliği
def jaccard_similarity(set1, set2):
    set1 = set(set1)
    set2 = set(set2)
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union > 0 else 0

model_names = list(all_results.keys())
jaccard_matrix = np.zeros((len(model_names), len(model_names)))
for i, j in combinations(range(len(model_names)), 2):
    docs_i = [r["document_id"] for r in all_results[model_names[i]]]
    docs_j = [r["document_id"] for r in all_results[model_names[j]]]
    jaccard_score = jaccard_similarity(docs_i, docs_j)
    jaccard_matrix[i, j] = jaccard_score
    jaccard_matrix[j, i] = jaccard_score

# Köşegenleri 1 yap
np.fill_diagonal(jaccard_matrix, 1)

# Sonuçları yazdır
print(f"Giriş Metni: {input_text}\n")
for model_name, results in all_results.items():
    print(f"Model: {model_name}")
    print("En Benzer 5 Metin:")
    for r in results:
        print(f"ID: {r['document_id']}, Skor: {r['similarity_score']:.4f}, Metin: {r['content']}")
    print(f"Anlamsal Skorlar: {semantic_scores[model_name]['scores']}, Ortalama: {semantic_scores[model_name]['average_score']:.2f}\n")

print("Jaccard Benzerlik Matrisi:")
print(pd.DataFrame(jaccard_matrix, index=model_names, columns=model_names).round(2))

# Sonuçları CSV'ye kaydet
results_df = []
for model_name, results in all_results.items():
    for r in results:
        results_df.append({
            "Model": model_name,
            "Document_ID": r["document_id"],
            "Content": r["content"],
            "Similarity_Score": r["similarity_score"],
            "Semantic_Score": semantic_scores[model_name]["scores"][results.index(r)]
        })
pd.DataFrame(results_df).to_csv("similarity_results.csv", index=False)
pd.DataFrame(jaccard_matrix, index=model_names, columns=model_names).to_csv("jaccard_matrix.csv")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eren\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eren\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: Length mismatch: Expected axis has 1 elements, new values have 2 elements