In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import Counter
import re

# Download the NLTK resources used below (only needs to run once per machine)
for nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Load the review data set
data = pd.read_csv(
    'C:/Users/eren/Desktop/sƒ±kƒ±ldƒ±m1/data/veri_5k.csv'
)

# Text preprocessing function
# Compiled once: matches every character that is not a lowercase ASCII letter
# or whitespace (digits, punctuation, non-ASCII symbols).
_NON_LETTER_PATTERN = re.compile(r'[^a-z\s]')

# Lazily-filled cache so the stop-word set and lemmatizer are built once
# instead of on every preprocess_text call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return (
        _PREPROCESS_RESOURCES['stop_words'],
        _PREPROCESS_RESOURCES['lemmatizer'],
    )


def preprocess_text(text):
    """Normalize a review into a space-joined string of lemmatized tokens.

    Steps: lowercase, drop everything except ASCII letters and whitespace,
    tokenize, remove English stop words, lemmatize each remaining token.

    Args:
        text: The raw review. Missing values (``None``/NaN, as pandas produces
            for empty CSV cells) yield an empty string; other non-string
            values are converted with ``str`` first.

    Returns:
        The processed tokens joined by single spaces, or ``''`` when nothing
        is left after cleaning.
    """
    if not isinstance(text, str):
        # A missing review must not crash the whole .apply() pass.
        if pd.isna(text):
            return ''
        text = str(text)

    # Lowercase, then strip special characters and digits.
    text = _NON_LETTER_PATTERN.sub('', text.lower())
    if not text.strip():
        # Nothing to tokenize; same result the tokenizer path would give.
        return ''

    stop_words, lemmatizer = _get_preprocess_resources()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )

# Run every review through the preprocessing pipeline
data['processed_review'] = data['review'].map(preprocess_text)

# Turn the cleaned reviews into TF-IDF vectors
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['processed_review'])

# Pairwise cosine similarity between all reviews
cosine_sim = cosine_similarity(tfidf_matrix)

# Similarity threshold: two reviews are grouped when their cosine similarity
# is strictly greater than this value.
threshold = 0.8

# Build the groups greedily: each not-yet-assigned review seeds a group and
# pulls in every later review that is similar enough to it.
groups = []
visited = set()

for i in range(len(data)):
    if i in visited:
        continue
    visited.add(i)
    # Scan row i in one NumPy operation instead of a Python-level inner loop;
    # the offset maps positions in the slice back to absolute row indices.
    similar = np.flatnonzero(cosine_sim[i, i + 1:] > threshold) + i + 1
    # Skip reviews already claimed by an earlier group so that no review is
    # listed in more than one group.
    members = [j for j in similar.tolist() if j not in visited]
    if members:  # Only keep groups with more than one element
        groups.append([i, *members])
        visited.update(members)

# Assign a title to each group
def get_group_title(group_indices):
    """Return the most frequent non-stop-word among a group's processed reviews.

    Args:
        group_indices: Positional row indices into the module-level ``data``
            frame identifying the reviews that belong to the group.

    Returns:
        The most common remaining word, or ``"Unknown"`` when no words are
        left after stop-word filtering.
    """
    # Merge the processed text of every review in the group
    merged_text = ' '.join(data.iloc[group_indices]['processed_review'])
    english_stop_words = set(stopwords.words('english'))

    # Count each word that is not an English stop word
    frequency = Counter(
        token
        for token in merged_text.split()
        if token not in english_stop_words
    )
    if not frequency:
        return "Unknown"

    # The single most frequent word becomes the title
    top_word, _ = frequency.most_common(1)[0]
    return top_word

# Assemble one result record per group: 1-based number, title, raw comments
group_results = [
    {
        'Group': number,
        'Title': get_group_title(member_indices),
        'Comments': data.iloc[member_indices]['review'].tolist(),
    }
    for number, member_indices in enumerate(groups, start=1)
]

# Print every group followed by its comments
for entry in group_results:
    print(f"\nGroup {entry['Group']} - Title: {entry['Title']}")
    print("Comments:")
    for text in entry['Comments']:
        print(f"- {text}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eren\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eren\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eren\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Group 1 - Title: emperor
Comments:
- FOR THE EMPEROR
- FOR THE EMPEROR
- FOR THE EMPEROR
- FOR THE EMPEROR‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚¢Ä‚†®‚†≠‚¢≠‚£≠‚¢≠‚°•‚†≠‚†ï‚°Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚£Ä‚†Ñ‚†Ç‚†â‚¢∑‚£†‚£º‚°õ‚£ò‚£∑‚£Ä‚°∂‚†â‚†ê‚††‚°Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚¢Ä‚†¨‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚¢ß‚°Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚¢Ä‚†á‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚¢∞‚°Ä‚†Ä‚†Ä‚†Ä‚†Ä‚°å‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚¢±‚†Ä‚†Ä‚†Ä‚†Ä‚†É‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†∏‚†Ä‚†Ä‚°î‚†ä‚°Ä‚†Ä‚¢∞‚°§‚†Ñ‚£Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚†Ä‚£Ä‚††‚†§‚°Ü‚†Ä‚¢à‚†ê‚¢¢‚°á‚†Ä‚¢á‚£Ä‚†Ä‚†à‚¢Ü‚†Ä‚†à‚†Å‚†í‚°†‚†§‚†§‚¢Ñ‚†í‚†à‚†Å‚†Ä‚°†‚†Å‚¢É‚£Ñ‚°∏‚†Ä‚¢∏‚†É‚†Ä‚°Ü‚†Ä‚†Ä‚†Ä‚†Ñ‚†à‚†ê‚†í‚†à‚†Ä‚°§‚¢§‚†Ä‚†Å‚†í‚†Ç‚¢Å‚††‚†Ç‚†Ä‚†Ä‚¢±‚†Ä‚†∏‚†Ä‚¢Ä‚£ó‚£≤‚¢∑‚†Ä‚†Ä‚†Ä‚†â‚¢Ä‚°Ç‚£§‚°í‚£í‚£Ñ‚††‚°Ñ‚†â‚†Ä‚†Ä‚†Ä‚°∂‚£ñ‚°∫‚°Ä‚†à‚°Ä‚°Å‚°è‚£á‚¢ø‚†Ä‚†à‚†Ä‚†∞‚¢Å‚¢∑‚°á‚°∑‚£ø‚¢∏‚°∑‚°à‚†Ü‚†Ä‚†Å‚†Ä‚°ø‚£ø‚¢ø‚†à‚¢Ä‚†ò‚†ß‚°ß‚†¥‚£∏‚†Ä‚†