In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import Counter
import re

# Download the NLTK resources used below (running this once is enough)
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Load the data
data = pd.read_csv('C:/Users/eren/Desktop/sıkıldım1/data/veri_5k.csv')

# Text preprocessing
# Cache for the stop-word set and the lemmatizer so they are built once
# instead of once per review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop_words, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return (
        _PREPROCESS_RESOURCES['stop_words'],
        _PREPROCESS_RESOURCES['lemmatizer'],
    )


def preprocess_text(text):
    """Normalize one review into a space-separated string of lemmatized tokens.

    Steps: lowercase, drop every character that is not ``a-z`` or whitespace,
    tokenize, remove English stop words, lemmatize, and re-join with spaces.

    Args:
        text: The raw review. Any non-string value (e.g. the NaN that pandas
            produces for an empty CSV cell) is treated as an empty review.

    Returns:
        The cleaned text, or ``''`` when the input is not a string or nothing
        survives the cleaning.
    """
    # Guard first: calling .lower() on NaN/None would raise AttributeError
    # and abort the whole DataFrame.apply().
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _get_preprocess_resources()

    # Lowercase, then strip punctuation, digits and any non-ASCII symbols
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize, drop stop words, then lemmatize what remains
    tokens = word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )

# Preprocess the reviews
data['processed_review'] = data['review'].apply(preprocess_text)

# TF-IDF vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['processed_review'])

# Compute the pairwise cosine similarity between all reviews
cosine_sim = cosine_similarity(tfidf_matrix)

# Similarity threshold: pairs scoring strictly above this are grouped together
threshold = 0.8

# Build the groups.
# Each review is assigned to at most one group: the group of the first
# (lowest-index) unvisited review it is similar to.
groups = []
visited = set()

for i in range(len(data)):
    if i in visited:
        continue
    visited.add(i)

    # Indices after i whose similarity to i exceeds the threshold,
    # found with a single vectorized comparison over the row.
    later = np.flatnonzero(cosine_sim[i, i + 1:] > threshold) + (i + 1)

    # Skip reviews that already belong to an earlier group; without this
    # check the same review could show up in several groups.
    members = [j for j in later.tolist() if j not in visited]
    visited.update(members)

    if members:  # Only keep groups with more than one element
        groups.append([i] + members)

# Assign a title to each group
def get_group_title(group_indices):
    """Return the most frequent non-stop-word among a group's processed reviews.

    Args:
        group_indices: Positional row indices into the module-level ``data``
            frame identifying the reviews of one group.

    Returns:
        The most common word across the group's ``processed_review`` texts,
        or ``"Unknown"`` when no word is left after stop-word filtering.
    """
    # Merge every processed review of the group into one string
    merged = ' '.join(data.iloc[group_indices]['processed_review'])
    english_stop_words = set(stopwords.words('english'))

    # Count each remaining word, skipping stop words
    frequencies = Counter(
        term for term in merged.split() if term not in english_stop_words
    )
    if not frequencies:
        return "Unknown"

    # Pick the single most frequent word
    top_term, _ = frequencies.most_common(1)[0]
    return top_term

# Prepare the results: one record per group with a 1-based number,
# a title and the original (unprocessed) review texts.
group_results = []
for number, member_indices in enumerate(groups, start=1):
    comments = data.iloc[member_indices]['review'].tolist()
    title = get_group_title(member_indices)
    group_results.append({'Group': number, 'Title': title, 'Comments': comments})

# Print the results: a header, a "Comments:" line, then one bullet per review
for entry in group_results:
    header = f"\nGroup {entry['Group']} - Title: {entry['Title']}"
    bullets = [f"- {text}" for text in entry['Comments']]
    print('\n'.join([header, "Comments:"] + bullets))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eren\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eren\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eren\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Group 1 - Title: emperor
Comments:
- FOR THE EMPEROR
- FOR THE EMPEROR
- FOR THE EMPEROR
- FOR THE EMPEROR⠀⠀⠀⠀⠀⠀⠀⠀⢀⠨⠭⢭⣭⢭⡥⠭⠕⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⠄⠂⠉⢷⣠⣼⡛⣘⣷⣀⡶⠉⠐⠠⡀⠀⠀⠀⠀⠀⠀⠀⠀⢀⠬⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢧⡀⠀⠀⠀⠀⠀⢀⠇⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢰⡀⠀⠀⠀⠀⡌⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢱⠀⠀⠀⠀⠃⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠸⠀⠀⡔⠊⡀⠀⢰⡤⠄⣀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⠠⠤⡆⠀⢈⠐⢢⡇⠀⢇⣀⠀⠈⢆⠀⠈⠁⠒⡠⠤⠤⢄⠒⠈⠁⠀⡠⠁⢃⣄⡸⠀⢸⠃⠀⡆⠀⠀⠀⠄⠈⠐⠒⠈⠀⡤⢤⠀⠁⠒⠂⢁⠠⠂⠀⠀⢱⠀⠸⠀⢀⣗⣲⢷⠀⠀⠀⠉⢀⡂⣤⡒⣒⣄⠠⡄⠉⠀⠀⠀⡶⣖⡺⡀⠈⡀⡁⡏⣇⢿⠀⠈⠀⠰⢁⢷⡇⡷⣿⢸⡷⡈⠆⠀⠁⠀⡿⣿⢿⠈⢀⠘⠧⡧⠴⣸⠀⠂⠀⡎⣇⡞⡏⣇⣼⢹⢲⢱⢓⠀⠀⠀⣇⠦⢼⠰⠂⠀⠈⢏⡔⠈⠀⢏⢿⠁⢿⡇⣇⡿⣿⣼⢸⣿⠈⡝⠙⠐⠁⢪⡚⠁⠀⠀⠀⠀⠑⠚⠀⠒⠙⠦⠜⠷⣏⣃⣛⣹⠾⠣⠴⠋⠒⠀⠐⠊⠀⠀⠀
- FOR THE EMPEROR!!!
- FOR THE EMPEROR!!!
- FOR THE EMPEROR!
- For the emperor
- For the emperor!
- FOR THE EMPEROR
- FOR THE EMPEROR
- For the Emperor!
- For The Emperor
- For the Emperor
- For the emperor!
- FOR THE EMPEROR!!!!!!!
- FOR THE EMPEROR!
- FOR THE EMPEROR!!
- FOR THE EMPEROR!!!!!!!!!!!
- For the Emperor!
- For the Emperor
- FOR THE EMPEROR⠀⠀⠀⠀⠀⠀⠀⠀⢀⠨⠭⢭⣭⢭⡥⠭⠕⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⠄⠂⠉⢷⣠⣼⡛⣘⣷⣀⡶⠉⠐⠠⡀⠀⠀⠀⠀⠀⠀⠀⠀⢀⠬⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢧⡀⠀⠀⠀⠀⠀⢀⠇⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢰⡀⠀⠀⠀⠀⡌⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢱⠀⠀⠀⠀⠃⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠸⠀⠀⡔⠊⡀⠀⢰⡤⠄