In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import pandas as pd
import numpy as np

In [3]:
df= pd.read_csv("lemmatized_sentences.csv")

In [5]:
df

Unnamed: 0,simply incredible
0,essential patch first month addition immortal ...
1,game game game game official release date
2,still game game game game game game game game ...
3,still launch properly freeze launch otherwise ...
4,got game cdkeys ca even get refund either
...,...
3456,wish often struggle navigation
3457,relying online guide find way slightly detract...
3458,first four chapter solid offering plenty conte...
3459,however chapter five six feel bit rushed could...


In [7]:
df.columns = ["0"]

# NaN değerleri ve boş stringleri temizle
df = df.dropna()
df = df[df["0"].str.strip() != ""]

# Temizlenmiş ilk 5 cümleyi göster
print(df.head())

                                                   0
0  essential patch first month addition immortal ...
1          game game game game official release date
2  still game game game game game game game game ...
3  still launch properly freeze launch otherwise ...
4          got game cdkeys ca even get refund either


In [9]:
#Varsayalım ki DataFrame'iniz df ve yorumlar '0' sütununda
text = ' '.join(df['0'].astype(str).tolist())  # Tüm yorumları tek bir metin haline getir
sentences = sent_tokenize(text)  # Metni cümlelere ayır

In [11]:
# Stopwords listesini almak
stop_words = set(stopwords.words('english'))

In [13]:
# Kelimeleri tokenleştirip, lemmatize etme ve stemleme
def preprocess_sentence(sentence):
    tokens = word_tokenize(sentence)  # Cümleyi kelimelere ayır
    # Sadece harf olan kelimeleri al ve stopword'leri çıkar
    filtered_tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]
    
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]  # Lemmatize etme
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]  # Stemleme
    
    return lemmatized_tokens, stemmed_tokens

In [15]:
# Her cümleyi tokenleştir, lemmatize et ve stemle
tokenized_corpus_lemmatized = []
tokenized_corpus_stemmed = []

In [19]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Gerekli verileri indir
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Tanımlamalar (Eksik olanlar burada)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eren\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eren\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eren\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
for sentence in sentences:
    lemmatized_tokens, stemmed_tokens = preprocess_sentence(sentence)
    tokenized_corpus_lemmatized.append(lemmatized_tokens)
    tokenized_corpus_stemmed.append(stemmed_tokens)

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
# Ön işlenmiş token listelerini tekrar metne çeviriyoruz
lemmatized_texts = [' '.join(tokens) for tokens in tokenized_corpus_lemmatized]

lemmatized_texts[:3]



In [27]:
# TF-IDF vektörizerı başlatıyoruz
vectorizer = TfidfVectorizer()

# TF-IDF matrisini oluşturuyoruz
#terim frekansları, belge frekanslarıni hesplar
#TF-IDF vektörlerine dönüştürür
tfidf_matrix = vectorizer.fit_transform(lemmatized_texts)

## Kelimeleri alalım
#F-IDF vektörleştirme işleminde kullanılan tüm kelimelerin essiz bir listesini döndürur
feature_names = vectorizer.get_feature_names_out()

# TF-IDF matrisini pandas DataFrame'e çevir-gorunurluk acisindan- calismasi kolay
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# İlk birkaç satırı gösterelim-ilk 5 cümle
print(tfidf_df.head())

#Her satır bir cümleyi temsil eder
#Her sütun bir kelimeyi temsil eder
#Hücreler ise o kelimenin o cümledeki TF-IDF skorudur - her cumle icin degisir-bakiniz:slaytlar

        aaa  \
0  0.003432   

   aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa

In [29]:
tfidf_df.to_csv('tfidf_lemmatized1.csv', index=False)

In [31]:
# İlk cümle için TF-IDF skorlarını al
first_sentence_vector = tfidf_df.iloc[0]

# Skorlara göre sırala (yüksekten düşüğe)
top_5_words = first_sentence_vector.sort_values(ascending=False).head(15)

# Sonucu yazdır
print("İlk cümlede en yüksek TF-IDF skoruna sahip 5 kelime:")
print(top_5_words)

İlk cümlede en yüksek TF-IDF skoruna sahip 5 kelime:
game      0.758876
like      0.170110
get       0.130401
time      0.114224
even      0.113733
play      0.106870
good      0.106380
bos       0.098046
feel      0.095595
one       0.094614
fun       0.087751
player    0.081868
really    0.079908
new       0.078927
make      0.077947
Name: 0, dtype: float64


In [33]:
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
# player kelimesinin vektörünü alalım
player_index = feature_names.tolist().index('player')  # 'room' kelimesinin indeksini bul

# player kelimesinin TF-IDF vektörünü alıyoruz ve 2D formatta yapıyoruz
player_vector = tfidf_matrix[:, player_index].toarray()

# Tüm kelimelerin TF-IDF vektörlerini alıyoruz
tfidf_vectors = tfidf_matrix.toarray()

# Cosine similarity hesaplayalım
similarities = cosine_similarity(player_vector.T, tfidf_vectors.T)

# Benzerlikleri sıralayalım ve en yüksek 5 kelimeyi seçelim
similarities = similarities.flatten()
top_5_indices = similarities.argsort()[-6:][::-1]  # 6. en büyükten başlıyoruz çünkü kendisi de dahil

# Sonuçları yazdıralım
for index in top_5_indices:
    print(f"{feature_names[index]}: {similarities[index]:.4f}")

𝘳𝘶𝘪𝘯𝘦𝘥: 1.0000
goody: 1.0000
flc: 1.0000
flee: 1.0000
fleshed: 1.0000
flexing: 1.0000
