In [None]:
# Verisetine ID ekledik.

In [1]:
import pandas as pd

# Dosyayı okuma
file_path = 'filtered_reviews.txt'

# Dosyayı açma ve okuma
with open(file_path, 'r') as file:
    comments = file.readlines()

# "nth comment:" ifadesini kaldırma ve ID oluşturma
comments_cleaned = [comment.split(":", 1)[-1].strip() for comment in comments]  # "nth comment:" kısmını kaldırıyoruz

# ID ekleyerek yeni veri yapısını oluşturma
data = {'ID': range(1, len(comments_cleaned) + 1), 'Comment': comments_cleaned}

# DataFrame oluşturma
df = pd.DataFrame(data)

# CSV'ye kaydetme
output_file_path = 'comments_with_ids.csv'
df.to_csv(output_file_path, index=False)

print(f"CSV dosyası başarıyla oluşturuldu: {output_file_path}")
df.head(10)


CSV dosyası başarıyla oluşturuldu: comments_with_ids.csv


Unnamed: 0,ID,Comment
0,1,Update: Now in video form
1,2,call me by your name’s cum peach walked so par...
2,3,Another Bong hit.
3,4,Our expectations were high but HOLY FUCK
4,5,a question to people who rate this 4.5: what m...
5,6,One detail I noticed this time around is that ...
6,7,The tent won’t leak. It’s from America.
7,8,The bloody napkin scene....top 3 scenes of all...
8,9,morse code me by your name and i'll morse code...
9,10,maybe the real parasite... was the friends we ...


In [None]:
#Stop-words, Emojileri, özel karakterleri, noktalama işaretlerini temizledik.

In [3]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NLTK stopwords yükleme (İlk kez çalıştırdığınızda NLTK kaynaklarını indirmeniz gerekebilir)
import nltk

nltk.download('punkt_tab')

nltk.download('punkt')
nltk.download('stopwords')

# Dosyayı okuma
input_file_path = 'comments_with_ids.csv'

# CSV dosyasını okuma
df = pd.read_csv(input_file_path)

# Stopwords ve noktalama işaretleri temizleme fonksiyonu
def clean_text(text):
    # Küçük harfe çevirme
    text = text.lower()

    # Emojileri ve özel karakterleri temizleme
    text = re.sub(r'[^\w\s]', '', text)  # Noktalama işaretlerini kaldırma
    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)  # Emoji temizleme

    # Tokenizasyon (kelimelere ayırma)
    words = word_tokenize(text)

    # Stop-words temizleme
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    # Temizlenmiş metni geri birleştirme
    return ' '.join(filtered_words)

# Her bir yorumu temizleme
df['Cleaned_Comment'] = df['Comment'].apply(clean_text)

# Yeni DataFrame'i kaydetme
output_file_path = 'cleaned_comments.csv'
df.to_csv(output_file_path, index=False)

print(f"Temizlenmiş veriler kaydedildi: {output_file_path}")
df.head(10)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Temizlenmiş veriler kaydedildi: cleaned_comments.csv


Unnamed: 0,ID,Comment,Cleaned_Comment
0,1,Update: Now in video form,update video form
1,2,call me by your name’s cum peach walked so par...,call names cum peach walked parasites killer p...
2,3,Another Bong hit.,another bong hit
3,4,Our expectations were high but HOLY FUCK,expectations high holy fuck
4,5,a question to people who rate this 4.5: what m...,question people rate 45 want literally want
5,6,One detail I noticed this time around is that ...,one detail noticed time around min mr park rea...
6,7,The tent won’t leak. It’s from America.,tent wont leak america
7,8,The bloody napkin scene....top 3 scenes of all...,bloody napkin scenetop 3 scenes time hands
8,9,morse code me by your name and i'll morse code...,morse code name ill morse code mine
9,10,maybe the real parasite... was the friends we ...,maybe real parasite friends made along way


In [None]:
#lemmatization işlemi

In [5]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('wordnet')

# Lemmatizer'ı başlatma
lemmatizer = WordNetLemmatizer()

# Stop-words yükleme
stop_words = set(stopwords.words('english'))

# Dosyayı okuma
input_file_path = 'cleaned_comments.csv'

# CSV dosyasını okuma
df = pd.read_csv(input_file_path)

# Stopwords, noktalama işaretleri, emojiler ve lemmatization işlemi yapan fonksiyon
def clean_and_lemmatize(text):
    # Küçük harfe çevirme
    text = text.lower()

    # Emojileri ve özel karakterleri temizleme
    text = re.sub(r'[^\w\s]', '', text)  # Noktalama işaretlerini kaldırma
    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)  # Emoji temizleme

    # Kelimelere ayırma (tokenization)
    words = word_tokenize(text)

    # Stopwords temizleme ve lemmatization
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Temizlenmiş ve lemmatize edilmiş kelimeleri birleştirerek metni oluşturma
    return ' '.join(filtered_words)

# Her bir yorumu temizleme ve lemmatization uygulama
df['Lemmatized_Comment'] = df['Cleaned_Comment'].apply(clean_and_lemmatize)

# Yeni DataFrame'i kaydetme
output_file_path = 'lemmatized_comments.csv'
df.to_csv(output_file_path, index=False)

print(f"Temizlenmiş ve lemmatize edilmiş veriler kaydedildi: {output_file_path}")
df.head(10)


[nltk_data] Downloading package wordnet to /root/nltk_data...


Temizlenmiş ve lemmatize edilmiş veriler kaydedildi: lemmatized_comments.csv


Unnamed: 0,ID,Comment,Cleaned_Comment,Lemmatized_Comment
0,1,Update: Now in video form,update video form,update video form
1,2,call me by your name’s cum peach walked so par...,call names cum peach walked parasites killer p...,call name cum peach walked parasite killer pea...
2,3,Another Bong hit.,another bong hit,another bong hit
3,4,Our expectations were high but HOLY FUCK,expectations high holy fuck,expectation high holy fuck
4,5,a question to people who rate this 4.5: what m...,question people rate 45 want literally want,question people rate 45 want literally want
5,6,One detail I noticed this time around is that ...,one detail noticed time around min mr park rea...,one detail noticed time around min mr park rea...
6,7,The tent won’t leak. It’s from America.,tent wont leak america,tent wont leak america
7,8,The bloody napkin scene....top 3 scenes of all...,bloody napkin scenetop 3 scenes time hands,bloody napkin scenetop 3 scene time hand
8,9,morse code me by your name and i'll morse code...,morse code name ill morse code mine,morse code name ill morse code mine
9,10,maybe the real parasite... was the friends we ...,maybe real parasite friends made along way,maybe real parasite friend made along way


In [7]:
import pandas as pd

# Read the CSV file
file_path = 'lemmatized_comments_with_labels.csv'  # Update this path with the correct file location
comments_df = pd.read_csv(file_path)

# Count the number of each sentiment label
label_counts = comments_df['Label'].value_counts()

# Print the result
print(label_counts)


Label
positive    1579
neutral      584
negative     370
Name: count, dtype: int64
