In [104]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import nltk

# Fetch the NLTK corpora this notebook relies on: the stop-word lists and
# WordNet (used by the stop-word filter and the lemmatizer further down).
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Read the entry dump; lines=True means the file holds one JSON record per line.
json_file_name = r"C:\Users\E.Sahin\Downloads\eksi_party_entries\eksi_party_entries.jsonl"
df = pd.read_json(json_file_name, encoding='utf-8', lines=True)

# Remember the starting row count so the final summary can report total removals.
initial_count = df.shape[0]
print(f"Initial Dataset Size: {initial_count} entries")

# Restrict the corpus to the two topics under study.
selected_topics = ['recep tayyip erdoğan', 'kemal kılıçdaroğlu']
is_selected_topic = df['topic'].isin(selected_topics)
df_filtered = df.loc[is_selected_topic]

# Report how many rows the topic restriction discarded.
topics_removed = initial_count - df_filtered.shape[0]
print(f"Entries removed by topic filtering: {topics_removed}")
print(f"Remaining entries: {df_filtered.shape[0]}")

# Remove overactive authors.
# Step 1: count entries per (topic, author) pair.
author_counts = (
    df_filtered
    .groupby(['topic', 'author_id'])
    .size()
    .reset_index(name='entry_count')
)

# Step 2: per-topic activity threshold = this quantile of the per-author counts.
OVERACTIVE_QUANTILE = 0.99
percentiles = author_counts.groupby('topic')['entry_count'].quantile(OVERACTIVE_QUANTILE)

# Step 3: flag every author whose count within a topic exceeds that topic's
# threshold. Mapping each row's topic to its threshold replaces the per-topic
# Python loop and, unlike `percentiles[topic]`, cannot raise KeyError when one
# of the selected topics has no entries at all.
topic_thresholds = author_counts['topic'].map(percentiles)
exceeds_threshold = author_counts['entry_count'] > topic_thresholds

# Convert to a unique integer array (an author can be flagged in both topics).
overactive_authors = np.unique(
    author_counts.loc[exceeds_threshold, 'author_id'].to_numpy()
).astype(int)

# Step 4: drop the flagged authors. The filter keys on author_id only, so an
# author flagged in one topic loses their entries in BOTH topics.
before_overactive = len(df_filtered)
df_filtered = df_filtered[~df_filtered['author_id'].isin(overactive_authors)]
overactive_removed = before_overactive - len(df_filtered)
print(f"\nOveractive authors identified: {len(overactive_authors)}")
print(f"Entries removed from overactive authors: {overactive_removed}")
print(f"Remaining entries: {len(df_filtered)}")

# Drop entries whose entire text is a single http(s) URL.
url_pattern = re.compile(r'^https?:\/\/\S+$')
before_urls = df_filtered.shape[0]

# True for every row whose text (coerced to str) matches the URL-only pattern.
is_url_only = df_filtered['entry_text'].apply(
    lambda entry: url_pattern.match(str(entry)) is not None
)
df_filtered = df_filtered[~is_url_only]

urls_removed = before_urls - df_filtered.shape[0]
print(f"\nURL-only entries removed: {urls_removed}")
print(f"Remaining entries: {df_filtered.shape[0]}")

# Remove reference-only entries, i.e. entries made up solely of one or more
# "(bkz: ...)" cross-references.
#
# The pattern deliberately avoids a lazy `.*?` between the opening "(bkz: " and
# a final `\)$`: because of the end anchor the lazy dot still stretches to the
# LAST ")" on the line, so an entry such as "(bkz: x) real commentary (aside)"
# would be discarded as reference-only. Here each reference body may not contain
# parentheses except for one nested level (e.g. "(bkz: matrix (film))"), and
# references may be separated or surrounded by whitespace.
before_refs = len(df_filtered)
reference_pattern = re.compile(
    r'^\s*(?:\(bkz: [^()]*(?:\([^()]*\)[^()]*)*\)\s*)+$'
)
df_filtered = df_filtered[
    ~df_filtered['entry_text'].apply(lambda x: bool(reference_pattern.match(str(x))))
]
refs_removed = before_refs - len(df_filtered)
print(f"\nReference-only entries removed: {refs_removed}")
print(f"Remaining entries: {len(df_filtered)}")

# Length filtering: keep entries whose character count lies in the closed range
# [MIN_ENTRY_LENGTH, MAX_ENTRY_LENGTH].
MIN_ENTRY_LENGTH = 35
MAX_ENTRY_LENGTH = 1407

before_length = len(df_filtered)

# .assign() returns a new frame, so the helper column is not written into a
# filtered slice of the previous frame (which raises SettingWithCopyWarning on
# pandas versions without copy-on-write). The column is kept on the frame
# because a later step drops it explicitly.
df_filtered = df_filtered.assign(
    entry_length=df_filtered['entry_text'].apply(lambda x: len(str(x)))
)
df_filtered = df_filtered[
    df_filtered['entry_length'].between(MIN_ENTRY_LENGTH, MAX_ENTRY_LENGTH)
]

length_removed = before_length - len(df_filtered)
print(f"\nEntries removed by length filtering: {length_removed}")
print(f"Remaining entries: {len(df_filtered)}")

# Text preprocessing resources, read by preprocess_text as module-level names.
# NOTE(review): WordNetLemmatizer is backed by WordNet, an English lexical
# database — confirm it has the intended effect on Turkish tokens.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('turkish'))

# Python's str.lower() is locale-independent: it maps 'I' -> 'i' and
# 'İ' -> 'i' + U+0307 (combining dot above). Turkish needs 'I' -> 'ı' and
# 'İ' -> 'i', so these two letters are translated before lowercasing.
_TURKISH_UPPER_I_TABLE = str.maketrans({'I': 'ı', 'İ': 'i'})

# Built once instead of on every call: deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text, *, stop_word_set=None, lemmatize=None):
    """
    Normalize one entry for bag-of-words style models.

    Steps, in the order they are applied:
    1. Lowercase the text using Turkish rules for 'I' / 'İ'.
    2. Remove the literal marker 'bkz:' (the referenced title itself is kept).
    3. Replace '$' with 'ş' when it touches a word character on either side.
    4. Remove URLs (any whitespace-delimited run starting with 'http').
    5. Delete ASCII punctuation (characters are removed, not replaced by a
       space, so "hard'ın" becomes "hardın").
    6. Drop stop words.
    7. Lemmatize the remaining tokens and join them with single spaces.

    Parameters:
    text (str): The input text to preprocess.
    stop_word_set (collection of str, optional): Tokens to drop in step 6.
        Defaults to the module-level `stop_words`.
    lemmatize (callable str -> str, optional): Applied to every kept token in
        step 7. Defaults to the module-level `lemmatizer.lemmatize`.
        NOTE(review): that default is an NLTK WordNetLemmatizer, which is built
        on the English WordNet; pass a Turkish-aware callable here if real
        Turkish lemmas are needed.

    Returns:
    str: The preprocessed text ('' when nothing survives the filters).
    """
    # Resolve the defaults at call time so later re-assignment of the
    # module-level objects is picked up, exactly as before.
    if stop_word_set is None:
        stop_word_set = stop_words
    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize

    # 1. Turkish-aware lowercasing
    text = text.translate(_TURKISH_UPPER_I_TABLE).lower()

    # 2. Strip the 'bkz:' marker
    text = re.sub(r'bkz:', '', text)

    # 3. '$' adjacent to a word character stands in for 'ş' (e.g. "me$gul")
    text = re.sub(r'(?<=\w)\$|\$(?=\w)', 'ş', text)

    # 4. Remove URLs (must happen before punctuation is stripped, otherwise the
    #    URL would dissolve into ordinary-looking tokens)
    text = re.sub(r'http\S+', '', text)

    # 5. Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # 6 + 7. Drop stop words, lemmatize what is left, re-join with single spaces
    return ' '.join(
        lemmatize(token) for token in text.split() if token not in stop_word_set
    )

# Create the processed text column and drop the helper length column in one
# chained expression. .assign() builds a new frame, so nothing is written into
# a filtered slice of an earlier frame (which raises SettingWithCopyWarning on
# pandas versions without copy-on-write).
df_filtered = (
    df_filtered
    .assign(processed_text=df_filtered['entry_text'].apply(preprocess_text))
    .drop(columns='entry_length')
)

# Prepare final datasets: the BERT variant carries only the raw text, the
# traditional-ML variant additionally carries the preprocessed text.
base_columns = ['entry_id', 'entry_date', 'topic', 'author_id', 'entry_text']
bert_dataset = df_filtered[base_columns]
traditional_ml_dataset = df_filtered[base_columns + ['processed_text']]

# Save datasets as Excel files
bert_output_file = 'filtered_ekşi_sozluk_bert.xlsx'
traditional_ml_output_file = 'filtered_ekşi_sozluk_traditional_ml.xlsx'
bert_dataset.to_excel(bert_output_file, index=False)
traditional_ml_dataset.to_excel(traditional_ml_output_file, index=False)

# Print final summary. The file names are taken from the variables used for
# saving, so the message reports the files that were actually written (it
# used to announce ".csv" files although ".xlsx" files are produced).
print("\nFinal Summary:")
print(f"Total entries removed: {initial_count - len(df_filtered)}")
print(f"Final dataset size: {len(df_filtered)}")
print("\nDatasets saved as:")
print(f"- {bert_output_file}")
print(f"- {traditional_ml_output_file}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\E.Sahin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\E.Sahin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Initial Dataset Size: 221850 entries
Entries removed by topic filtering: 67968
Remaining entries: 153882

Overactive authors identified: 406
Entries removed from overactive authors: 30525
Remaining entries: 123357

URL-only entries removed: 376
Remaining entries: 122981

Reference-only entries removed: 4066
Remaining entries: 118915

Entries removed by length filtering: 11483
Remaining entries: 107432

Final Summary:
Total entries removed: 114418
Final dataset size: 107432

Datasets saved as:
- filtered_ekşi_sozluk_bert.csv
- filtered_ekşi_sozluk_traditional_ml.csv


In [101]:
# Preview the first five rows of the traditional-ML dataset.
traditional_ml_dataset.head(n=5)


Unnamed: 0,entry_id,entry_date,topic,author_id,entry_text,processed_text
3,9145023,2006-02-17 19:22:00,recep tayyip erdoğan,8099,me$gul ettigi makamin gerekli gordugu vasiflar...,meşgul ettigi makamin gerekli gordugu vasiflar...
5,10006296,2006-05-09 21:38:00,recep tayyip erdoğan,8099,kendisine cok seri laflar hazirladigim adam. i...,kendisine cok seri laflar hazirladigim adam iş...
7,49236290,2015-02-17 09:41:00,recep tayyip erdoğan,8099,her beyanını işittiğimde aklıma die hard'ın so...,beyanını işittiğimde aklıma die hardın sonlari...
8,52082664,2015-08-06 11:58:00,recep tayyip erdoğan,8099,hakkında bu kadar entry girilince konuştu sand...,hakkında kadar entry girilince konuştu sandım ...
9,123740137,2021-05-25 05:54:00,recep tayyip erdoğan,8099,hakkındaki piyon olduğu iddiası putin'i hatırl...,hakkındaki piyon olduğu iddiası putini hatırla...


In [98]:
# Count the surviving entries per topic and print the result as a two-column table.
topic_sizes = bert_dataset.groupby('topic').size()
entries_by_topic = topic_sizes.reset_index(name='entry_count')
print(entries_by_topic)

                  topic  entry_count
0    kemal kılıçdaroğlu        50113
1  recep tayyip erdoğan        57319
