# Ekşi Sözlük Country Analysis

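A simplified, modular analysis of Ekşi Sözlük country topics. Tokenization and TF-IDF logic live in the `pyscripts` utility modules; this notebook merges the raw data, runs the analyses, and visualizes the results.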

In [2]:
import json

def merge_eksi_data(file_paths: list, target_ids: list, output_path: str):
    """Merge selected topics from several JSON dumps into one cleaned file.

    Every input file is loaded as a JSON object keyed by topic id. Topics
    whose id is listed in ``target_ids`` are kept; if the same id occurs in
    more than one file, the version from the later file replaces the earlier
    one. Each kept topic is reduced to its title, its id, and per entry the
    ``author``, ``created_at_raw`` and ``text`` fields (missing fields become
    empty strings).

    Parameters
    ----------
    file_paths : list
        Paths of the input JSON files, processed in order.
    target_ids : list
        Topic ids to keep. Ids may be given as strings or integers; they are
        compared as strings because JSON object keys are always strings.
    output_path : str
        Destination of the merged JSON file (UTF-8, indented, non-ASCII
        characters written verbatim).

    Returns
    -------
    None
        The result is written to ``output_path``; progress is printed.
    """
    # A set gives O(1) membership tests; str() lets callers pass integer ids.
    wanted_ids = {str(tid) for tid in target_ids}

    merged_data = {}
    for file_path in file_paths:
        print(f"Processing {file_path}...")
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        merged_data.update(
            (tid, content) for tid, content in data.items() if tid in wanted_ids
        )

    cleaned_data = {}
    for topic_id, content in merged_data.items():
        topic_meta = content.get('topic', {})
        cleaned_entries = [
            {
                'author': entry.get('author', ''),
                'created_at_raw': entry.get('created_at_raw', ''),
                'text': entry.get('text', ''),
            }
            for entry in content.get('entries', [])
        ]
        cleaned_data[topic_id] = {
            'topic': {
                'title': topic_meta.get('title', ''),
                'id': topic_id,
            },
            'entries': cleaned_entries,
        }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

    print(f"\nMerged data saved to: {output_path}")
    print(f"Total topics: {len(cleaned_data)}")

# Driver for the merge step: builds ../datasets/eksidata.json from the raw dumps.
if __name__ == "__main__":
    # Raw dumps to merge, in order; on duplicate topic ids the later file wins.
    INPUT_PATHS = [
        "../datasets/countries.json",
        "../datasets/three_country.json"
    ]
    
    # Topic ids to keep (strings, matching the JSON keys): 36 original topics
    # plus the 3 added under "New countries" below.
    TARGET_IDS = [
        '46687', '43748', '2099924', '58585', '128684', '39870', '64696', 
        '111737', '45394', '68980', '129500', '114173', '55971', '71457', 
        '385546', '86544', '167447', '61614', '96743', '56902', '50310', 
        '66320', '62870', '97065', '80137', '144150', '91933', '93047', 
        '127685', '35513', '55600', '240029', '36897', '57388', '42073', 
        '42072',
        # New countries
        '43722', '49635', '74714'
    ]
    
    # Merged, cleaned output file.
    OUTPUT_PATH = "../datasets/eksidata.json"
    merge_eksi_data(INPUT_PATHS, TARGET_IDS, OUTPUT_PATH)

Processing ../datasets/countries.json...
Processing ../datasets/three_country.json...

Merged data saved to: ../datasets/eksidata.json
Total topics: 39


## 1. Setup and Data Loading

In [None]:
import pandas as pd
import seaborn as sns
from pathlib import Path

from collections import Counter
from nltk.util import bigrams
import plotly.graph_objects as go
import networkx as nx
import numpy as np

from pyscripts.text_utils import tokenize, extract_country_terms, aggregate_tokens_by_group # custom modules
from pyscripts.tfidf_analysis import TfidfAnalyzer, analyze_by_group

# Apply seaborn's default theme for the rest of the notebook.
sns.set_theme()
# Location of the merged dataset (same path the merge step writes to).
DATA_PATH = Path("../datasets/eksidata.json")

## 2. Create Simplified DataFrame

In [4]:
def extract_records(raw_data):
    """Flatten the merged topic dictionary into one row per entry.

    Parameters
    ----------
    raw_data : dict
        Mapping of topic id to a dict with optional ``'topic'`` (holding a
        ``'title'``) and ``'entries'`` (a list of entry dicts) keys.

    Returns
    -------
    pd.DataFrame
        Columns ``topic``, ``author``, ``date`` and ``text``. ``date`` is the
        parsed ``created_at_raw`` value, or ``NaT`` when it cannot be parsed.
        The columns are present even when ``raw_data`` yields no entries.

    The number of unparseable dates is printed.
    """
    columns = ['topic', 'author', 'date', 'text']

    def parse_turkish_date(date_str):
        """Parse a 'dd.mm.YYYY HH:MM' timestamp; return NaT when invalid."""
        if not isinstance(date_str, str):
            return pd.NaT  # Missing or non-text value
        if '~' in date_str:  # Date range: keep the first timestamp only
            date_str = date_str.split('~')[0].strip()
        try:
            return pd.to_datetime(date_str, format='%d.%m.%Y %H:%M')
        except (ValueError, TypeError, OverflowError):
            # Only parsing failures are treated as invalid dates; anything
            # else (e.g. KeyboardInterrupt) is allowed to propagate.
            return pd.NaT

    records = [
        {
            'topic': content.get('topic', {}).get('title', ''),
            'author': entry.get('author', ''),
            'date': parse_turkish_date(entry.get('created_at_raw', '')),
            'text': entry.get('text', ''),
        }
        for content in raw_data.values()
        for entry in content.get('entries', [])
    ]

    # Naming the columns keeps the frame usable when there are no records.
    df = pd.DataFrame(records, columns=columns)
    print(f"Found {df['date'].isna().sum()} invalid dates")

    return df

## 3. Text Processing and Tokenization

In [None]:
from pyscripts.text_utils import get_turkish_stopwords

# Load the merged dataset; the context manager closes the file handle
# (the previous json.load(open(...)) form left it open).
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)
df = extract_records(raw_data)

# Apply tokenization using our utility function
# The tokenize() function uses:
# - Custom Turkish stopwords (BASE + DOMAIN specific)
# - BKZ pattern removal (ekşi sözlük cross-references)
# - GÖRSEL repetition removal (image placeholders)
# - Apostrophe normalization for Turkish possessives
# - Minimum token length: 2
df['tokens'] = df['text'].fillna("").apply(tokenize)

# Quick stats
df['token_count'] = df['tokens'].apply(len)
print(f"Average tokens per entry: {df['token_count'].mean():.1f}")
print(f"Median tokens per entry: {df['token_count'].median():.1f}")
print(f"Total unique authors: {df['author'].nunique()}")
print(f"Total unique topics: {df['topic'].nunique()}")

# Show stopword info
stopwords = get_turkish_stopwords(include_domain_stopwords=True)
print(f"\nTotal stopwords used: {len(stopwords)}")

Found 16 invalid dates
Average tokens per entry: 65.1
Median tokens per entry: 25.0
Total unique authors: 28094
Total unique topics: 39

Total stopwords used: 228


## 4. TF-IDF Analysis by Topic

In [None]:
# Work on a copy and give every entry a single space-joined token string,
# which is the text column handed to the TF-IDF helper below.
df_with_text = df.copy()
df_with_text['token_text'] = df_with_text['tokens'].map(' '.join)

# Terms excluded from the ranking: base stopwords (domain stopwords are not
# included here) plus the terms derived from the topic titles.
stopwords = get_turkish_stopwords(include_domain_stopwords=False)
country_terms = extract_country_terms(df['topic'].unique())
filter_terms = stopwords.union(country_terms)
FILTERED_TERMS = filter_terms  # Kept under a constant-style name for later cells
print(f"\nSome filter terms: {list(filter_terms)[:20]} ...")

# Analyzer configuration (GPU acceleration requested)
analyzer = TfidfAnalyzer(
    min_df=2,
    max_df=0.3,
    max_features=2000,
    use_gpu=True,
)

# One TF-IDF document per topic, top 10 terms each
tfidf_matrix, top_terms = analyze_by_group(
    df=df_with_text,
    group_col='topic',
    text_col='token_text',
    analyzer=analyzer,
    top_n=10,
    filter_terms=FILTERED_TERMS,
)

# Persist the ranking as UTF-8 with BOM
output_file = "country_topics_tfidf_top_terms.csv"
top_terms.to_csv(output_file, index=False, encoding='utf-8-sig')
print(f"\nResults saved to: {output_file}")

print(f"Top terms shape: {top_terms.shape}")
print(f"Topics analyzed: {top_terms['topic'].nunique()}")
top_terms.head(20)


Some filter terms: ['dahi', 'the', 'bunda', 'diğer', 'ayrıca', 'hakkında', 'sr', 'var', 'şunun', 'iran', 'yapıyor', 'biz', 'kıbrıs', 'çok', 'kore', 'hayli', 'da', 'israil', 'ki', 'göre'] ...




TF-IDF processing time: 2.27 seconds
Shape: 39 documents × 386 terms
GPU acceleration: enabled

Results saved to: country_topics_tfidf_top_terms.csv
Top terms shape: (390, 3)
Topics analyzed: 39


Unnamed: 0,topic,token,tfidf
0,almanya,bi,0.059472
1,almanya,büyük,0.100018
2,almanya,ev,0.053815
3,almanya,gün,0.058272
4,almanya,ilk,0.063048
5,almanya,iyi,0.1093
6,almanya,iş,0.085935
7,almanya,tek,0.055482
8,almanya,türk,0.09315
9,almanya,vs,0.06235


## 5. Visualization: Top Terms per Topic

In [7]:
import plotly.express as px

def plot_topic_terms_treemap(top_terms_df, n_terms=10, filter_terms=None):
    """Plot an interactive treemap of the highest-scoring terms per topic.

    Parameters
    ----------
    top_terms_df : pd.DataFrame
        Frame with ``topic``, ``token`` and ``tfidf`` columns.
    n_terms : int, default=10
        Maximum number of terms shown per topic, chosen by descending
        ``tfidf`` score.
    filter_terms : collection of str, optional
        Tokens to leave out of the plot. ``None`` disables filtering.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    data = top_terms_df
    # Explicit None check: truth-testing an array-like would be ambiguous.
    if filter_terms is not None:
        data = data[~data['token'].isin(filter_terms)]

    # Rank by score before truncating; otherwise head() would keep whichever
    # rows happen to come first in the input rather than the top terms.
    data = (
        data.sort_values('tfidf', ascending=False, kind='stable')
        .groupby('topic')
        .head(n_terms)
    )

    fig = px.treemap(
        data,
        path=['topic', 'token'],
        values='tfidf',
        color='topic',
        color_discrete_sequence=px.colors.qualitative.Set3,
        title="Top Terms per Topic (Interactive Treemap)",
    )
    fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    fig.show()

plot_topic_terms_treemap(top_terms, n_terms=10, filter_terms=FILTERED_TERMS)

## 6. Analysis by Author (Optional)

In [8]:
# Restrict the analysis to prolific authors (at least 20 entries each)
author_counts = df['author'].value_counts()
top_authors = author_counts.loc[lambda counts: counts >= 20].index

is_top_author = df['author'].isin(top_authors)
df_top_authors = df.loc[is_top_author].copy()
df_top_authors['token_text'] = df_top_authors['tokens'].map(' '.join)

print(f"Analyzing {len(top_authors)} authors with 20+ entries")

# One TF-IDF document per author; only the country terms are filtered out
analyzer_author = TfidfAnalyzer(min_df=2, max_features=1000, use_gpu=True)
_, top_terms_by_author = analyze_by_group(
    df=df_top_authors,
    group_col='author',
    text_col='token_text',
    analyzer=analyzer_author,
    top_n=10,
    filter_terms=country_terms,
)

# Preview: the five strongest terms of the first five authors
print("\nSample author-specific terms:")
for author in top_authors[:5]:
    author_rows = top_terms_by_author.loc[top_terms_by_author['author'] == author]
    best_terms = author_rows.nlargest(5, 'tfidf')
    terms_str = ", ".join(
        f"{token} ({score:.3f})"
        for token, score in zip(best_terms['token'], best_terms['tfidf'])
    )
    print(f"  {author}: {terms_str}")

Analyzing 649 authors with 20+ entries



The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



TF-IDF processing time: 0.79 seconds
Shape: 649 documents × 978 terms
GPU acceleration: enabled

Sample author-specific terms:
  sanver: abd (0.176), analiz (0.138), siyasal (0.127), jeopolitik (0.125), ab (0.116)
  bm778: abd (0.520), vs (0.240), ülkedir (0.233), çin (0.169), kendi (0.137)
  osbirci31: amerika (0.257), filan (0.219), dolar (0.182), abd (0.178), sürekli (0.155)
  buyuyup buyuk adam olacaktim: abd (0.599), trump (0.257), çin (0.229), avrupa (0.202), iyi (0.113)
  sanfransiskolu mufettis: abd (0.404), çin (0.234), california (0.175), amerikan (0.170), dolar (0.146)


## 7. Export Results

In [9]:
# Write the per-topic top terms to CSV; 'utf-8-sig' prepends a BOM
output_file = "top_tfidf_terms_by_topic_clean.csv"
top_terms.to_csv(output_file, encoding='utf-8-sig', index=False)
print("Results exported to: " + output_file)
print("Encoding: UTF-8 with BOM (Excel-compatible)")

Results exported to: top_tfidf_terms_by_topic_clean.csv
Encoding: UTF-8 with BOM (Excel-compatible)


## 8. Extract Bigrams

In [10]:
def extract_bigrams(tokens, min_freq=5):
    """Count adjacent token pairs and return the frequent ones.

    Parameters
    ----------
    tokens : iterable of str
        Token sequence; bigrams are formed from consecutive tokens.
    min_freq : int, default=5
        Minimum number of occurrences for a bigram to be kept.

    Returns
    -------
    pd.DataFrame
        Columns ``bigram`` (the two words joined by a space), ``count``,
        ``word1`` and ``word2``, sorted by ``count`` descending with a fresh
        index. The columns are present even when no bigram qualifies.
    """
    tokens = list(tokens)

    # Pair each token with its successor. Using zip keeps this function
    # independent of the module-level name `bigrams`, which other cells of
    # the notebook rebind.
    bigram_counts = Counter(zip(tokens, tokens[1:]))

    rows = [
        {
            'bigram': ' '.join(pair),
            'count': count,
            'word1': pair[0],
            'word2': pair[1],
        }
        for pair, count in bigram_counts.items()
        if count >= min_freq
    ]

    # Explicit columns: without them an empty result has no 'count' column
    # and the sort below raises KeyError.
    bigram_df = pd.DataFrame(rows, columns=['bigram', 'count', 'word1', 'word2'])

    return bigram_df.sort_values('count', ascending=False).reset_index(drop=True)


## 9. Analyze Bigrams by Topic

In [11]:
# Analyze bigrams by topic: maps topic title -> bigram DataFrame
topic_bigrams = {}

print("Extracting bigrams for each topic...")
# sort=False keeps topics in order of first appearance, like df['topic'].unique()
for topic, topic_rows in df.groupby('topic', sort=False):
    # Flatten the per-entry token lists in one pass; Series.sum() on lists
    # re-copies the growing list for every entry (quadratic).
    topic_tokens = [token for entry_tokens in topic_rows['tokens'] for token in entry_tokens]

    # Extract bigrams (minimum frequency = 3) and store the result
    topic_bigrams[topic] = extract_bigrams(topic_tokens, min_freq=3)

# Show summary statistics
print(f"\nBigram extraction complete!")
print(f"Topics analyzed: {len(topic_bigrams)}")
print(f"\nBigrams per topic:")
# The loop variable must not be called `bigrams`: that would rebind the
# imported nltk.util.bigrams function and break any later use of it.
for topic, topic_bigram_df in topic_bigrams.items():
    print(f"  {topic}: {len(topic_bigram_df)} bigrams")


Extracting bigrams for each topic...

Bigram extraction complete!
Topics analyzed: 39

Bigrams per topic:
  fransa: 1379 bigrams
  rusya: 6659 bigrams
  çin: 467 bigrams
  kanada: 2601 bigrams
  suudi arabistan: 944 bigrams
  yunanistan: 4365 bigrams
  şili: 111 bigrams
  ermenistan: 1625 bigrams
  iran: 5350 bigrams
  israil: 11650 bigrams
  somali: 173 bigrams
  cezayir: 93 bigrams
  kazakistan: 677 bigrams
  moğolistan: 53 bigrams
  lihtenştayn: 44 bigrams
  yeni zelanda: 208 bigrams
  angola: 19 bigrams
  venezuela: 777 bigrams
  el salvador: 42 bigrams
  vietnam: 102 bigrams
  japonya: 6421 bigrams
  kuzey kore: 120 bigrams
  güney kore: 752 bigrams
  filistin: 1556 bigrams
  ukrayna: 2940 bigrams
  sırbistan: 215 bigrams
  gürcistan: 558 bigrams
  makedonya: 283 bigrams
  bosna-hersek: 527 bigrams
  isviçre: 938 bigrams
  ırak: 269 bigrams
  kürdistan: 683 bigrams
  kıbrıs: 560 bigrams
  suriye: 3294 bigrams
  pakistan: 457 bigrams
  hindistan: 972 bigrams
  türkiye: 25846 bigram

In [12]:
def plot_bigram_network(bigram_df, n_bigrams=50, title="Top Bigrams Network"):
    """
    Create interactive network visualization of top bigrams using NetworkX and Plotly.

    Each bigram becomes a directed edge word1 -> word2 weighted by its count.
    Edge width grows with the logarithm of the count and node size with the
    node's degree. The layout is a spring layout with a fixed seed.

    Parameters:
    -----------
    bigram_df : pd.DataFrame
        DataFrame with columns: word1, word2, count. It does not need to be
        sorted; the rows with the highest counts are selected here.
    n_bigrams : int, default=50
        Number of top bigrams to visualize
    title : str
        Plot title

    Returns:
    --------
    None
        The figure is shown with fig.show(); a message is printed instead
        when there is nothing to plot.
    """
    # Rank by count before truncating so "top N" holds for unsorted input too.
    # The stable sort keeps the original order among equal counts, so input
    # that is already sorted by count yields the same rows as a plain head().
    top_bigrams = (
        bigram_df
        .sort_values('count', ascending=False, kind='stable')
        .head(n_bigrams)
    )

    if len(top_bigrams) == 0:
        print("No bigrams to visualize")
        return

    G = nx.DiGraph()
    for word1, word2, count in zip(
        top_bigrams['word1'], top_bigrams['word2'], top_bigrams['count']
    ):
        G.add_edge(word1, word2, weight=count)

    pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)

    # One trace per edge, because line width is set per trace.
    edge_trace = []
    for source, target, attrs in G.edges(data=True):
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        weight = attrs['weight']

        edge_trace.append(
            go.Scatter(
                x=[x0, x1, None],
                y=[y0, y1, None],
                mode='lines',
                line=dict(width=np.log1p(weight) * 0.5, color='#888'),
                hoverinfo='text',
                text=f"{source} → {target}: {weight}",
                showlegend=False
            )
        )

    node_x = []
    node_y = []
    node_text = []
    node_size = []

    for node in G.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)

        degree = G.degree(node)
        node_size.append(10 + degree * 3)
        node_text.append(f"{node}<br>Connections: {degree}")

    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        mode='markers+text',
        marker=dict(
            size=node_size,
            color='lightblue',
            line=dict(width=2, color='darkblue')
        ),
        text=list(G.nodes()),
        textposition="top center",
        hoverinfo='text',
        hovertext=node_text,
        showlegend=False
    )

    fig = go.Figure(data=edge_trace + [node_trace])

    fig.update_layout(
        title=title,
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        height=600,
        plot_bgcolor='white'
    )

    fig.show()


print("Bigram analysis functions loaded successfully")
print("Available functions:")
print("  - extract_bigrams(tokens, min_freq=5)")
print("  - plot_bigram_network(bigram_df, n_bigrams=50, title='...')")


Bigram analysis functions loaded successfully
Available functions:
  - extract_bigrams(tokens, min_freq=5)
  - plot_bigram_network(bigram_df, n_bigrams=50, title='...')


In [18]:
# Tag every per-topic bigram frame with the topic it came from
all_bigrams = [
    bigram_df.assign(topic=topic)
    for topic, bigram_df in topic_bigrams.items()
]

# Combine all bigrams
combined_df = pd.concat(all_bigrams, ignore_index=True)

# Aggregate counts across topics
aggregated_df = (combined_df
    .groupby(['word1', 'word2'])
    .agg({
        'count': 'sum',
        'topic': lambda x: ' | '.join(sorted(set(x)))  # Keep track of source topics
    })
    .reset_index()
    # groupby orders rows by (word1, word2); re-rank by total count so the
    # frame is ordered like the per-topic frames and "top N" means most frequent.
    .sort_values('count', ascending=False)
    .reset_index(drop=True))

plot_bigram_network(
    bigram_df=aggregated_df,
    n_bigrams=200,
    title="Combined Bigram Network Across All Countries"
)


In [15]:
# Network of the 500 leading bigrams for the 'türkiye' topic; if that topic
# is absent, an empty frame with the expected columns is passed instead.
plot_bigram_network(
    topic_bigrams.get('türkiye', pd.DataFrame(columns=['word1', 'word2', 'count'])),
    500,
    "Bigram Network for Türkiye",
)