In [None]:
# --- 1) Import Libraries and set file paths for EDA ---
import json
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
# import zeyrek #TODO: later analyze for morphological features

sns.set_theme()
DATA_PATH = Path("../datasets/countries.json") # includes more than 10 country entries from eksisozluk



In [16]:
target_urls = [
  "https://eksisozluk.com/fransa--46687",
  "https://eksisozluk.com/rusya--43748",
  "https://eksisozluk.com/cin--2099924",
  "https://eksisozluk.com/kanada--58585",
  "https://eksisozluk.com/suudi-arabistan--128684",
  "https://eksisozluk.com/yunanistan--39870",
  "https://eksisozluk.com/sili--64696",
  "https://eksisozluk.com/ermenistan--111737",
  "https://eksisozluk.com/iran--45394",
  "https://eksisozluk.com/israil--68980",
  "https://eksisozluk.com/somali--129500",
  "https://eksisozluk.com/cezayir--114173",
  "https://eksisozluk.com/kazakistan--55971",
  "https://eksisozluk.com/mogolistan--71457",
  "https://eksisozluk.com/lihtenstayn--385546",
  "https://eksisozluk.com/yeni-zelanda--86544",
  "https://eksisozluk.com/angola--167447",
  "https://eksisozluk.com/venezuela--61614",
  "https://eksisozluk.com/el-salvador--96743",
  "https://eksisozluk.com/vietnam--56902",
  "https://eksisozluk.com/japonya--50310",
  "https://eksisozluk.com/kuzey-kore--66320",
  "https://eksisozluk.com/guney-kore--62870",
  "https://eksisozluk.com/filistin--97065",
  "https://eksisozluk.com/ukrayna--80137",
  "https://eksisozluk.com/sirbistan--144150",
  "https://eksisozluk.com/gurcistan--91933",
  "https://eksisozluk.com/makedonya--93047",
  "https://eksisozluk.com/bosna-hersek--127685",
  "https://eksisozluk.com/isvicre--35513",
  "https://eksisozluk.com/irak--55600",
  "https://eksisozluk.com/kurdistan--240029",
  "https://eksisozluk.com/kibris--36897",
  "https://eksisozluk.com/suriye--57388",
  "https://eksisozluk.com/pakistan--42073",
  "https://eksisozluk.com/hindistan--42072"
]

In [18]:
# --- 2) Extract the numeric ids from each URL ---

TARGET_IDS = [url.split("--")[-1] for url in target_urls]
print(f'Target IDs: {TARGET_IDS}')

Target IDs: ['46687', '43748', '2099924', '58585', '128684', '39870', '64696', '111737', '45394', '68980', '129500', '114173', '55971', '71457', '385546', '86544', '167447', '61614', '96743', '56902', '50310', '66320', '62870', '97065', '80137', '144150', '91933', '93047', '127685', '35513', '55600', '240029', '36897', '57388', '42073', '42072']


In [22]:
# --- 3) Load JSON data from file and verify successful loading ---
with DATA_PATH.open() as f:
    raw = json.load(f)
    if not raw:
        raise ValueError("Failed to load JSON data from file.")
    else:
        print("JSON data loaded successfully.")


JSON data loaded successfully.


In [23]:
# --- 4) Explore the top-level structure of the JSON data ---
raw = {tid: blob for tid, blob in raw.items() if tid in TARGET_IDS}
print(f'Filtered raw data to include only target IDs. The length: {len(raw)}')


Filtered raw data to include only target IDs. The length: 36


In [None]:
# --- 5) Structure and Sample Data Inspection ---

records = []

for topic_id, topic_blob in raw.items():
    topic_meta = topic_blob.get("topic", {})
    entries = topic_blob.get("entries", [])
    
    for entry in entries:
        row = {
            'topic_id': topic_id,
            'topic_title': topic_meta.get('title', ''),
            'topic_url': topic_meta.get('url', ''),
            'entry_count_reported': topic_meta.get('entry_count_reported', 0),
            'entry_id': entry.get('id', ''),
            'author': entry.get('author', ''),
            'favorites': entry.get('favorites', 0),
            'created_at': entry.get('created_at', ''),
            'created_at_raw': entry.get('created_at_raw', 0),
            'text': entry.get('text', ''),
            'entry_url': entry.get('url', '')
            
        }
        records.append(row)
        
df =pd.DataFrame(records)
print(f'DataFrame shape: {df.shape}')
print(df.head())

DataFrame shape: (57470, 11)
  topic_id topic_title                             topic_url  \
0    46687      fransa  https://eksisozluk.com/fransa--46687   
1    46687      fransa  https://eksisozluk.com/fransa--46687   
2    46687      fransa  https://eksisozluk.com/fransa--46687   
3    46687      fransa  https://eksisozluk.com/fransa--46687   
4    46687      fransa  https://eksisozluk.com/fransa--46687   

   entry_count_reported entry_id    author  favorites           created_at  \
0                     0               tazz          0  1999-11-07T18:59:00   
1                     0              encre          0                 None   
2                     0            suicyco          0  2000-05-30T21:54:00   
3                     0               down          0                 None   
4                     0           chuinase          0  2002-04-18T12:24:00   

                        created_at_raw  \
0                     07.11.1999 18:59   
1  20.05.2000 19:17 ~ 19.11.2003 

In [32]:
# --- 6) Inspect Structure and Missing Values ---

df.info()
df.isna().mean().sort_values(ascending=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57470 entries, 0 to 57469
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   topic_id              57470 non-null  object
 1   topic_title           57470 non-null  object
 2   topic_url             57470 non-null  object
 3   entry_count_reported  57470 non-null  int64 
 4   entry_id              57470 non-null  object
 5   author                57470 non-null  object
 6   favorites             57470 non-null  int64 
 7   created_at            42677 non-null  object
 8   created_at_raw        57470 non-null  object
 9   text                  57470 non-null  object
 10  entry_url             57470 non-null  object
dtypes: int64(2), object(9)
memory usage: 4.8+ MB


created_at              0.257404
topic_id                0.000000
topic_title             0.000000
topic_url               0.000000
entry_count_reported    0.000000
entry_id                0.000000
author                  0.000000
favorites               0.000000
created_at_raw          0.000000
text                    0.000000
entry_url               0.000000
dtype: float64

In [35]:
print("entries:", df.shape[0])
print("topics:", df["topic_id"].nunique())
print("authors:", df["author"].nunique())
print("duplicate entry_ids:", df["entry_id"].duplicated().sum())

df["text_len"] = df["text"].str.len()
df["word_count"] = df["text"].str.split().str.len()

df[["favorites", "text_len", "word_count"]].describe()


entries: 57470
topics: 36
authors: 19672
duplicate entry_ids: 57469


Unnamed: 0,favorites,text_len,word_count
count,57470.0,57470.0,57470.0
mean,0.0,598.140056,79.833043
std,0.0,1539.318133,204.043316
min,0.0,1.0,1.0
25%,0.0,104.0,13.0
50%,0.0,226.0,30.0
75%,0.0,538.0,73.0
max,0.0,81121.0,10170.0


In [None]:
def tokenize(text): return [tok for tok in re.findall(r"[a-züğışöçâîû]+(?:'[a-züğışöçâîû]+)?", text.lower()) if len(tok) > 2]


# apply stop words 
tokens = df['text'].dropna().map(tokenize)
unigram_counts = Counter(tok for toks in tokens for tok in toks)
pd.Series(unigram_counts).sort_values(ascending=False).head(20)
bigram_counts = Counter(
    (toks[i], toks[i + 1]) 
    for toks in tokens 
    for i in range(len(toks) - 1)
)
pd.Series(bigram_counts).sort_values(ascending=False).head(20)

# we have a lot of URLs in the text data; extract and analyze them

bir       ülke       3835
http      www        3138
bir       şey        3066
https     www        2952
bir       şekilde    2168
gibi      bir        2065
büyük     bir        1820
olan      ülke       1763
başka     bir        1633
böyle     bir        1599
daha      fazla      1434
herhangi  bir        1339
çok       daha       1320
bir       devlet     1240
olduğu    için       1221
olan      bir        1200
bir       yer        1091
youtube   com        1062
diye      bir        1045
www       youtube    1008
dtype: int64

In [23]:
url_pattern = re.compile(r'https?://[^\s]+|www\.[^\s]+')
df['urls'] = df['text'].apply(lambda x: url_pattern.findall(str(x)) if pd.notna(x) else [])
df['has_url'] = df['urls'].apply(lambda x: len(x) > 0)
print(f"Entries with URLs: {df['has_url'].sum()} ({df['has_url'].mean()*100:.2f}%)")
df[df['has_url']].head(10)[['text', 'urls']]


Entries with URLs: 8719 (13.83%)


Unnamed: 0,text,urls
181,(bkz:\nmhp'nin akpkk tweeti\n)\nhttps://x.com/...,[https://x.com/mhp_bilgi/status/43550818577732...
518,https://x.com/…rbandicom/status/19791015344419...,[https://x.com/…rbandicom/status/1979101534441...
733,"bu kişinin hasta olabileceğini, tıbbi müdahale...",[https://www.bbc.com/…rkce/articles/cml41e1g00...
738,bana şunu hatırlattı.\nhttps://youtu.be/uhuj1e...,[https://youtu.be/uhuj1e_ti_q?si=y87gfuky8jhow...
1251,nereden nereye geldi türkiye diye bahçeli'nin ...,[https://www.youtube.com/shorts/y4jqf8mnga4]
1334,daha önce de yazmıştım.\nhttps://api.whatsapp....,[https://api.whatsapp.com/send?phone=905xxxxxx...
1565,baraj doluluk oranı %0 olmuş. yani bursa'da ba...,[https://x.com/…=twsrc^google|twcamp^serp|twgr...
1598,çok acı.. alıştırılmaya çalıştırıldığımız şeyi...,[https://www.buski.gov.tr/…rununu-kokunden-coz...
2010,"fatih terim'e de sordular röportajda, bazen ol...",[https://youtu.be/oikw6p5ryho?si=nyo2svuzczsna...
2289,göcekten selamlar\nhttps://eksiup.com/p/2a4419...,[https://eksiup.com/p/2a44197hckz2]
