In [1]:
!pip install thefuzz rapidfuzz pdfplumber

import os
import re
import unicodedata
from functools import lru_cache

import numpy as np
import pandas as pd
import pdfplumber
from rapidfuzz import process, fuzz as rfuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from thefuzz import fuzz

# ---------------------
# Province (il) list
# ---------------------
# 81 province names, lowercase and ASCII-folded (no Turkish letters), ten per
# row. The order is kept exactly as authored.
iller = """
    adana adiyaman afyonkarahisar agri amasya ankara antalya artvin aydin balikesir
    bilecik bingol bitlis bolu burdur bursa canakkale cankiri corum denizli
    diyarbakir edirne elazig erzincan erzurum eskisehir gaziantep giresun gumushane hakkari
    hatay isparta mersin istanbul izmir kars kastamonu kayseri kirklareli kirsehir
    kocaeli konya kutahya malatya manisa kahramanmaras mardin mugla mus nevsehir
    nigde ordu rize sakarya samsun siirt sinop sivas tekirdag tokat
    trabzon tunceli sanliurfa usak van yozgat zonguldak aksaray bayburt karaman
    kirikkale batman sirnak bartin ardahan igdir yalova karabuk kilis osmaniye duzce
""".split()

# ---------------------
# Province abbreviation / variant mapping
# ---------------------
# Written canonical-name-first so every spelling of a province sits on one row;
# the comprehension flips it into the alias -> province lookup.
il_map = {
    alias: il
    for il, aliases in (
        ("ankara", ("ank",)),
        ("istanbul", ("ist",)),
        ("izmir", ("izm",)),
        ("sanliurfa", ("urfa",)),
        ("mersin", ("içel", "icel")),
        ("sakarya", ("adapazari",)),
        ("kahramanmaras", ("maras",)),
    )
    for alias in aliases
}

@lru_cache(maxsize=100_000)
def normalize_il(word):
    """Map a token to a canonical province name when it looks like one.

    Resolution order:
      1. known abbreviation / variant from ``il_map``;
      2. exact province name from ``iller`` (no fuzzy search needed);
      3. best fuzzy match in ``iller`` with a ``token_sort_ratio`` above 80.
    A token that matches nothing is returned lowercased and stripped.

    The function is called once per token of every address, so results are
    memoized. The cache assumes ``iller`` and ``il_map`` do not change after
    the first call; use ``normalize_il.cache_clear()`` if they do.

    Args:
        word: a single token (str).

    Returns:
        The canonical province name, or the cleaned token itself.
    """
    word = word.lower().strip()
    if word in il_map:
        return il_map[word]
    if word in iller:
        # An exact hit would score 100 against itself anyway; skip the scan.
        return word
    match = process.extractOne(word, iller, scorer=rfuzz.token_sort_ratio)
    if match and match[1] > 80:
        return match[0]
    return word

# ---------------------
# Extract the district (ilçe) list from the PDF
# ---------------------
def parse_ilce_pdf(pdf_path):
    """Build a ``{province: [district, ...]}`` mapping from a PDF's text.

    Every text line is lowercased and folded to ASCII (ç->c, ş->s, ı->i,
    ö->o, ü->u, ğ->g, İ->i) so that keys and district names use the same
    spelling as the ASCII province names in ``iller`` and the tokens produced
    by ``normalize_address``.

    A line made only of letters/spaces with at most three words is taken as a
    province header; any other line is split on commas/whitespace and its
    pieces (longer than one character, de-duplicated, first-seen order) are
    stored under the most recent header.

    NOTE(review): the header test is purely lexical, so a line holding a
    single district name would also be read as a header — confirm against the
    actual PDF layout.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        dict mapping each header line to its list of district tokens.
    """
    tr_to_ascii = str.maketrans('çşıöüğ', 'csioug')
    ilceler = {}
    # Lives outside the page loop: a province whose district list continues on
    # the next page must keep collecting instead of being dropped.
    current_il = None
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue
            for raw_line in text.split("\n"):
                # Swap U+0130 before lower(): str.lower() would expand it to
                # 'i' + U+0307, which the header regex below rejects.
                line = (raw_line.strip()
                                .replace('\u0130', 'i')
                                .lower()
                                .replace('\u0307', '')
                                .translate(tr_to_ascii))
                if not line:
                    continue
                # Line that looks like a province name
                if re.match(r'^[a-zçğıöşü\s]+$', line) and len(line.split()) <= 3:
                    current_il = line
                    ilceler.setdefault(current_il, [])
                    continue
                if current_il is None:
                    # District-looking text before any header: nothing to attach it to.
                    continue
                for p in re.split(r'[,\s]+', line):
                    if len(p) > 1 and p not in ilceler[current_il]:
                        ilceler[current_il].append(p)
    return ilceler

# Hard-coded Kaggle dataset path. The PDF is parsed once when this cell runs;
# normalize_address reads the resulting module-level `ilceler` mapping.
pdf_path = "/kaggle/input/il-ile-mah/Ilce_Listesi.pdf"
ilceler = parse_ilce_pdf(pdf_path)

def normalize_ilce(word, ilceler, il=None):
    """Snap a token onto a known district name when the fuzzy match is strong.

    Args:
        word: token to normalize; it is lowercased and stripped first.
        ilceler: mapping of province -> list of district names.
        il: optional province; when it is a key of ``ilceler`` only that
            province's districts are searched, otherwise every district of
            every province is.

    Returns:
        The best-matching district when its ``token_sort_ratio`` exceeds 80,
        otherwise the cleaned token. With no candidates at all the cleaned
        token is returned without any matching.
    """
    cleaned = word.lower().strip()

    if il and il in ilceler:
        pool = ilceler[il]
    else:
        # Unknown or missing province: search across all provinces.
        pool = []
        for districts in ilceler.values():
            pool.extend(districts)

    if pool:
        best = process.extractOne(cleaned, pool, scorer=rfuzz.token_sort_ratio)
        if best and best[1] > 80:
            return best[0]
    return cleaned

# ---------------------
# Address normalization
# ---------------------
def normalize_address(text: str) -> str:
    """Normalize a raw Turkish address into a space-joined token string.

    Steps, in order:
      1. missing values (``pd.isna``) become ``''``;
      2. lowercase and fold Turkish letters to ASCII;
      3. expand street-type / unit abbreviations to one canonical word each
         (mahallesi, sokak, cadde, bulvar, numara, apartman, daire);
      4. replace punctuation with spaces and collapse whitespace;
      5. pass every token through ``normalize_il``; the last token that
         resolves to a name in ``iller`` becomes the detected province;
      6. pass every token through ``normalize_ilce``, restricted to the
         detected province when there is one.

    Args:
        text: raw address, or a missing value.

    Returns:
        The normalized address; ``''`` for missing or empty input.
    """
    if pd.isna(text):
        return ''
    # 'İ'.lower() is 'i' + U+0307 (combining dot). The punctuation pass below
    # treats that mark as a non-word character and would split the word in
    # two ("i stanbul"), so map U+0130 up front and drop any stray U+0307.
    text = str(text).replace('\u0130', 'i').lower().replace('\u0307', '')
    text = (text.replace('ç','c')
                .replace('ş','s')
                .replace('ı','i')
                .replace('ö','o')
                .replace('ü','u')
                .replace('ğ','g'))

    # (replacement, whole-word spellings, single letters that need a dot).
    # Dotted forms of the whole words ("mah.", "cad.") need no entry of their
    # own: the bare word matches and the dot is stripped with the punctuation.
    abbreviation_rules = (
        (' mahallesi ', 'mah|mahallesi|mahalle|mh|mhl|mahlesi|mahal|mhallesi|mahellesi|mahelle|maallesi', 'm'),
        (' sokak ', 'sok|sokak|sk|skk|sokagi|soka|sokar|soklar|so|sokalar|sokk', ''),
        (' cadde ', 'cad|cadde|cd|caddesi|cadd|cde|cads|cds|cadessi|caddee|cadesi', ''),
        (' bulvar ', 'blv|bulvar|bulv|bulvr|bulw|bulwar|bulbar|blvr|bulvri|bulvari|bulvarii|bulvaa|bulver', ''),
        (' numara ', 'no|num|numara|numarasi|numr|nmr|nmra|nmara|nomara|numraa|nomra', 'n'),
        (' apartman ', 'apt|ap|apartman|apart|aprt|aprtmn|apartmn|apartmnt|apartm|apartmani|apartmni|apartaman|apartmenn', ''),
        (' daire ', 'daire|da|dair|dairsi|dairisi|dairler|kat|kati|katt', 'd|k'),
    )
    for replacement, words, dotted_letters in abbreviation_rules:
        alternatives = words
        if dotted_letters:
            # A lookahead instead of `\.` + `\b`: the old form only matched
            # when a word character followed the dot, so "m. 5" or "d. 3"
            # were never expanded while "m.5" was.
            alternatives += r'|(?:' + dotted_letters + r')(?=\.)'
        text = re.sub(r'\b(?:' + alternatives + r')\b', replacement, text)

    # One pass is enough: '/', '(', ')' and '\' are already non-word characters.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = text.split()
    normalized_tokens = []
    detected_il = None

    # Province normalization
    for tok in tokens:
        norm_il = normalize_il(tok)
        if norm_il in iller:
            detected_il = norm_il
            normalized_tokens.append(norm_il)
        else:
            normalized_tokens.append(tok)

    # District normalization
    final_tokens = [
        normalize_ilce(tok, ilceler, il=detected_il)
        for tok in normalized_tokens
    ]

    return " ".join(final_tokens)

# ---------------------
# Load train / test data
# ---------------------
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/hepsiburada-hackathon-kaggle-etabi/train.csv",
        "/kaggle/input/hepsiburada-hackathon-kaggle-etabi/test.csv",
    )
)

# Both splits go through the same normalizer so their tokens stay comparable.
for frame in (train, test):
    frame["clean_address"] = frame["address"].apply(normalize_address)

# ---------------------
# TF-IDF + Nearest Neighbors (memory-friendly settings)
# ---------------------
# Plain arrays for positional lookups inside the re-ranking loop.
train_normalized_arr = train['clean_address'].to_numpy()
test_normalized_arr = test['clean_address'].to_numpy()
labels = train["label"].astype("int32").to_numpy()

vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(2, 4),   # 4 instead of 5
    max_features=8000,    # 8000 instead of 15000
    sublinear_tf=True,
    min_df=3,             # skip rare n-grams
    dtype=np.float32,
)

# The vocabulary is learned from the training addresses only.
X_train = vectorizer.fit_transform(train["clean_address"])
X_test = vectorizer.transform(test["clean_address"])

# fit() returns the estimator itself, so construction and fitting can be chained.
nn_model = NearestNeighbors(
    n_neighbors=30,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
).fit(X_train)

batch_size = 2000   # smaller batch

preds = []
n_test_rows = X_test.shape[0]

for start in range(0, n_test_rows, batch_size):
    end = min(start + batch_size, n_test_rows)

    # 30 nearest training addresses (cosine distance) for each row of the batch.
    distances, indices = nn_model.kneighbors(X_test[start:end])

    for row, (neighbor_distances, neighbor_ids) in enumerate(zip(distances, indices)):
        query_text = test_normalized_arr[start + row]

        top_score = -1
        top_label = None

        # Re-rank the neighbours with a blend of TF-IDF similarity and fuzzy
        # token overlap; the first candidate with the highest blend wins.
        for distance, train_pos in zip(neighbor_distances, neighbor_ids):
            candidate_text = train_normalized_arr[train_pos]

            cosine_similarity = 1 - distance

            sort_ratio = fuzz.token_sort_ratio(query_text, candidate_text) / 100
            set_ratio = fuzz.token_set_ratio(query_text, candidate_text) / 100
            lexical_score = (sort_ratio * 0.4) + (set_ratio * 0.6)

            blended_score = (cosine_similarity * 0.6) + (lexical_score * 0.4)

            if blended_score > top_score:
                top_score = blended_score
                top_label = labels[train_pos]

        preds.append(top_label)

submission = pd.DataFrame({"id": test["id"], "label": preds})
submission.to_csv("submission.csv", index=False)


Collecting thefuzz
  Downloading thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━