# Data Pre-Processing

### 🛠 Installing Necessary Libraries

In [1]:
!pip install pandas nltk beautifulsoup4 TurkishStemmer



In [2]:
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
from TurkishStemmer import TurkishStemmer

stop_words = set(stopwords.words('turkish'))  # veya 'english' vs.
stemmer = TurkishStemmer()  # Türkçe destekli bir stemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eyupdalan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 🧹 İçeriği Temizleme Fonksiyonu

In [3]:
def clean_html_content(html):
    # HTML etiketlerini kaldır
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text(separator=' ', strip=True)
    
    # Küçük harf, sayılar ve noktalama işaretlerini temizle
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # sayılar
    text = re.sub(r'[^\w\s]', '', text)  # noktalama
    text = re.sub(r'\s+', ' ', text)  # fazlalık boşluklar

    # Stopword çıkarımı ve kök bulma
    tokens = text.split()
    filtered = [stemmer.stem(w) for w in tokens if w not in stop_words]
    
    return ' '.join(filtered)

### 🧪 Örnek Satırları İşleyip Gözlemleyelim

In [4]:
conn = sqlite3.connect('../websites.db')
df = pd.read_sql_query("SELECT id, html FROM pages LIMIT 10", conn)

df['clean_text'] = df['html'].apply(clean_html_content)
df[['id', 'clean_text']].head()

Unnamed: 0,id,clean_text
0,1,zonguldak hırsızlık zan dk bekleyip paspa çalt...
1,2,destination x wer hat gewonnen wer flog im fin...
2,3,boeing unsicherheit drückt den kurs um di voll...
3,4,commerzbank akti news commerzbank wird am mitt...
4,5,muğlaspor transfer sürüyor muğl haber ar takip...


### Tabloya clean_text kolonu ekleme

In [7]:
cursor = conn.cursor()

cursor.execute("ALTER TABLE pages ADD COLUMN clean_text TEXT")
conn.commit()

OperationalError: duplicate column name: clean_text

### 💾 Tüm Veriyi Temizleyip Kaydetmek (Opsiyonel)

In [5]:
cursor = conn.cursor()
df = pd.read_sql_query("SELECT id, html FROM pages", conn)

for idx, row in df.iterrows():
    cleaned = clean_html_content(row['html'])
    cursor.execute("UPDATE pages SET clean_text = ? WHERE id = ?", (cleaned, row['id']))
    if(idx + 1) % 100 == 0:
        print(f"Processed {idx + 1} rows")
        conn.commit()


Processed 100 rows
Processed 200 rows
Processed 300 rows
Processed 400 rows
Processed 500 rows
Processed 600 rows
Processed 700 rows
Processed 800 rows
Processed 900 rows
Processed 1000 rows
Processed 1100 rows
Processed 1200 rows
Processed 1300 rows
Processed 1400 rows
Processed 1500 rows
Processed 1600 rows
Processed 1700 rows
Processed 1800 rows
Processed 1900 rows
Processed 2000 rows
Processed 2100 rows
Processed 2200 rows
Processed 2300 rows
Processed 2400 rows
Processed 2500 rows
Processed 2600 rows
Processed 2700 rows
Processed 2800 rows
Processed 2900 rows
Processed 3000 rows
Processed 3100 rows
Processed 3200 rows
Processed 3300 rows
Processed 3400 rows
Processed 3500 rows
Processed 3600 rows
Processed 3700 rows
Processed 3800 rows
Processed 3900 rows
Processed 4000 rows
Processed 4100 rows
Processed 4200 rows
Processed 4300 rows
Processed 4400 rows
Processed 4500 rows
Processed 4600 rows
Processed 4700 rows
Processed 4800 rows
Processed 4900 rows
Processed 5000 rows
Processed

  k = self.parse_starttag(i)


Processed 8400 rows
Processed 8500 rows
Processed 8600 rows
Processed 8700 rows
Processed 8800 rows
Processed 8900 rows
Processed 9000 rows
Processed 9100 rows
Processed 9200 rows
Processed 9300 rows
Processed 9400 rows
Processed 9500 rows
Processed 9600 rows
Processed 9700 rows
Processed 9800 rows
Processed 9900 rows
Processed 10000 rows
Processed 10100 rows
Processed 10200 rows
Processed 10300 rows
Processed 10400 rows
Processed 10500 rows
Processed 10600 rows
Processed 10700 rows
Processed 10800 rows
Processed 10900 rows
Processed 11000 rows
Processed 11100 rows
Processed 11200 rows
Processed 11300 rows
Processed 11400 rows
Processed 11500 rows
Processed 11600 rows
Processed 11700 rows
Processed 11800 rows
Processed 11900 rows
Processed 12000 rows
Processed 12100 rows
Processed 12200 rows
Processed 12300 rows
Processed 12400 rows
Processed 12500 rows
Processed 12600 rows
Processed 12700 rows
Processed 12800 rows
Processed 12900 rows
Processed 13000 rows
Processed 13100 rows
Processe