In [1]:
import pandas as pd 
import numpy as np 
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df1 = pd.read_csv("bbc.csv")
df2 = pd.read_csv("diken.csv")
df3 = pd.read_csv("yenicag.csv")
df4 = pd.read_csv("odatv.csv")

##Tüm farklı haber sitelerinden çekilen veriler birleştirildi.

data= pd.concat([df1, df2, df3, df4], ignore_index=True)
data = data.sample(frac=1).reset_index(drop=True)
data.to_csv("Haberler.csv", index=False)

In [3]:
#Haberler küçük harflere çevrildi.
data["haber_basligi"]=data["haber_basligi"].apply(lambda comment:comment.lower())

In [4]:
#Noktalamalar atıldı.
def noktalama_at(haber):
    temiz_haber= []
    for char in haber:
        if char not in string.punctuation and char!="’":
            temiz_haber.append(char)
        elif char=="'" or char=="’":
            temiz_haber.append(" ")
    return "".join(temiz_haber)


In [5]:
data["haber_basligi"]=data["haber_basligi"].apply(noktalama_at)

In [6]:
##Stopwordlere ekleme yapıldı.
turkce_stopwords=stopwords.words('turkish')
turkce_stopwords.extend(["nda","nde","u","mi","ni","nu","nin","nın","nun","in","ın","un","ye","a","e","ı","i","den","ten","dan","tan","ta"])

In [None]:
turkce_stopwords

In [8]:
##Stopwords ve kesme işareti ile ayrılan ekler temizlendi
def stopwords_temizle(haber):
    tokens = nltk.word_tokenize(haber)
    temiz_tokens = [i for i in tokens if i not in turkce_stopwords]
    return " ".join(temiz_tokens)


In [9]:
data["haber_basligi"]=data["haber_basligi"].apply(stopwords_temizle)

In [10]:
#Haber kategorilerinin dağılımı 0:Teknoloji, 1:Ekonomi, 2:Spor
data['label'].value_counts()

label
1    2382
0    2177
2    2160
Name: count, dtype: int64

In [11]:
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X = tfidf.fit_transform(data["haber_basligi"])
y = data["label"]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=37)

In [13]:
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

In [14]:
##MultinomialNB Model Sonucu
nb_pred = nb_model.predict(X_test)
print("Doğruluk Oranı:", accuracy_score(y_test, nb_pred))
print("Sınıflandırma Raporu:\n", classification_report(y_test, nb_pred))

Doğruluk Oranı: 0.9069940476190477
Sınıflandırma Raporu:
               precision    recall  f1-score   support

           0       0.87      0.91      0.89       434
           1       0.90      0.86      0.88       479
           2       0.94      0.95      0.95       431

    accuracy                           0.91      1344
   macro avg       0.91      0.91      0.91      1344
weighted avg       0.91      0.91      0.91      1344



In [15]:
from sklearn.svm import LinearSVC

In [16]:
#SVM Model Eğitimi
svc_model = LinearSVC()
svc_model.fit(X_train, y_train)

In [17]:
##LinearSVC Model Sonucu
svc_pred = svc_model.predict(X_test)
print("Doğruluk Oranı:", accuracy_score(y_test, svc_pred))
print("Sınıflandırma Raporu:\n", classification_report(y_test, svc_pred))

Doğruluk Oranı: 0.8973214285714286
Sınıflandırma Raporu:
               precision    recall  f1-score   support

           0       0.86      0.89      0.88       434
           1       0.88      0.87      0.87       479
           2       0.96      0.94      0.95       431

    accuracy                           0.90      1344
   macro avg       0.90      0.90      0.90      1344
weighted avg       0.90      0.90      0.90      1344



In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
#LogisticRegression Model Eğitimi
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

In [20]:
##LogisticRegression Model Sonucu
lr_pred = lr_model.predict(X_test)
print("Doğruluk Oranı:", accuracy_score(y_test, lr_pred))
print("Sınıflandırma Raporu:\n", classification_report(y_test, lr_pred))

Doğruluk Oranı: 0.8980654761904762
Sınıflandırma Raporu:
               precision    recall  f1-score   support

           0       0.88      0.89      0.88       434
           1       0.87      0.88      0.87       479
           2       0.95      0.93      0.94       431

    accuracy                           0.90      1344
   macro avg       0.90      0.90      0.90      1344
weighted avg       0.90      0.90      0.90      1344

