In [1]:
!pip install nltk
!pip install openpyxl

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (781 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m781.7/781.7 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: regex, nltk
Successfully installed nltk-3.9.1 regex-2024.11.6
Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.

### NLTK Modülü Kullanarak "Stop Word"('ve', 'ile', 'gibi', ...) Olan Kelimeleri Belirleme

In [2]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('turkish'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Veri Ön İşleme
+ Metni Küçük Harflere Çevirme
+ Noktalama İşaretleri, Rakamlar ve Sembolleri Çıkarma
+ Stop Word Olan Kelimeleri Çıkarma

In [3]:
def preprocess_text(text):
    text = text.lower()
    text = ''.join([c for c in text if c.isalpha() or c.isspace()])
    words = text.split()
    words = [w for w in words if not w in stop_words]
    return ' '.join(words)

### Veri Setini Pandas DataFrame'ine Yükleme

In [4]:
import pandas as pd

data = pd.read_excel('gazete.xlsx')
data['cleaned_text'] = data['icerik'].apply(preprocess_text)

### Eğitim ve Test Verisini Ayırma

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['kategori'], stratify=data['kategori'])

### TF-IDF Vectorizer ile Haber Başlıklarını Sayısal Vektörlere Dönüştürme

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

### Multinomial Naive Bayes Algoritması ile Modeli Eğitme

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

### Test Verisi ile Model Metriklerinin Ölçülmesi

In [8]:
y_pred = model.predict(X_test_tfidf)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8666666666666667
              precision    recall  f1-score   support

       dunya       0.81      0.88      0.84       258
kultur-sanat       0.85      0.80      0.82       226
     magazin       0.84      0.89      0.86       275
     siyaset       0.85      0.90      0.87       255
        spor       0.98      0.91      0.94       222
   teknoloji       0.95      0.78      0.86       159

    accuracy                           0.87      1395
   macro avg       0.88      0.86      0.87      1395
weighted avg       0.87      0.87      0.87      1395



### Örnek Tahminde Bulunma

In [9]:
new_text = "Hadise isyan etti: Beni asla alaşağı edemezsin"
cleaned_text = preprocess_text(new_text)
new_text_tfidf = vectorizer.transform([cleaned_text])
prediction = model.predict(new_text_tfidf)
print("Tahmin edilen etiket:", prediction[0])

Tahmin edilen etiket: magazin


### Modeli Kaydetme

In [10]:
import joblib

joblib.dump(model, 'NB_topic_classification_model.pkl')
joblib.dump(vectorizer, 'tf_idf_vectorizer.pkl')

['tf_idf_vectorizer.pkl']

### Modeli Google Bucket Servisine Kaydetme

In [12]:
from google.cloud import storage

client = storage.Client()

bucket = client.bucket('ml-project-bucket2341')
bucket.blob('model/NB_topic_classification_model.pkl').upload_from_filename('NB_topic_classification_model.pkl')
bucket.blob('model/tf_idf_vectorizer.pkl').upload_from_filename('tf_idf_vectorizer.pkl')