### **Libraries:**

In [14]:
!pip install plot-metric
!pip install xgboost
!pip install TurkishStemmer
!pip install nltk
!pip install xlwt
!pip install sklearn-genetic-opt
!pip install sklearn-genetic-opt[all]

from sklearn import metrics, tree, svm, preprocessing
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from TurkishStemmer import TurkishStemmer
from IPython.display import display, HTML
from sklearn_genetic import GASearchCV, ExponentialAdapter
from sklearn_genetic.space import Continuous, Categorical, Integer
from sklearn_genetic.plots import plot_fitness_evolution
import pandas as pd
import numpy as np
import os
import re
import nltk
import pickle
import openpyxl
import statistics
import matplotlib
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from datetime import datetime
from pytz import timezone
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt
import seaborn as sns

# Base directory for the training workbook and the pickled artefacts.
# '.' is the current working directory, which is also where the prediction
# cell reads its input file and writes its output; an empty prefix followed by
# a "/" separator would point at the filesystem root instead.
folder = '.'



### **Functions:**

In [34]:
# Name of the active dataset; it is embedded in the training/test workbook names and in the pickle file names.
veriseti="haberler"

# Shows which words contribute most to the decision for the given category
def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    """Print the ``n`` features with the largest coefficients for ``classlabel``.

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()``.
        classifier: fitted classifier exposing ``classes_`` and ``coef_``
            (one coefficient row per class, or a single row for a
            two-class model).
        classlabel: the class whose strongest features are printed.
        n: how many features to print; values <= 0 print nothing.

    Raises:
        ValueError: if ``classlabel`` is not in ``classifier.classes_``.

    Output is one line per feature (``classlabel feature coefficient``),
    in ascending coefficient order, so the strongest feature comes last.
    """
    class_labels = list(classifier.classes_)
    labelid = class_labels.index(classlabel)
    feature_names = vectorizer.get_feature_names_out()
    coefficients = classifier.coef_

    if len(class_labels) == 2 and len(coefficients) == 1:
        # Two-class linear models keep a single coefficient row that describes
        # classes_[1]; the weights for classes_[0] are its negation. Indexing
        # coef_[labelid] would raise for label 1 and mislead for label 0.
        row = coefficients[0]
        weights = row if labelid == 1 else [-value for value in row]
    else:
        weights = coefficients[labelid]

    ranked = sorted(zip(weights, feature_names))
    # A plain ranked[-n:] would return the whole list for n == 0.
    topn = ranked[-n:] if n > 0 else []

    for coef, feat in topn:
        print(classlabel, feat, coef)

# Pre-processing stage for the data
def prepare_data(data,text_column=0, add_two_grams=False):
    """Turn one text column of ``data`` into a list of cleaned, stemmed sentences.

    For every row the text is stripped of non-word characters, lower-cased
    with the Turkish dotted/dotless I handled explicitly, split on whitespace,
    filtered (Turkish stop words and non-alphabetic tokens are dropped) and
    stemmed with ``TurkishStemmer``. Digits and single ASCII letters are then
    removed from the re-joined sentence.

    Args:
        data: pandas DataFrame holding the documents.
        text_column: column to read; treated as a column label when such a
            label exists, otherwise as a positional column index.
        add_two_grams: when True, after each kept word the stem of
            ``previous_kept_word + word`` (concatenated without a separator)
            is appended as well. For the first kept word the previous word is
            empty, so its stem is appended twice.

    Returns:
        list[str] with one entry per row, in row order. Rows that fail to
        process (e.g. a non-string cell) yield ``" "`` and the error is printed.

    Side effects:
        Calls ``nltk.download('stopwords')`` on every invocation.
    """
    nltk.download('stopwords')
    stemmer = TurkishStemmer()
    # Read the stop-word list once; looking it up per token re-reads the
    # corpus file and scans a list for every single word.
    turkish_stopwords = set(stopwords.words('turkish'))

    corpus = []
    if len(data) == 0:
        return corpus

    # Resolve the column once: label first, position as the fallback.
    if text_column in data.columns:
        texts = data[text_column]
    else:
        texts = data.iloc[:, text_column]

    for i, x in enumerate(texts):
        try:
            sentence = re.sub(r"\W+", " ", x)
            # str.lower() maps "I" to "i" and mangles "İ"; fix both for Turkish first.
            sentence = sentence.replace("İ","i")
            sentence = sentence.replace("I","ı")
            sentence = sentence.lower()
            word_list = sentence.split()
            stemmed_word_list = []

            prevWord = "" # for two_grams

            for word in word_list:
                if word in turkish_stopwords or not word.isalpha():
                    continue
                try:
                    stemmed_word_list.append(stemmer.stem(word))

                    if add_two_grams:
                        phrase = prevWord + word
                        stemmed_word_list.append(stemmer.stem(phrase))
                        prevWord = word
                except Exception:
                    # Best effort: a token the stemmer cannot handle is skipped
                    # rather than failing the whole document.
                    continue

            joined = " ".join(stemmed_word_list)
            without_digits = re.sub(r"\d", "", joined)
            without_single_letters = re.sub(r'\b[a-zA-Z]{1}\b', " ", without_digits)
            new_sentence = re.sub(r"\W+", " ", without_single_letters)
            corpus.append(new_sentence)
        except Exception  as ex:
            # Keep the corpus aligned with the rows: emit a blank placeholder.
            corpus.append(" ")
            print("err docid:"+str(i)+":"+str(x)+"---")
            print(ex)
    return(corpus)

# Builds a result report for the given dataset and input file, either by loading the model named by model_adi from disk or by using the supplied model object, together with the saved CountVectorizer
def sonuc_hazirla(veriseti, test_dosyasi_adi, model_adi="", model=None):
  """Predict a label for every row of an Excel file and return the report.

  Args:
    veriseti: dataset name; selects the pickled vectorizer/model/label encoder.
    test_dosyasi_adi: path of the Excel workbook to classify; its first
      column is used as the text.
    model_adi: class name of the model whose pickle should be loaded when
      ``model`` is not given. When a model object is given and this is empty,
      the object's class name is used instead.
    model: an already fitted model; when None the model is loaded from disk.

  Returns:
    DataFrame: the input rows with an extra ``Predictions`` column.

  Notes:
    Artefacts are looked up under the module-level ``folder`` prefix.
    ``pickle.load`` can execute arbitrary code, so only load files that were
    written by this notebook.
  """
  input_data = pd.read_excel(test_dosyasi_adi)

  with open(os.path.join(folder, "ai_vectorizer_" + veriseti + ".pkl"), 'rb') as vectorizer_file:
    cv = pickle.load(vectorizer_file)

  if model is None:
    with open(os.path.join(folder, "ai_model_" + veriseti + "_" + model_adi + ".pkl"), 'rb') as model_file:
      model = pickle.load(model_file)
  elif not model_adi:
    # Lets a directly supplied XGBClassifier get its predictions decoded below.
    model_adi = model.__class__.__name__

  input_data_corpus = prepare_data(input_data, add_two_grams=False)

  # Reuse the fitted vectorizer instead of rebuilding one from its vocabulary,
  # and densify the matrix because the models were trained on dense arrays:
  # some estimators reject sparse input, and XGBoost treats entries absent
  # from a sparse matrix as missing values rather than as zeros.
  X_test_gercek = cv.transform(input_data_corpus).toarray()

  predictions = model.predict(X_test_gercek)

  if model_adi == "XGBClassifier":
    # XGBClassifier was trained on integer-encoded labels; map them back.
    with open(os.path.join(folder, "ai_labelencoder_" + veriseti + "_XGBClassifier.pkl"), 'rb') as encoder_file:
      label_encoder = pickle.load(encoder_file)
    predictions = label_encoder.inverse_transform(predictions)

  predictions_df = pd.DataFrame(predictions, columns=['Predictions'])

  rapor = pd.concat([input_data, predictions_df], axis=1)

  return(rapor)

# Pickle helpers used by model_build
def _save_pickle(obj, path):
  """Serialise ``obj`` to ``path`` and close the file handle afterwards."""
  with open(path, "wb") as handle:
    pickle.dump(obj, handle)


def _load_pickle(path):
  """Load a pickled object from ``path`` (only use on files written by this notebook)."""
  with open(path, "rb") as handle:
    return pickle.load(handle)


# Builds the corpus, trains the model, runs cross validation and performs hyperparameter search
def model_build(veriseti, model, altveriseti='data', prepare_corpus=False, add_two_grams=False, build_model=True, make_crossval=False, folds_for_cv=5, make_genetic_opt=False, make_gridsearch_opt=False, make_randomized_opt=False, param_grid=None):
  """Train (and optionally tune) ``model`` on one sheet of the training workbook.

  Args:
    veriseti: dataset name; selects ``training_data_<veriseti>.xlsx`` and
      names every pickle that is written.
    model: unfitted estimator; its class name is used in file names.
    altveriseti: sheet of the workbook to read. The sheet must have two
      columns, which are renamed to ``baslik`` (text) and ``etiket`` (label).
    prepare_corpus: True rebuilds the corpus with ``prepare_data`` and caches
      it; False loads the cached corpus. The cache file name contains only
      ``veriseti``, so it is shared between sheets.
    add_two_grams: forwarded to ``prepare_data``.
    build_model: when False only the corpus step runs.
    make_crossval: run stratified k-fold cross validation after a plain fit.
    folds_for_cv: number of folds for every cross validation used here.
    make_genetic_opt / make_gridsearch_opt / make_randomized_opt: pick one
      search strategy (checked in this order); with none set a single model
      is fitted and evaluated on a 20 % hold-out split.
    param_grid: search space for the selected strategy.

  Returns:
    The fitted search object when a search strategy ran, otherwise ``model``
    (fitted unless ``build_model`` is False).

  Side effects:
    Writes the corpus, vectorizer, label encoder (XGBClassifier only) and
    model pickles under the module-level ``folder`` prefix, prints metrics and
    timing, and draws a confusion matrix for the plain-fit path.
  """
  model_adi = model.__class__.__name__
  training_data = pd.read_excel(os.path.join(folder, "training_data_" + veriseti + ".xlsx"), sheet_name=altveriseti)
  training_data.columns = ['baslik', 'etiket']
  sinif_sayisi = training_data.etiket.unique().size # binary or multi-class?
  satir_sayisi = training_data.baslik.size

  print("---------------- " + veriseti + " - " + altveriseti + " - " + model_adi + " - satır sayısı:" + str(satir_sayisi) + " ----------------")
  start_time = datetime.now(timezone('Europe/Istanbul'))
  print("Start Time = " + start_time.strftime("%H:%M:%S"))

  corpus_path = os.path.join(folder, "data_corpus_" + veriseti + ".pkl")

  if prepare_corpus: # was a fresh corpus requested?
    data_corpus = prepare_data(training_data, add_two_grams=add_two_grams)
    _save_pickle(data_corpus, corpus_path) # cache it for later runs
  else:
    data_corpus = _load_pickle(corpus_path) # reuse the cached corpus

  if build_model: # was training requested?
    cv = CountVectorizer(max_features=10000)
    # Dense matrix - presumably because some of the estimators used in this
    # notebook do not accept sparse input.
    X = cv.fit_transform(data_corpus).toarray()
    _save_pickle(cv, os.path.join(folder, "ai_vectorizer_" + veriseti + ".pkl")) # needed again at prediction time

    y = training_data.etiket
    label_encoder = None

    if model_adi == "XGBClassifier":
      # XGBClassifier expects numeric class labels, so the label column is encoded
      label_encoder = preprocessing.LabelEncoder().fit(y)
      _save_pickle(label_encoder, os.path.join(folder, "ai_labelencoder_" + veriseti + "_" + model_adi + ".pkl")) # needed to decode predictions later
      y = label_encoder.transform(y)

    model_path_prefix = os.path.join(folder, "ai_model_" + veriseti + "_" + model_adi)

    if make_genetic_opt: # find the optimal parameters with a genetic algorithm
      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30, stratify=y)
      skf = StratifiedKFold(n_splits=folds_for_cv, shuffle=True)
      evolved_estimator = GASearchCV(estimator=model, cv=skf, scoring='accuracy', population_size=6, generations=8, param_grid=param_grid, n_jobs=-1, verbose=True)
      evolved_estimator.fit(X_train, y_train)
      y_predict = evolved_estimator.predict(X_test)
      acc = accuracy_score(y_test, y_predict)

      print("Best accuracy through Genetic Opt: ", acc)
      print("Best parameters: ", evolved_estimator.best_params_)

      # Persist the refitted winner; fall back to .estimator for library
      # versions that do not expose best_estimator_.
      best_model = getattr(evolved_estimator, "best_estimator_", evolved_estimator.estimator)
      _save_pickle(best_model, model_path_prefix + "_geneticOpt.pkl")

      model = evolved_estimator
    elif make_gridsearch_opt: # find the optimal parameters with GridSearchCV
      skf = StratifiedKFold(n_splits=folds_for_cv)
      gridsearched_estimator = GridSearchCV(model, param_grid, n_jobs=-1, cv=skf, verbose=2).fit(X, y)

      print('Best accuracy through GridSearch : {:.3f}'.format(gridsearched_estimator.best_score_))
      print('Best parameters : {}\n'.format(gridsearched_estimator.best_params_))

      # .estimator is the unfitted template; best_estimator_ is the refitted winner.
      _save_pickle(gridsearched_estimator.best_estimator_, model_path_prefix + "_gridsearchOpt.pkl")

      model = gridsearched_estimator
    elif make_randomized_opt: # find the optimal parameters with RandomizedSearchCV
      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30, stratify=y)
      skf = StratifiedKFold(n_splits=folds_for_cv)
      randomized_estimator = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=skf, n_iter=3, n_jobs=-1, verbose=3)
      randomized_estimator.fit(X_train, y_train)
      y_predict = randomized_estimator.predict(X_test)
      accuracy = accuracy_score(y_test, y_predict)

      print("The best parameters are: ", randomized_estimator.best_params_)
      print("The best score: ", randomized_estimator.best_score_)
      print(f"The test accuracy using best parameters: {accuracy:.2f}")

      # .estimator is the unfitted template; best_estimator_ is the refitted winner.
      _save_pickle(randomized_estimator.best_estimator_, model_path_prefix + "_randomizedsearchOpt.pkl")
      model = randomized_estimator
    else: # just train a single model
      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 20, stratify=y)
      model.fit(X_train, y_train)
      _save_pickle(model, model_path_prefix + ".pkl")

      y_pred = model.predict(X_test)
      y_pred_train = model.predict(X_train)

      train_accuracy = accuracy_score(y_train, y_pred_train)
      test_accuracy = accuracy_score(y_test, y_pred)
      precision = precision_score(y_test, y_pred, average='weighted')
      recall = recall_score(y_test, y_pred, average='weighted')
      f1score = f1_score(y_test, y_pred, average='weighted')

      probs = model.predict_proba(X_test)

      roc_auc = 0
      if sinif_sayisi == 2:
        roc_auc = roc_auc_score(y_test, probs[:,1], average = 'weighted') # binary: pass only the second probability column
      else:
        roc_auc = roc_auc_score(y_test, probs, multi_class='ovo', average = 'weighted') # multi-class: pass the full probability matrix

      # classification_report is left out on purpose: too much per-class detail
      print("Train Accuracy = %0.3f" % train_accuracy)
      print("Test Accuracy = %0.3f" % test_accuracy)
      print("Precision = %0.3f" % precision)
      print("Recall = %0.3f" % recall)
      print("F1 Score = %0.3f" % f1score)
      print("Roc_auc Score = %0.3f" % roc_auc)

      if make_crossval: # cross validate (folds_for_cv is the number of folds)
        skf = StratifiedKFold(n_splits=folds_for_cv)
        scores = cross_val_score(model, X, y, cv=skf)
        print(folds_for_cv, "-fold Stratified Cross Validation sonucu Accuracy ve Standart Sapma: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

      # Fix the row/column order explicitly (sorted labels) so the tick
      # labels line up with the matrix; unique() alone returns labels in
      # order of first appearance, which does not match confusion_matrix.
      class_ids = np.unique(y)
      cf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)
      categories = label_encoder.classes_ if label_encoder is not None else class_ids
      make_confusion_matrix(cf_matrix, model_adi, veriseti, categories=categories, accuracy = test_accuracy)

  end_time = datetime.now(timezone('Europe/Istanbul'))
  delta = end_time - start_time
  sec = delta.total_seconds()
  minutes = sec / 60 # avoid shadowing the builtin min()

  print("End Time = " + end_time.strftime("%H:%M:%S"))
  print("Running Time in Minutes: {:.3f}".format(minutes))
  print("----------------------------------------------------------------")

  return model

# For details see: https://github.com/DTrimarchi10/confusion_matrix/blob/master/cf_matrix.py
def make_confusion_matrix(cf,
                          model_name,
                          veriseti,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          accuracy=-1):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Args:
        cf: confusion matrix as a 2-D numpy array.
        model_name: model name shown in the plot title.
        veriseti: dataset name shown in the plot title.
        group_names: optional per-cell labels; used only when their count
            equals ``cf.size``.
        categories: tick labels for both axes, passed to seaborn.
        count: show the raw count in every cell.
        percent: show every cell as a share of all observations.
        cbar: draw the colour bar.
        xyticks: when falsy the category tick labels are hidden.
        xyplotlabels: label the axes (true / predicted labels).
        sum_stats: append summary statistics below the x axis; accuracy
            always, plus precision / recall / F1 for a 2x2 matrix.
        figsize: figure size; defaults to matplotlib's configured size.
        cmap: colour map name.
        accuracy: accuracy to display; ``-1`` means derive it from ``cf``.

    Returns:
        None. The figure is created on the current pyplot state.
    """
    total = np.sum(cf)

    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = ['' for _ in range(cf.size)]

    if group_names is not None and len(group_names)==cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        # An all-zero matrix would otherwise divide by zero and print NaN.
        shares = cf.flatten() / total if total else np.zeros(cf.size)
        group_percentages = ["{0:.2%}".format(value) for value in shares]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels,group_counts,group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0],cf.shape[1])

    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        # Accuracy is the sum of the diagonal divided by the total observations
        if accuracy == -1:
            accuracy = np.trace(cf) / float(total) if total else 0.0

        # For a binary confusion matrix, show some more stats
        if len(cf)==2:
            predicted_positive = cf[:,1].sum()
            actual_positive = cf[1,:].sum()
            # Guard every ratio: with no predicted or actual positives the
            # plain division yields NaN and a runtime warning.
            precision = cf[1,1] / predicted_positive if predicted_positive else 0.0
            recall = cf[1,1] / actual_positive if actual_positive else 0.0
            if precision + recall:
                f1_value = 2*precision*recall / (precision + recall)
            else:
                f1_value = 0.0
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy,precision,recall,f1_value)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""

    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        # Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Do not show categories if xyticks is False
        categories=False

    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf,annot=box_labels,fmt="",cmap=cmap,cbar=cbar,xticklabels=categories,yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('Doğru Etiketler')
        plt.xlabel('Tahmin Edilen Etiketler' + stats_text)
    else:
        plt.xlabel(stats_text)

    plt.title("".join([model_name, " - ", veriseti]))

# Runs every classification algorithm for one dataset
def train_all_models(veriseti, altveriseti='data', prepare_corpus=False, make_crossval=False, folds_for_cv=5):
  """Fit each classifier of the notebook on ``veriseti`` via ``model_build``.

  Args:
    veriseti: dataset name.
    altveriseti: workbook sheet to train on.
    prepare_corpus: forwarded to the first ``model_build`` call only; every
      later call uses that function's default for it.
    make_crossval: forwarded to every ``model_build`` call.
    folds_for_cv: forwarded to every ``model_build`` call.

  Returns:
    dict mapping classifier name to whatever ``model_build`` returned for it,
    in training order.
  """
  print("AKTİF VERİSETİ: " + veriseti + "-" + altveriseti)

  shared_options = dict(altveriseti=altveriseti, make_crossval=make_crossval, folds_for_cv=folds_for_cv)

  # SVC (svm.SVC(kernel='sigmoid', decision_function_shape='ovo', probability=True))
  # is deliberately not in the list: it takes needlessly long and its accuracy
  # score is usually poor.
  candidates = [
    ('MultinomialNB', MultinomialNB()),
    ('GaussianNB', GaussianNB()),
    ('BernoulliNB', BernoulliNB()),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('LogisticRegression', LogisticRegression(random_state=0, class_weight='balanced')),
    ('DecisionTreeClassifier', tree.DecisionTreeClassifier()),
    ('RandomForestClassifier', RandomForestClassifier()),
    ('XGBClassifier', XGBClassifier(tree_method='gpu_hist')),
  ]

  classifier_collection = {}
  for position, (label, estimator) in enumerate(candidates):
    # Only the first run carries the caller's prepare_corpus flag.
    first_run_options = {'prepare_corpus': prepare_corpus} if position == 0 else {}
    classifier_collection[label] = model_build(veriseti, estimator, **shared_options, **first_run_options)

  return classifier_collection

### **Model Training and Hyperparameter Tuning:**

In [None]:
# Train all models and keep the fitted estimators in model_collection so that
# later cells can pass one of them straight to sonuc_hazirla(model=...).
model_collection = train_all_models(veriseti, altveriseti='data', prepare_corpus=False)
print("----")

In [None]:
# Hyperparameter tuning for XGBClassifier

# RandomizedSearch opt.
# scipy.stats.uniform(loc, scale) samples from [loc, loc + scale]: the second
# argument is a width, not an upper bound. subsample and colsample_bytree must
# not exceed 1, so their width is 0.9; gamma is aligned with the 0.1-10 range
# used by the genetic search below.
params_for_random = {
    'tree_method': ['gpu_hist'],
    'min_child_weight': sp_randInt(1, 40),
    'gamma': sp_randFloat(0.1, 9.9),
    'subsample': sp_randFloat(0.1, 0.9),
    'colsample_bytree': sp_randFloat(0.1, 0.9),
    'max_depth': sp_randInt(2, 20),
}
random_model = model_build(veriseti, XGBClassifier(tree_method='gpu_hist'), altveriseti='data', make_randomized_opt=True, folds_for_cv=3, param_grid=params_for_random)
print("RandomizedSearchCV ile bulunan optimum hiperparametrelerle cross validation: ")
the_model=model_build(veriseti, XGBClassifier(**random_model.best_params_), altveriseti='data', make_crossval=True, folds_for_cv=5)

# Genetic opt.
params_for_genetic = {
    'tree_method': Categorical(['gpu_hist']),
    'min_child_weight': Integer(1, 40),
    'gamma': Continuous(0.1, 10, distribution='log-uniform'),
    'subsample': Continuous(0.1, 1, distribution='log-uniform'),
    'colsample_bytree': Continuous(0.1, 1, distribution='log-uniform'),
    'max_depth': Integer(2, 20),
}
genetic_model = model_build(veriseti, XGBClassifier(tree_method='gpu_hist'), altveriseti='data', make_genetic_opt=True, folds_for_cv=5, param_grid=params_for_genetic)
print("GASearchCV ile bulunan optimum hiperparametrelerle cross validation: ")
the_model=model_build(veriseti, XGBClassifier(**genetic_model.best_params_), altveriseti='data', make_crossval=True, folds_for_cv=5)

# GridSearch opt.
params_for_grid = {
    'tree_method': ['gpu_hist'],
    'min_child_weight': [1, 5, 10, 40],
    'gamma': [0.5, 1, 1.5, 2, 5, 10],
    'subsample': [0.1, 0.5, 1.0],
    'colsample_bytree': [0.1, 0.5, 1.0],
    'max_depth': [2, 5, 10, 20],
}
grid_model = model_build(veriseti, XGBClassifier(), altveriseti='data', make_gridsearch_opt=True, folds_for_cv=5, param_grid=params_for_grid)
print("GridSearchCV ile bulunan optimum hiperparametrelerle cross validation: ")
the_model=model_build(veriseti, XGBClassifier(**grid_model.best_params_), altveriseti='data', make_crossval=True, folds_for_cv=5)

In [None]:
# Hyperparameter tuning for RandomForestClassifier
#
# max_features='auto' is not offered in any search space: for a classifier it
# was an alias of 'sqrt', and recent scikit-learn releases reject it.

# RandomizedSearch opt.
params_for_random = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200],
}
random_model = model_build(veriseti, RandomForestClassifier(), altveriseti='data', make_randomized_opt=True, folds_for_cv=5, param_grid=params_for_random)
print("RandomizedSearchCV ile bulunan optimum hiperparametrelerle cross validation: ")
the_model=model_build(veriseti, RandomForestClassifier(**random_model.best_params_), altveriseti='data', make_crossval=True, folds_for_cv=5)

# Genetic opt.
params_for_genetic = {
    'n_estimators': Integer(200, 500),
    'max_features': Categorical(['sqrt', 'log2']),
    'max_depth': Integer(4, 8),
    'criterion': Categorical(['gini', 'entropy']),
}
genetic_model = model_build(veriseti, RandomForestClassifier(), altveriseti='data', make_genetic_opt=True, folds_for_cv=2, param_grid=params_for_genetic)
print("GASearchCV ile bulunan optimum hiperparametrelerle cross validation: ")
the_model=model_build(veriseti, RandomForestClassifier(**genetic_model.best_params_), altveriseti='data', make_crossval=True, folds_for_cv=5)

# GridSearch opt.
params_for_grid = {
    'n_estimators': [200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
}
grid_model = model_build(veriseti, RandomForestClassifier(), altveriseti='data', make_gridsearch_opt=True, folds_for_cv=2, param_grid=params_for_grid)
print("GridSearchCV ile bulunan optimum hiperparametrelerle cross validation: ")
the_model=model_build(veriseti, RandomForestClassifier(**grid_model.best_params_), altveriseti='data', make_crossval=True, folds_for_cv=5)

### **Prediction:**

In [37]:
#Prepare the result file using the given input file, the selected dataset and the selected model
veriseti = "haberler"
input_dosyasi_adi = "test_data_" + veriseti + ".xlsx"
model_adi = "XGBClassifier"
rapor = sonuc_hazirla(veriseti, input_dosyasi_adi, model_adi) #instead of model_adi this function also accepts the trained model object itself as a parameter (e.g. model=model_collection['MultinomialNB']). If model_adi is given, the model is loaded from its file.
rapor.to_excel("output_" + veriseti + ".xlsx")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
