In [None]:
%%time

# Install the third-party packages this notebook relies on. Versions are not
# pinned, so the installed releases can differ from one session to the next.
# NOTE(review): plot-metric and xlwt are not referenced by any visible cell —
# confirm they are still needed.
!pip install plot-metric
!pip install xgboost
!pip install TurkishStemmer
!pip install nltk
!pip install xlwt

In [None]:
import warnings
warnings.filterwarnings("ignore")

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV  
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from TurkishStemmer import TurkishStemmer
from IPython.display import display, HTML
import pandas as pd
import numpy as np
import os
import re
import nltk
import pickle
import openpyxl

# Mount Google Drive so the project files under /content/gdrive are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

# Fetch the NLTK stop-word lists; prepare_data uses the Turkish list.
nltk.download('stopwords')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
CPU times: user 79.8 ms, sys: 7.97 ms, total: 87.7 ms
Wall time: 2.78 s


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
%%time

# Shared notebook state used by the cells and helper functions below.
stemmer = TurkishStemmer()  # applied to every kept token inside prepare_data
classifier_collection = {}  # fitted classifiers, keyed by a short model name
# Labelled training set: prepare_data reads the text from the first column by
# default, and the `etiket` column holds the class label.
training_data = pd.read_excel("/content/gdrive/MyDrive/Colab Notebooks/proje_santrifuj/training_data.xlsx")
# Distinct class labels, in order of first appearance.
etiketler = training_data.etiket.drop_duplicates().to_numpy()

def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    """Print the ``n`` highest-weighted features of ``classifier`` for one class.

    Parameters
    ----------
    vectorizer : object exposing ``get_feature_names_out()``
        Fitted vectorizer whose feature order matches the classifier's columns.
    classifier : fitted estimator
        Must expose ``classes_`` and either ``feature_log_prob_`` (naive Bayes
        models) or ``coef_`` (linear models).
    classlabel : hashable
        A value present in ``classifier.classes_``.
    n : int, default 10
        Number of features to print. ``n <= 0`` prints nothing.

    Raises
    ------
    ValueError
        If ``classlabel`` is not one of ``classifier.classes_``.

    Output is one line per feature, ``classlabel feature weight``, in ascending
    order of weight (the strongest feature is printed last).
    """
    class_labels = list(classifier.classes_)
    labelid = class_labels.index(classlabel)
    feature_names = vectorizer.get_feature_names_out()

    # Naive Bayes estimators no longer expose coef_ in current scikit-learn;
    # their per-class feature weights live in feature_log_prob_ instead.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_

    if len(weights) == 1 and len(class_labels) == 2:
        # Binary linear models keep a single coefficient row that scores
        # classes_[1]; the evidence for classes_[0] is its negation.
        row = weights[0] if labelid == 1 else [-w for w in weights[0]]
    else:
        row = weights[labelid]

    ranked = sorted(zip(row, feature_names))
    # Slice from an explicit start index: ranked[-0:] would return everything.
    topn = ranked[max(len(ranked) - n, 0):]

    for coef, feat in topn:
        print(classlabel, feat, coef)

def prepare_data(data,text_column=0):
    """Turn one text column of ``data`` into a list of cleaned, stemmed sentences.

    For every row the text is lower-cased with Turkish dotted/dotless I
    handling, a fixed set of boilerplate substrings is removed, punctuation is
    collapsed to spaces, Turkish stop words and non-alphabetic tokens are
    dropped, and the remaining tokens are stemmed with the module-level
    ``stemmer``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw text.
    text_column : int or column label, default 0
        A column label if it exists in ``data.columns``; otherwise it is
        treated as a positional column index.

    Returns
    -------
    list of str
        One entry per row of ``data``, in row order. A row that cannot be
        processed (for example a missing value) yields ``" "`` and an error
        message is printed.

    Notes
    -----
    Boilerplate phrases are removed *before* punctuation is stripped so that
    the "ar-ge" rule can match; previously the hyphen was already gone by the
    time the rule ran. Corpora produced by the old version may therefore
    contain extra "ar"/"ge" tokens, so models should be retrained on a freshly
    prepared corpus.
    """
    # Resolve the column once. This avoids the deprecated positional fallback
    # of ``row[0]`` on a label-indexed row.
    if text_column in data.columns:
        texts = data[text_column]
    else:
        texts = data.iloc[:, text_column]

    # Loading the stop-word list is expensive; do it once rather than per token.
    turkish_stopwords = frozenset(stopwords.words('turkish'))

    removed_phrases = ("projesi", "ar-ge", "arge", "teknoloji",
                       "araştırılması", "geliştirilmesi")
    non_word = re.compile(r"\W+")
    digit = re.compile(r"\d")
    single_ascii_letter = re.compile(r'\b[a-zA-Z]{1}\b')

    corpus = []
    for i, x in enumerate(texts):
        try:
            # Map the Turkish capitals explicitly; str.lower() alone would turn
            # "İ" into "i" plus a combining dot and "I" into a dotted "i".
            sentence = x.replace("İ","i").replace("I","ı").lower()
            for phrase in removed_phrases:
                sentence = sentence.replace(phrase,"")
            sentence = non_word.sub(" ", sentence)

            stemmed_word_list = []
            for word in sentence.split():
                if word in turkish_stopwords or not word.isalpha():
                    continue
                try:
                    stemmed_word_list.append(stemmer.stem(word))
                except Exception:
                    # Best effort: a token the stemmer cannot handle is skipped.
                    continue

            new_sentence = non_word.sub(
                " ",
                single_ascii_letter.sub(" ", digit.sub("", " ".join(stemmed_word_list))),
            )
            corpus.append(new_sentence)
        except Exception  as ex:
            # Keep the output aligned with the input rows even on failure.
            corpus.append(" ")
            print("err docid:"+str(i)+":"+str(x)+"---")
            print(ex)
    return(corpus)

def sonuc_hazirla(test_data, cv, model):
    """Append the three most probable class labels to every row of ``test_data``.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Frame whose first column holds the raw text to classify.
    cv : fitted CountVectorizer
        The vectorizer the model was trained with.
    model : fitted classifier
        Must expose ``predict_proba`` and ``classes_``.

    Returns
    -------
    pandas.DataFrame
        ``test_data`` with three extra columns, 'Alan 1', 'Alan 2' and
        'Alan 3', holding the class labels in descending order of predicted
        probability. If the model has fewer than three classes, the surplus
        columns are filled with NaN.
    """
    test_data_corpus = prepare_data(test_data, text_column=0)
    # Reuse the fitted vectorizer directly: same vocabulary and column order as
    # at training time, with no second vectorizer to rebuild.
    X_test_gercek = cv.transform(test_data_corpus)

    probabilities = model.predict_proba(X_test_gercek)
    class_labels = np.asarray(model.classes_)

    column_names = ['Alan 1', 'Alan 2', 'Alan 3']
    k = min(len(column_names), len(class_labels))
    # Rank the classes of every row at once. A stable sort keeps ties in
    # classes_ order, so the result is deterministic.
    top_idx = np.argsort(-probabilities, axis=1, kind="stable")[:, :k]

    # Carry test_data's own index so concat pairs rows by position even when
    # the caller passes a frame with a non-default index.
    dframe = pd.DataFrame(
        class_labels[top_idx],
        columns=column_names[:k],
        index=test_data.index,
    ).reindex(columns=column_names)
    rapor = pd.concat([test_data, dframe], axis=1)

    return(rapor)

CPU times: user 930 ms, sys: 35.8 ms, total: 966 ms
Wall time: 1.81 s


In [None]:
%%time

# Clean the training texts, build data_corpus and cache it on Drive so later
# sessions can skip this slow step.
data_corpus = prepare_data(training_data)
# The context manager guarantees the file is flushed and closed, even if
# pickling fails part-way.
with open("/content/gdrive/MyDrive/Colab Notebooks/proje_santrifuj/data_corpus.pkl","wb") as corpus_file:
    pickle.dump(data_corpus, corpus_file)

KeyboardInterrupt: ignored

In [None]:
%%time

# Load the cached data_corpus from file.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# this notebook wrote itself.
with open("/content/gdrive/MyDrive/Colab Notebooks/proje_santrifuj/data_corpus.pkl", 'rb') as corpus_file:
    data_corpus = pickle.load(corpus_file)

CPU times: user 63.8 ms, sys: 6.03 ms, total: 69.8 ms
Wall time: 105 ms


In [None]:
%%time

# Build the Bag-of-Words features.
cv = CountVectorizer(max_features=10000)  # vocabulary capped at 10,000 terms
X = cv.fit_transform(data_corpus).toarray()  # dense document-term count matrix
y = training_data.etiket  # class label of every document
# Hold out 10% of the documents for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state = 20)

CPU times: user 2.26 s, sys: 2.35 s, total: 4.61 s
Wall time: 4.73 s


In [None]:
%%time

# Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
classifier_collection['multinb_bow']=MultinomialNB(alpha = 0.7, fit_prior=True)
classifier_collection['multinb_bow'].fit(X_train, y_train)
y_pred = classifier_collection['multinb_bow'].predict(X_test)
# NOTE: despite its name, y_pred_test holds the predictions for the TRAINING split.
y_pred_test = classifier_collection['multinb_bow'].predict(X_train)
print("Test Accuracy Score : {:.2f}".format(accuracy_score(y_test, y_pred)))
print("Train Accuracy Score : {:.2f}".format(accuracy_score(y_train, y_pred_test)))
#print(classification_report(y_test,y_pred))

# This is the model that the cross-validation and pickling cells below work with.
selected_model = classifier_collection['multinb_bow']

KeyboardInterrupt: ignored

In [None]:
%%time

# Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
classifier_collection['gaussiannb_bow']=GaussianNB()
classifier_collection['gaussiannb_bow'].fit(X_train, y_train)
y_pred = classifier_collection['gaussiannb_bow'].predict(X_test)
# NOTE: despite its name, y_pred_test holds the predictions for the TRAINING split.
y_pred_test = classifier_collection['gaussiannb_bow'].predict(X_train)
print("Test Accuracy Score : {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Train Accuracy Score : {:.3f}".format(accuracy_score(y_train, y_pred_test)))
#print(classification_report(y_test,y_pred))

Test Accuracy Score : 0.446
Train Accuracy Score : 0.944
CPU times: user 20.9 s, sys: 13.6 s, total: 34.5 s
Wall time: 34.6 s


In [None]:
%%time

# Bernoulli Naive Bayes model
from sklearn.naive_bayes import BernoulliNB
classifier_collection['bernoullinb_bow']=BernoulliNB()
classifier_collection['bernoullinb_bow'].fit(X_train, y_train)
y_pred = classifier_collection['bernoullinb_bow'].predict(X_test)
# NOTE: despite its name, y_pred_test holds the predictions for the TRAINING split.
y_pred_test = classifier_collection['bernoullinb_bow'].predict(X_train)
print("Test Accuracy Score : {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Train Accuracy Score : {:.3f}".format(accuracy_score(y_train, y_pred_test)))
#print(classification_report(y_test,y_pred))

Test Accuracy Score : 0.463
Train Accuracy Score : 0.631
CPU times: user 41.9 s, sys: 507 ms, total: 42.4 s
Wall time: 42 s


In [None]:
%%time

# SVM model (sigmoid kernel)
# NOTE(review): SVC is created without probability=True, so it presumably cannot
# serve as selected_model for sonuc_hazirla, which calls predict_proba — confirm.
from sklearn import svm
classifier_collection['svm_bow']=svm.SVC(kernel='sigmoid')
classifier_collection['svm_bow'].fit(X_train, y_train)
y_pred = classifier_collection['svm_bow'].predict(X_test)
# NOTE: despite its name, y_pred_test holds the predictions for the TRAINING split.
y_pred_test = classifier_collection['svm_bow'].predict(X_train)
print("Test Accuracy Score : {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Train Accuracy Score : {:.3f}".format(accuracy_score(y_train, y_pred_test)))
#print(classification_report(y_test,y_pred))

Test Accuracy Score : 0.584
Train Accuracy Score : 0.717
CPU times: user 7min 39s, sys: 562 ms, total: 7min 40s
Wall time: 7min 40s


In [None]:
%%time

# Logistic Regression model (random_state fixed for reproducibility)
from sklearn.linear_model import LogisticRegression
classifier_collection['logreg_bow']=LogisticRegression(random_state=0)
classifier_collection['logreg_bow'].fit(X_train, y_train)
y_pred = classifier_collection['logreg_bow'].predict(X_test)
# NOTE: despite its name, y_pred_test holds the predictions for the TRAINING split.
y_pred_test = classifier_collection['logreg_bow'].predict(X_train)
print("Test Accuracy Score : {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Train Accuracy Score : {:.3f}".format(accuracy_score(y_train, y_pred_test)))
#print(classification_report(y_test,y_pred))

Test Accuracy Score : 0.585
Train Accuracy Score : 0.951
CPU times: user 2min 30s, sys: 27.3 s, total: 2min 58s
Wall time: 1min 45s


In [None]:
%%time

# Decision Tree model
# NOTE: no random_state is passed, so scores may vary slightly between runs.
from sklearn import tree
classifier_collection['dectree_bow'] = tree.DecisionTreeClassifier()
classifier_collection['dectree_bow'].fit(X_train, y_train)
y_pred = classifier_collection['dectree_bow'].predict(X_test)
# NOTE: despite its name, y_pred_test holds the predictions for the TRAINING split.
y_pred_test = classifier_collection['dectree_bow'].predict(X_train)
print("Test Accuracy Score : {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Train Accuracy Score : {:.3f}".format(accuracy_score(y_train, y_pred_test)))
#print(classification_report(y_test,y_pred))

Test Accuracy Score : 0.335
Train Accuracy Score : 0.951
CPU times: user 20.5 s, sys: 222 ms, total: 20.7 s
Wall time: 20.8 s


In [None]:
%%time

# Random Forest model (80 trees)
# NOTE: no random_state is passed, so scores may vary slightly between runs.
from sklearn.ensemble import RandomForestClassifier
classifier_collection['randomforest_bow'] = RandomForestClassifier(n_estimators=80)
classifier_collection['randomforest_bow'].fit(X_train, y_train)
y_pred = classifier_collection['randomforest_bow'].predict(X_test)
# NOTE: despite its name, y_pred_test holds the predictions for the TRAINING split.
y_pred_test = classifier_collection['randomforest_bow'].predict(X_train)
print("Test Accuracy Score : {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Train Accuracy Score : {:.3f}".format(accuracy_score(y_train, y_pred_test)))
#print(classification_report(y_test,y_pred))

Test Accuracy Score : 0.533
Train Accuracy Score : 0.951
CPU times: user 22.4 s, sys: 216 ms, total: 22.7 s
Wall time: 22.8 s


In [None]:
%%time

# XGBoost model
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# XGBClassifier only accepts integer class labels 0..n_classes-1, so the labels
# are encoded for training and the predictions are decoded back afterwards.
# The encoder is fitted on y_train so the codes seen by fit() are contiguous.
xgb_label_encoder = LabelEncoder().fit(y_train)

classifier_collection['xgboost_bow'] = XGBClassifier(enable_categorical=True)
classifier_collection['xgboost_bow'].fit(X_train, xgb_label_encoder.transform(y_train))
# NOTE: the stored model itself predicts integer codes; use xgb_label_encoder
# to map them back to the original labels wherever this model is reused.
y_pred = xgb_label_encoder.inverse_transform(classifier_collection['xgboost_bow'].predict(X_test))
# NOTE: despite its name, y_pred_test holds the predictions for the TRAINING split.
y_pred_test = xgb_label_encoder.inverse_transform(classifier_collection['xgboost_bow'].predict(X_train))
print("Test Accuracy Score : {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Train Accuracy Score : {:.3f}".format(accuracy_score(y_train, y_pred_test)))
#print(classification_report(y_test,y_pred))

In [None]:
%%time

# 10-fold cross-validation of selected_model on the full feature matrix.
# selected_model is assigned in the Multinomial Naive Bayes cell above.
scores = cross_val_score(selected_model, X, y, cv=10)
# Report the mean fold accuracy and twice the standard deviation across folds.
print("acc score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
#print(scores)

acc score: 0.59 (+/- 0.04)
CPU times: user 6min 32s, sys: 2.73 s, total: 6min 35s
Wall time: 6min 35s


In [None]:
%%time

# Hyper-parameter optimisation with GridSearchCV.
# Every alpha / fit_prior combination is scored with 5-fold cross-validation
# on the full data set. MultinomialNB is imported in the Multinomial Naive
# Bayes cell above.
parameters = {'alpha': [0.01, 0.1, 0.5, 0.7, 1.0, 10.0, ],
              'fit_prior': [True, False],
            }
gs_clf = GridSearchCV(MultinomialNB(), parameters, n_jobs=-1, cv=5, verbose=5)  
gs_clf = gs_clf.fit(X, y)
print('Best Accuracy Through Grid Search : {:.3f}'.format(gs_clf.best_score_))
print('Best Parameters : {}\n'.format(gs_clf.best_params_))

In [None]:
%%time

# Save the selected model and its vectorizer to pickle files.
# Context managers guarantee both files are flushed and closed, even if
# pickling fails part-way.
with open("/content/gdrive/MyDrive/Colab Notebooks/proje_santrifuj/ai_model.pkl", "wb") as model_file:
    pickle.dump(selected_model, model_file)
with open("/content/gdrive/MyDrive/Colab Notebooks/proje_santrifuj/ai_vectorizer.pkl","wb") as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

CPU times: user 109 ms, sys: 32.9 ms, total: 142 ms
Wall time: 210 ms


In [None]:
%%time
# Load the model and vectorizer back from their pickle files.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# this notebook wrote itself.
with open("/content/gdrive/MyDrive/Colab Notebooks/proje_santrifuj/ai_model.pkl", 'rb') as model_file:
    selected_model = pickle.load(model_file)
with open("/content/gdrive/MyDrive/Colab Notebooks/proje_santrifuj/ai_vectorizer.pkl", 'rb') as vectorizer_file:
    cv = pickle.load(vectorizer_file)
test_data = pd.read_excel("/content/gdrive/MyDrive/Colab Notebooks/proje_santrifuj/test_data.xlsx")

# Classify the unlabelled texts and write the top-3 predictions next to them.
sonuc_raporu = sonuc_hazirla(test_data, cv, selected_model)
sonuc_raporu.to_excel("/content/gdrive/MyDrive/Colab Notebooks/proje_santrifuj/ongoruler.xlsx")

CPU times: user 28.1 s, sys: 3.11 s, total: 31.2 s
Wall time: 31.9 s


In [None]:
#most_informative_feature_for_class(cv,selected_model,"Yapay Zeka",n=100)