In [2]:
'''
Burada sentimentmod modülünün henüz pickling yapılmamış hali bulunur ve bu kod bir kez çalıştırıldığında pickle dosyalarımız kaydolur.

Bir kez çalıştırma sonrasında data setimizde değişiklik yoksa yeniden çalıştırmaya gerek yoktur ve bundan sonra save edilmiş pickle dosyaları üzerinden çalışan sentimentmod kodunu kullanmamız yeterlidir.
'''


import nltk
import random
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize



class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

#positive.txt ve negative.txt doyalarımızı okuyarak iki farklı değişkene atama yaparız
short_pos = open("short_reviews/positive.txt","r",encoding='utf-8',errors='replace').read()
short_neg = open("short_reviews/negative.txt","r",encoding='utf-8',errors='replace').read()

all_words = []
documents = []

allowed_word_types = ["J"] #bu kod satırıyla sadece adjective olan kelimeleri kullanmamızı sağlamak için bir değişken ataması yaparız
for r in short_pos.split('\n'):
    documents.append((r, "pos"))
    words = word_tokenize(r)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())
    
    
    
for r in short_neg.split('\n'):
    documents.append((r, "neg"))
    words = word_tokenize(r)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())
    


#pickle dosyalarını kaydetmeye başlıyoruz   
save_documents = open("short_data_pickled/save_doc.pickle","wb")
pickle.dump(documents, save_documents)
save_documents.close()

    
all_words = nltk.FreqDist(all_words)


word_features = list(all_words.keys())[:5000]

word_features_save = open("short_data_pickled/word_feature_save.pickle",'wb')
pickle.dump(word_features, word_features_save)
word_features_save.close()


def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features


featuresets = [(find_features(rev), category) for (rev, category) in documents]

save_featuresets = open("short_data_pickled/featuresets.pickle","wb")
pickle.dump(featuresets, save_featuresets)
save_featuresets.close()

random.shuffle(featuresets)


training_set = featuresets[:10000]
testing_set = featuresets[10000:]

#kullandığımız her bir classifier için testing sonucu doğruluk oranını da yazdırıyoruz
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy: ",(nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)#burada örnek olarak naivebayes ile test edince bize en çok bilgi veren 15 feature'u da yazdırıyoruz

save_classifier = open("short_data_pickled/short_reviews.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier Algo accuracy: ",(nltk.classify.accuracy(MNB_classifier, testing_set))*100)

save_classifierMNB = open("short_data_pickled/short_reviews_MNB.pickle","wb")
pickle.dump(MNB_classifier, save_classifierMNB)
save_classifierMNB.close()
##Bernoulli
##LogisticRegression, SGDClassifier
##LinearSVC

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("Bernoulli_NB Algo accuracy: ",(nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

save_classifierBernoulli = open("short_data_pickled/short_reviews_Bernoulli.pickle","wb")
pickle.dump(BernoulliNB_classifier, save_classifierBernoulli)
save_classifierBernoulli.close()


LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression Algo accuracy: ",(nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

save_classifierLogistic = open("short_data_pickled/short_reviews_LogisticReg.pickle","wb")
pickle.dump(LogisticRegression_classifier, save_classifierLogistic)
save_classifierLogistic.close()

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC Algo accuracy: ",(nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

save_classifier_LinearSVC = open("short_data_pickled/short_reviews_LinearSVC.pickle","wb")
pickle.dump(LinearSVC_classifier,save_classifier_LinearSVC)
save_classifier_LinearSVC.close()


voted_classifier = VoteClassifier(classifier,MNB_classifier,BernoulliNB_classifier,LogisticRegression_classifier,LinearSVC_classifier)
print("Voted_classifier accuracy:",(nltk.classify.accuracy(voted_classifier, testing_set))*100)


Original Naive Bayes Algo accuracy:  74.54819277108435
Most Informative Features
              engrossing = True              pos : neg    =     21.6 : 1.0
                mediocre = True              neg : pos    =     17.1 : 1.0
                 generic = True              neg : pos    =     16.4 : 1.0
                    loud = True              neg : pos    =     16.4 : 1.0
                  boring = True              neg : pos    =     14.5 : 1.0
                 routine = True              neg : pos    =     14.4 : 1.0
               inventive = True              pos : neg    =     14.3 : 1.0
                    flat = True              neg : pos    =     14.3 : 1.0
              refreshing = True              pos : neg    =     13.0 : 1.0
               wonderful = True              pos : neg    =     13.0 : 1.0
                mindless = True              neg : pos    =     11.7 : 1.0
                delicate = True              pos : neg    =     11.6 : 1.0
               real