## NLP - Sentiment Analysis - Gaussian Naive Bayes Implementation
### Fatih KIYIKÇI

In [987]:
# Turkish stop words handed to the tokenizer / TfidfVectorizer below.
# Duplicated literals collapse because the container is a set.
# NOTE(review): entries containing a space ('her biri', 'hiç kimse', 'ne kadar',
# 'ne zaman', 'ya da') can never equal a single token produced by the
# tokenizer's r'\w+' split, so they currently have no effect.
stop_words = set(['a', 'acaba', 'altı', 'ama', 'ancak', 'artık', 'asla', 'aslında', 'az', 'b', 'bana', 'bazen', 'bazı', 'bazıları', 'bazısı', 'belki', 'ben', 'beni', 'benim', 'beş', 'bile', 'bir', 'birçoğu', 'birçok', 'birçokları', 'biri', 'birisi', 'birkaç', 'birkaçı', 'birşey', 'birşeyi', 'biz', 'bize', 'bizi', 'bizim', 'böyle', 'böylece', 'bu', 'buna', 'bunda', 'bundan', 'bunu', 'bunun', 'burada', 'bütün', 'c', 'ç', 'çoğu', 'çoğuna', 'çoğunu', 'çok', 'çünkü', 'd', 'da', 'daha', 'de', 'değil', 'demek', 'diğer', 'diğeri', 'diğerleri', 'diye', 'dokuz', 'dolayı', 'dört', 'e', 'elbette', 'en', 'f', 'fakat', 'falan', 'felan', 'filan', 'g', 'gene', 'gibi', 'ğ', 'h', 'hâlâ', 'hangi', 'hangisi', 'hani', 'hatta', 'hem', 'henüz', 'hep', 'hepsi', 'hepsine', 'hepsini', 'her', 'her biri', 'herkes', 'herkese', 'herkesi', 'hiç', 'hiç kimse', 'hiçbiri', 'hiçbirine', 'hiçbirini', 'ı', 'i', 'için', 'içinde', 'iki', 'ile', 'ise', 'işte', 'j', 'k', 'kaç', 'kadar', 'kendi', 'kendine', 'kendini', 'ki', 'kim', 'kime', 'kimi', 'kimin', 'kimisi', 'l', 'm', 'madem', 'mı', 'mı', 'mi', 'mu', 'mu', 'mü', 'mü', 'n', 'nasıl', 'ne', 'ne kadar', 'ne zaman', 'neden', 'nedir', 'nerde', 'nerede', 'nereden', 'nereye', 'nesi', 'neyse', 'niçin', 'niye', 'o', 'on', 'ona', 'ondan', 'onlar', 'onlara', 'onlardan', 'onların', 'onların', 'onu', 'onun', 'orada', 'oysa', 'oysaki', 'ö', 'öbürü', 'ön', 'önce', 'ötürü', 'öyle', 'p', 'r', 'rağmen', 's', 'sana', 'sekiz', 'sen', 'senden', 'seni', 'senin', 'siz', 'sizden', 'size', 'sizi', 'sizin', 'son', 'sonra', 'ş', 'şayet', 'şey', 'şeyden', 'şeye', 'şeyi', 'şeyler', 'şimdi', 'şöyle', 'şu', 'şuna', 'şunda', 'şundan', 'şunlar', 'şunu', 'şunun', 't', 'tabi', 'tamam', 'tüm', 'tümü', 'u', 'ü', 'üç', 'üzere', 'v', 'var', 've', 'veya', 'veyahut', 'y', 'ya', 'ya da', 'yani', 'yedi', 'yerine', 'yine', 'yoksa', 'z', 'zaten', 'zira'])

In [988]:
import string

def tokenizer(doc: str, stop_words=None) -> list:
    """Split *doc* into lowercase tokens without punctuation or digits.

    Args:
        doc: Raw document text.
        stop_words: Optional collection of lowercase words to drop. When it is
            non-empty, the text is first split on ``\\w+``; tokens whose
            lowercase form is a stop word, or that are at most two characters
            long, are discarded. When it is empty or ``None`` no such
            filtering (including the length filter) is applied.

    Returns:
        The list of remaining whitespace-separated tokens, lowercased.
    """
    # Imported here so the function works regardless of which notebook cell
    # performed the module-level ``import re``.
    import re

    if stop_words:
        # Compare the lowercase form: the text is only lowercased further
        # down, so a capitalised stop word ("Bu") would otherwise slip through.
        kept = [
            w for w in re.findall(r'\w+', doc)
            if w.lower() not in stop_words and len(w) > 2
        ]
        doc = " ".join(kept)

    # Strip punctuation (plus a few typographic marks) in a single pass.
    remove_punct = str.maketrans('', '', string.punctuation + '–•’')
    doc = doc.translate(remove_punct)

    # NOTE: str.lower() is not locale-aware (e.g. 'I' becomes 'i', not 'ı').
    doc = doc.lower()

    remove_digits = str.maketrans('', '', string.digits)
    doc = doc.translate(remove_digits)

    return doc.split()

In [989]:
import numpy as np
import time
from collections import defaultdict, Counter
# tfidfVector = np.matrix(tfidfVector)

class TfidfVectorizer():
    """Small TF-IDF vectorizer built on a user-supplied tokenizer.

    Typical use: ``fit(corpus)`` followed by ``getMatrix()``, which returns a
    dense ``(n_documents, n_terms)`` array whose columns follow the
    alphabetically sorted vocabulary (``self.wordDict``).
    """

    def __init__(self, tokenizer, stopwords):
        """Store the tokenizer callable and the stop-word collection.

        Args:
            tokenizer: Callable invoked as ``tokenizer(doc)`` or, when
                *stopwords* is truthy, ``tokenizer(doc, stopwords)``.
            stopwords: Collection of stop words; may be empty or ``None``.
        """
        self.tokenizer = tokenizer
        self.stopwords = stopwords

    def getTF(self, doc: str, tokenize: bool) -> dict:
        """Return the term frequency of every term in one document.

        Args:
            doc: Raw text when *tokenize* is true; otherwise an iterable of
                tokens that is counted as-is.
            tokenize: Whether to run ``self.tokenizer`` on *doc* first.

        Returns:
            Mapping ``term -> occurrences / total number of tokens``. Empty
            when the document has no tokens.
        """
        if tokenize:
            if self.stopwords:
                doc = self.tokenizer(doc, self.stopwords)
            else:
                doc = self.tokenizer(doc)
        occurrences = Counter(doc)
        # Normalise by the document length (total tokens), not by the number
        # of distinct terms.
        total = sum(occurrences.values())
        return {word: count / total for word, count in occurrences.items()}

    def getIDF(self, TF: dict):
        """Compute smoothed inverse document frequencies.

        Args:
            TF: Mapping ``document id -> term-frequency dict`` as built by
                ``fit``.

        Returns:
            Tuple ``(IDF, counts)`` where ``counts[term]`` is the number of
            documents containing the term and
            ``IDF[term] = ln((1 + n_docs) / (1 + counts[term])) + 1``.

        Side effects:
            Sets ``self.words`` (the counts mapping) and ``self.wordDict``
            (the sorted vocabulary).
        """
        counts = {}
        for doc in TF.values():
            for word in doc:
                counts[word] = counts.get(word, 0) + 1
        # The number of documents comes from the data passed in, not from a
        # module-level variable.
        n_docs = len(TF)
        IDF = {
            word: np.log((1 + n_docs) / (1 + doc_freq)) + 1
            for word, doc_freq in counts.items()
        }
        self.words = counts
        self.wordDict = sorted(counts.keys())
        return IDF, counts

    def getTFIDF(self, tfDict: dict, idfDict: dict) -> dict:
        """Return ``tf(word) * idf(word)`` for every word of one document."""
        return {word: tf * idfDict[word] for word, tf in tfDict.items()}

    def fit(self, corpus):
        """Compute per-document TF-IDF weights for *corpus*.

        Args:
            corpus: Iterable of raw document strings.

        Returns:
            ``self``, with ``self.tfidfDict`` holding one ``term -> weight``
            dict per document, in corpus order.
        """
        TF = {i: self.getTF(doc, tokenize=True) for i, doc in enumerate(corpus)}
        IDF, _ = self.getIDF(TF)
        self.tfidfDict = [self.getTFIDF(tf, IDF) for tf in TF.values()]
        return self

    def calculateTFIDFVector(self, doctfidf):
        """Expand one document's TF-IDF dict into a dense list.

        The list is aligned with ``self.wordDict``; terms absent from the
        document get ``0.0``.
        """
        return [doctfidf.get(word, 0.0) for word in self.wordDict]

    def get_feature_names(self):
        """Return the vocabulary in the column order used by ``getMatrix``."""
        return list(self.wordDict)

    def getMatrix(self):
        """Return the dense TF-IDF matrix of the fitted corpus.

        Returns:
            ``numpy.ndarray`` with one row per document and one column per
            vocabulary term. Also stored as a list of lists in
            ``self.tfidfMatrix``.
        """
        self.tfidfMatrix = [self.calculateTFIDFVector(doctfidf) for doctfidf in self.tfidfDict]
        return np.array(self.tfidfMatrix)

In [990]:
import io, os
import re as re
import zipfile as zipfile
import string
from collections import defaultdict, Counter
# Module-level state used by readfromzip():
# mytextzip - scratch buffer holding the text of the archive member being read
# corpus    - one concatenated text string per document read
# labels    - one label per document, taken from the member's path in the zip
mytextzip = ''
corpus = []
labels = []
def readfromzip(zipname: str, n_docs: int) -> None:
    """Read review texts from a zip archive into ``corpus`` and ``labels``.

    Every ``.txt`` member whose path contains ``raw_texts`` is decoded as
    cp1254 and flattened into a single string. A space is appended after a
    line only when the line ends with ASCII letters directly before its
    ``\\r\\n`` terminator; other lines are concatenated without a separator.
    The label is the third ``/``-separated component of the member path.

    Args:
        zipname: Path of the zip archive.
        n_docs: Reading stops once ``corpus`` holds at least this many
            documents.

    Side effects:
        Appends to the module-level ``corpus`` and ``labels`` lists and
        rebinds the module-level ``mytextzip`` buffer.
    """
    global mytextzip
    ends_with_ascii_word = re.compile(r'([a-zA-Z]+\r\n)')
    with zipfile.ZipFile(zipname) as z:
        for zipinfo in z.infolist():
            member = zipinfo.filename
            if not (member.endswith('.txt') and re.search('raw_texts', member)):
                continue
            # Collect each document in a fresh local list so text left in the
            # global buffer by an earlier call cannot leak into this one.
            pieces = []
            with z.open(zipinfo) as f:
                # newline='\r\n' keeps the terminator on each line so the
                # pattern above can see it.
                textfile = io.TextIOWrapper(f, encoding='cp1254', newline='\r\n')
                for line in textfile:
                    text = line.strip()
                    if ends_with_ascii_word.search(line):
                        text += ' '
                    pieces.append(text)
            mytextzip = ''.join(pieces)
            corpus.append(mytextzip)
            labels.append(member.split("/")[2])
            if len(corpus) >= n_docs:
                break
            mytextzip = ''
# Load the review archive; reading stops once the corpus holds 1500 documents.
readfromzip('film_yorumlari.zip',1500)

In [991]:
class NaiveBayes:
    """Gaussian Naive Bayes classifier working in log-probability space.

    After ``fit`` the instance exposes ``classes`` (sorted class labels),
    ``priors`` (class -> relative frequency), ``means`` and ``vars``
    (class -> per-feature mean / smoothed variance) and ``subdata``
    (class -> list of training rows).
    """

    def fit(self, X, y):
        """Estimate per-class feature means, variances and class priors.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: Sequence of class labels, one per row of *X*.

        Returns:
            ``self``.
        """
        subdata = self.seperate(X, y)

        # Variance smoothing to avoid division by zero for constant features;
        # same scheme as scikit-learn's ``var_smoothing=1e-9``. It does not
        # depend on the class, so compute it once.
        epsilon = 1e-9 * np.var(X, axis=0).max()

        self.means = {c: np.mean(rows, axis=0) for c, rows in subdata.items()}
        self.vars = {c: np.var(rows, axis=0) + epsilon for c, rows in subdata.items()}
        return self

    def seperate(self, features, labels):
        """Group feature rows by class label and compute class priors.

        Args:
            features: Sequence of feature vectors.
            labels: Sequence of class labels aligned with *features*.

        Returns:
            ``self.subdata``: mapping class label -> list of feature rows.
        """
        self.subdata = defaultdict(list)
        # Sorted so the class order (and therefore tie-breaking in argmax)
        # is the same on every run.
        self.classes = sorted(set(labels))
        for row, label in zip(features, labels):
            self.subdata[label].append(row)
        length = len(labels)
        # Prior probability of a class is its relative frequency.
        self.priors = {label: count / length for label, count in Counter(labels).items()}
        return self.subdata

    def prob(self, x, class_idx):
        """Return the Gaussian log-likelihood of *x* under class *class_idx*.

        Sums, over all features, ``-0.5 * (x - mean)^2 / var`` and
        ``-0.5 * ln(2 * pi * var)``.
        """
        mean = self.means[class_idx]
        var = self.vars[class_idx]
        log_likelihood = -0.5 * np.sum(((x - mean) ** 2) / var)
        log_likelihood += -0.5 * np.sum(np.log(2 * np.pi * var))
        return log_likelihood

    def _pred(self, x):
        """Return the class with the highest log-posterior for one vector."""
        posteriors = [
            np.log(self.priors[c]) + self.prob(x, class_idx=c)
            for c in self.classes
        ]
        return self.classes[int(np.argmax(posteriors))]

    def pred(self, X):
        """Predict a class label for every row of *X*; returns a numpy array."""
        return np.array([self._pred(x) for x in X])

In [992]:
# Build TF-IDF features for the whole corpus using the tokenizer and stop-word set defined above.
tfidf = TfidfVectorizer(tokenizer=tokenizer, stopwords= stop_words)
tfidf = tfidf.fit(corpus)
# Dense array: one row per document, one column per vocabulary term.
tf_matrix = tfidf.getMatrix()

In [993]:
# tf_matrix[:5,:]

In [994]:
def split(X, y, test_size, set_seed):
    """Shuffle *X* and *y* together and split them into train and test parts.

    The inputs are left untouched; shuffled copies are returned.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).
        y: Sequence of labels with the same length as *X*.
        test_size: Fraction of the samples (0..1) placed in the test part;
            the test part holds ``int(len(X) * test_size)`` rows.
        set_seed: Seed for the shuffle. A negative value draws a random seed.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``. The label parts are
        numpy arrays when *y* is a numpy array, otherwise lists.

    Raises:
        ValueError: If *X* and *y* have different lengths.
    """
    if len(X) != len(y):
        raise ValueError(
            "X and y must have the same length, got {} and {}".format(len(X), len(y))
        )
    seed = np.random.randint(0, 2**31 - 1) if set_seed < 0 else set_seed

    # One permutation applied to both inputs keeps rows and labels aligned
    # without shuffling the caller's objects in place.
    order = np.random.RandomState(seed).permutation(len(X))
    X_shuffled = np.asarray(X)[order]
    if isinstance(y, np.ndarray):
        y_shuffled = y[order]
    else:
        y_shuffled = [y[i] for i in order]

    size = int(len(X) * test_size)
    x_train, x_test = X_shuffled[size:, :], X_shuffled[:size, :]
    y_train, y_test = y_shuffled[size:], y_shuffled[:size]
    return x_train, x_test, y_train, y_test



In [995]:
from sklearn.naive_bayes import GaussianNB
# from sklearn.model_selection import train_test_split
# X_train,X_test,y_train,y_test = train_test_split(tf_matrix, labels, test_size = 0.15, random_state = 23)
# Hold out 15% of the documents for testing; a negative set_seed makes split() draw a random seed.
X_train, X_test, y_train, y_test = split(tf_matrix, labels, test_size=0.15,set_seed=-1)

In [996]:
# Train the from-scratch Naive Bayes classifier on the training split.
nb = NaiveBayes()
nb.fit(X_train, y_train)

In [997]:
# Predict a label for every held-out document with the from-scratch model.
y_pred = nb.pred(X_test)

In [998]:
def accuracy(y_true, y_pred):
    """Return the percentage (0-100) of positions where the labels agree.

    Args:
        y_true: Sequence of reference labels.
        y_pred: Sequence of predicted labels, same length as *y_true*.

    Returns:
        ``100 * matches / len(y_true)`` as a float.
    """
    total = len(y_true)
    assert total == len(y_pred)
    # Count position-wise agreements between the two sequences.
    hits = sum(1 for truth, guess in zip(y_true, y_pred) if truth == guess)
    return (hits * 100) / total

### Heuristic Implementation accuracy

In [999]:
# Score the from-scratch classifier's predictions on the held-out labels.
acc = accuracy(y_test, y_pred)
print("Accuracy: {:.2f}%".format(acc))

Accuracy: 66.67%


In [1000]:
from sklearn.naive_bayes import GaussianNB
# Fit scikit-learn's GaussianNB on the same training split for comparison.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

### Sklearn Implementation accuracy


In [1001]:
# Score scikit-learn's predictions with the same accuracy() helper.
# Note: this rebinds y_pred, replacing the from-scratch model's predictions.
y_pred = gnb.predict(X_test)
acc = accuracy(y_test, y_pred)
print("Accuracy: {:.2f}%".format(acc))

Accuracy: 66.67%
