In [1]:
import pandas as pd

dataset = pd.read_csv('./dataset.csv', 
                    header=None, 
                    names=["Comment", "Movie" ,"Bool"])

dataset = dataset.tail(-1)
dataset = dataset.drop("Movie", 1)

dataset = dataset.sample(frac=.01, random_state=1).reset_index(drop=True)

dataset.head()

Unnamed: 0,Comment,Bool
0,\n ben bu kadar g√ºzel bir...,50
1,"\n En yakƒ±n dostumun ""E≈ü≈ü...",40
2,\n son zamanlarda seyrett...,5
3,\n yer yer samimi ve amat...,25
4,\n Klasik korku i√ßermeyen...,20


In [2]:
def map_points(x):
    x = float(x.replace(',', '.'))
    if x < 3.5:
        return False
    else:
        return True
def clean_values(x):
    x = x.replace('\n', ' ')
    x = x.replace('\W', ' ')
    x = x.lower()
    return x

dataset['Bool'] = dataset['Bool'].apply(map_points)
dataset['Comment'] = dataset['Comment'].apply(clean_values)
dataset['Comment'] = dataset['Comment'].str.split()

dataset.head()

Unnamed: 0,Comment,Bool
0,"[ben, bu, kadar, g√ºzel, bir, film, ne, zamandƒ±...",True
1,"[en, yakƒ±n, dostumun, ""e≈ü≈üek, kadar, adam, old...",True
2,"[son, zamanlarda, seyrettiƒüim, en, k√∂t√º, film,...",False
3,"[yer, yer, samimi, ve, amat√∂r, tadƒ±yla, hafif,...",False
4,"[klasik, korku, i√ßermeyen, kanlƒ±, bir, film, b...",False


In [3]:
vocabulary = []
for comment in dataset['Comment']:
    for word in comment:
        vocabulary.append(word)
# zayƒ±flƒ±klardan biri datasetin temizlenmesine √ßok baƒülƒ±
vocabulary = list(set(vocabulary))

# datanƒ±n temizlenmesi ve s√∂zl√ºƒü√ºn olu≈ümasƒ± burada bitiyor

print(vocabulary)

['oyuncluklar', 'bakarsanƒ±z', 'izleyin...', 'kahramanƒ±mƒ±zƒ±n', 'sydney', 'buzdolabƒ±nda', '≈üirkete', '√ßalƒ±≈ümƒ±≈ülar.', 'd√∂n√ºp', 'g√∂r√º≈ü√ºnden,', 'kareografileriyle', 'isabetli', 'd√º≈ü√ºn√ºn,ve', 'enteresan.ama', 'edilmelerine', 'bo≈ülukta', 'alt', 'sanƒ±rsƒ±n', 'diil)', '√ßƒ±kmakta.', 'sunuyor.hollywood', 'deyildi', 'izliyoruz.', 'kurtaran', 'senaristi', 'this', "bernal'ƒ±n", 'vurmu≈ü', 'manifestoda.', 'susalƒ±m', 'duygusu', 'soz', 'sƒ±rpƒ±', 'aƒülayabilirsiniz..', 'linklater)', 'emegi', 'otorite', 'sularƒ±nda', 'ƒ±cƒ±nden', 'hazƒ±rken', 'yanƒ±nda,', "50'li", 'zekanƒ±n', 'sizlerin', 'vasat', '√ßeker', '‚Äò', 'bazƒ±larƒ±', 'hayatƒ±mƒ±zda', 'gereken,m√ºkemmel', 'alex', '≈üekil', 'diyemem.reeves', 'yetersiz...', 'arasƒ±nda', 'var.iÃánsanlarƒ±mƒ±z', 'nefeste', 'firari', 'oluyor.ayrƒ±ca', 'ekibinin', 'altƒ±nda.aslƒ±nda', 'konus', 'ger√ßekle≈ütirmek', 'oyundan', 'su', 'ettiniz.', 'sevimliydi', 'kƒ±yƒ±m,', 'durumdayƒ±m!', 'titanic', '10/9', 'g√∂z√ºyle', 'anlar)..ve', 'pride', 'mem

In [4]:
# frekans tablosu olu≈üturuyoruz
word_counts_per_comment = {unique_word: [0] * len(dataset['Comment']) for unique_word in vocabulary}

for index, comment in enumerate(dataset['Comment']):
    for word in comment:
        word_counts_per_comment[word][index] += 1

word_counts = pd.DataFrame(word_counts_per_comment)
word_counts.head()

Unnamed: 0,oyuncluklar,bakarsanƒ±z,izleyin...,kahramanƒ±mƒ±zƒ±n,sydney,buzdolabƒ±nda,≈üirkete,√ßalƒ±≈ümƒ±≈ülar.,d√∂n√ºp,"g√∂r√º≈ü√ºnden,",...,etmi≈ü,deƒüilim.ankarada,kullanmƒ±≈ü.,fikrim,hangi,kurtulmu≈ü,bulanik,yapilmis,1-2,haksƒ±z
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
dataset_joined = pd.concat([dataset, word_counts], axis=1)

dataset_joined.head()

Unnamed: 0,Comment,Bool,oyuncluklar,bakarsanƒ±z,izleyin...,kahramanƒ±mƒ±zƒ±n,sydney,buzdolabƒ±nda,≈üirkete,√ßalƒ±≈ümƒ±≈ülar.,...,etmi≈ü,deƒüilim.ankarada,kullanmƒ±≈ü.,fikrim,hangi,kurtulmu≈ü,bulanik,yapilmis,1-2,haksƒ±z
0,"[ben, bu, kadar, g√ºzel, bir, film, ne, zamandƒ±...",True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[en, yakƒ±n, dostumun, ""e≈ü≈üek, kadar, adam, old...",True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"[son, zamanlarda, seyrettiƒüim, en, k√∂t√º, film,...",False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[yer, yer, samimi, ve, amat√∂r, tadƒ±yla, hafif,...",False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[klasik, korku, i√ßermeyen, kanlƒ±, bir, film, b...",False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
import numpy as np

positive_values = dataset_joined[dataset_joined['Bool'] == np.bool_(True)]
negative_values = dataset_joined[dataset_joined['Bool'] == np.bool_(False)]

#p(durum)
def find_class_prior_prob(positive_values, negative_values):
    positive_percentage = positive_values.shape[0] / len(dataset_joined)
    negative_percentage = negative_values.shape[0] / len(dataset_joined)
    
    return positive_percentage, negative_percentage
    
positive_percentage, negative_percentage = find_class_prior_prob(positive_values, negative_values)

#n deƒüeri
def n(positive_values, negative_values):
    n_words_per_positive_message = positive_values['Comment'].apply(len)
    n_positive = n_words_per_positive_message.sum()

    n_words_per_negative_message = negative_values['Comment'].apply(len)
    n_negative = n_words_per_negative_message.sum()
    
    return n_positive, n_negative

n_positive, n_negative = nc(positive_values, negative_values)

#vocabulary uzunluƒüu
n_vocabulary = len(vocabulary)

alpha = 1 # laplace smoothing

In [7]:
parameters_positive = {unique_word:0 for unique_word in vocabulary}
parameters_negative = {unique_word:0 for unique_word in vocabulary}

# datasetteki her kelime i√ßin p(kelime|durum hesaplanmasƒ±)

for word in vocabulary:
    #nc
    n_word_given_positive = positive_values[word].sum()
    #m estimate
    p_word_given_positive = (n_word_given_positive + alpha) / (n_positive + alpha*n_vocabulary)
    parameters_positive[word] = p_word_given_positive
    #nc
    n_word_given_negative = negative_values[word].sum()
    #m estimate
    p_word_given_negative = (n_word_given_negative + alpha) / (n_negative + alpha*n_vocabulary)
    parameters_negative[word] = p_word_given_negative

In [8]:
import re
import nltk
from stop_words import stop_words

WPT = nltk.WordPunctTokenizer()
# Yukarƒ±daki rowlarƒ± normalize etme i≈ülemine de bu method uygulanabilir
# bir ara implente et.
def norm_doc(single_doc):
    # TR: Dok√ºmandan belirlenen √∂zel karakterleri ve sayƒ±larƒ± at
    # EN: Remove special characters and numbers
    single_doc = re.sub(" \d+", " ", single_doc)
    pattern = r"[{}]".format(",.;") 
    single_doc = re.sub(pattern, "", single_doc) 
    # TR: Dok√ºmanƒ± k√º√ß√ºk harflere √ßevir
    # EN: Convert document to lowercase
    single_doc = single_doc.lower()
    single_doc = single_doc.strip()
    # TR: Dok√ºmanƒ± token'larƒ±na ayƒ±r
    # EN: Tokenize documents
    tokens = WPT.tokenize(single_doc)
    # TR: Stop-word listesindeki kelimeler hari√ß al
    # EN: Filter out the stop-words 
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # TR: Dok√ºmanƒ± tekrar olu≈ütur
    # EN: Reconstruct the document
    single_doc = ' '.join(filtered_tokens)
    return single_doc

def norm_values(perc_positive, perc_negative):
    sum = perc_positive + perc_negative
    norm_positive = perc_positive / sum
    norm_negative = perc_negative / sum
    
    return norm_positive, norm_negative

def classify(message):
    message = re.sub('\W', ' ', message)
    message = norm_doc(message).split()

    p_positive_given_message = positive_percentage
    p_negative_given_message = negative_percentage

    for word in message:
        if word in parameters_positive:
            print("positive ", word, " ", parameters_positive[word])
            p_positive_given_message *= parameters_positive[word]
            
        if word in parameters_negative:
            print("negative ", word, " ", parameters_negative[word])
            p_negative_given_message *= parameters_negative[word]
    
    confidence = norm_values(p_positive_given_message, p_negative_given_message)

    if p_negative_given_message < p_positive_given_message:
        return True, confidence[0], confidence[1]
    elif p_negative_given_message > p_positive_given_message:
        return False, confidence[0], confidence[1]

In [9]:
classify("harika bir g√ºn")

positive  harika   0.0010954576720250682
negative  harika   0.00011792916388222808
positive  g√ºn   0.00030570911777443765
negative  g√ºn   0.00019654860647038013


(True, 0.9614859353340232, 0.038514064665976844)