# Отчет по выполнению домашнего задания #2

__Дата выдачи__: 24.02.2016

__Дедлайн__: 23:59 12.03.2016

__Выполнил__: Булгаков Дмитрий (ИАД16)

In [315]:
import numpy as np
import scipy.misc as sc
import sklearn
import os
import string
import re
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import *

In [316]:
def convert_text(s):
    # Removes all characters from string except letters and digits and convert letters to lowercase
    return re.sub("[^a-zA-Z0-9]", " ", s.lower())

def read_txts(dir_path="./txt_sentoken/pos/"):
    # Reads all files from directory
    if dir_path[-1] != "/":
        dir_path = dir_path + "/"
    txt_list = []
    for file in os.listdir(dir_path):
        file = dir_path + file
        fin = open(file, 'r')
        txt = " ".join(fin.readlines())
        txt = convert_text(txt)
        txt_list.append(txt)
    return txt_list

In [317]:
class PoissonNB:
    def __init__(self, class_prior=None):
        """
        class_prior : np.array, size (n_classes,)
        Prior probabilities of the classes. If specified the priors are not
        adjusted according to the data.
        """
        self.class_prior = class_prior
        
        # checking data input to constructor
        if self.class_prior is not None:
            self.class_prior_pre = True
        else:
            self.class_prior_pre = False
    
    def fit(self, X, y, epsilon=1e-9):
        """
        Fit Poisson Naive Bayes according to X, y
        
        Parameters
        ----------
        X : np.array, shape (n_samples, n_features)
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : np.array, shape (n_samples,)
            Target values.
        """
        # finding classes vector (we need uniq only)
        self.class_vector = np.unique(y)
        # getting number of claseses
        self.class_number = self.class_vector.shape[0]
        # getting number of samples and features
        samples_number, features_number = X.shape
        
        # creating empty array(s) for calucations
        self.lambda_matrix = np.zeros((self.class_number, features_number))
        self.lambda_matrix.fill(epsilon)
        
        if not self.class_prior_pre:
            self.class_prior = np.zeros(self.class_number)
        
        # calculating prior propability matrix and lambda matrix
        for i, y_i in enumerate(self.class_vector):
            # finding elements in iterated class
            Xi = X[y_i == y]
            # only in case not present in constructor
            if not self.class_prior_pre:
                # calculating propbalitities for each class (assuming uniform distribution)
                self.class_prior[i] = Xi.shape[0] / samples_number
            # calculating lambdas matrix for next calucaltions (by formula from the task descr)
            self.lambda_matrix[i] += np.mean(Xi, axis=0)
        
    def predict(self, X):
        """
        Perform classification on an array of test vectors X.
        
        Parameters
        ----------
        X : np.array, shape = [n_samples, n_features]
        
        Returns
        -------
        C : np.array, shape = [n_samples]
            Predicted target values for X
        """
        # creating empty likehood matrix
        likehood_matrix = np.zeros((np.shape(X)[0], np.size(self.class_vector)))
        
        # calculating likehood matrix
        for i in range(self.class_number):
            # log of value due to small values
            c_prior = np.log(self.class_prior[i])
            
            # calculating propability function (by formula for Poisson distr)
            p_ij = np.sum(X * np.log(self.lambda_matrix[i]), axis=1) - np.sum(self.lambda_matrix[i]) - np.log(sc.factorial(X)).shape[0]
           
            # calculating by formula from task descr, '*' to '+' because logarithm usage
            likehood_matrix[:, i] = c_prior + p_ij
            
        avector = np.argmax(likehood_matrix, axis=1)
        return self.class_vector[avector]
        

__Решение задачи:__

__#1.__ Загрузите и преобразуйте данные с помощью функции read_txts() из выданного ноутбука. В итоге

должно получиться два списка: с положительными и с отрицательными рецензиями.

In [318]:
pos_rev = read_txts("./txt_sentoken/pos/")
neg_rev = read_txts("./txt_sentoken/neg/")

__#2__ Из всех рецензий сформируйте два списка: тексты для обучающей выборки (по 700 случайных каж-

дого класса) и для контрольной выборки (по 300 оставшихся), а также вектор правильных ответов

для обучающей и контрольной выборки. Например, положительные рецензии можно относить к

классу «1», а отрицательные — к классу «0».

In [319]:
X = np.hstack((pos_rev, neg_rev))
y = np.ones(len(pos_rev) + len(neg_rev))
y[len(pos_rev):] = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 

__#3__ Прочитайте, как работает класс sklearn.feature_extraction.text.CountVectorizer, и с его помощью со-

здайте две матрицы «объекты × признаки»: для обучающей и контрольной выборки. Учтите, что

CountVectorizer.transform возвращает разреженную матрицу — чтобы преобразовать её к знакомому

нам np.array, воспользуйтесь функцией .toarray().

In [320]:
cv = CountVectorizer()
train_features = cv.fit_transform(X_train).toarray()
test_features = cv.transform(X_test).toarray()

__#4__ Сами реализуйте класс PoissonNB, реализующий пуассоновский наивный байесовский классифи-

катор. Методы, которые должны быть реализованы в этом классе, описаны в jupyter ноутбуке,

выданном вместе с заданием.

In [321]:
pnb = PoissonNB()
pnb.fit(train_features, y_train)

__#5__ Протестируйте ваш классификатор на данных и посчитайте accuracy — долю правильных ответов.

In [322]:
res_test = pnb.predict(test_features)
pnb_accuracy = np.mean(y_test == res_test)
print('Accuracy for Poisson naive Bayes classificator is ', pnb_accuracy * 100, "%")

Accuracy for Poisson naive Bayes classificator is  70.6666666667 %


__#6__ Протестируйте мультиномиальный и гауссовский наивный байесовский классификатор, реализованный в библиотеке scikit-learn (в sklearn.naive_bayes). Можно использовать параметры по умолчанию.

Multinominal classifier:

In [323]:
mnb = MultinomialNB()
mnb.fit(train_features, y_train)
res_test = mnb.predict(test_features)
mnb_accuracy = np.mean(y_test==res_test)

Gaussian classifier:

In [324]:
gnb = GaussianNB()
gnb.fit(train_features, y_train)
res_test = gnb.predict(test_features)
gnb_accuracy = np.mean(y_test==res_test)

Results:

In [325]:
print('Accuracy for Multinominal naive Bayes classificator is ', mnb_accuracy * 100, "%")
print('Accuracy for Gaussian naive Bayes classificator is ', gnb_accuracy * 100, "%")

Accuracy for Multinominal naive Bayes classificator is  81.8333333333 %
Accuracy for Gaussian naive Bayes classificator is  64.6666666667 %


__#7__ Напишите функцию, которая принимает на вход строку с текстом рецензии, обученный классифи-

катор, обученный объект класса CountVectorizer и печатает, положительна ли данная рецензия.

In [326]:
def text_estimate(text, classificator, count_vect):
    text = convert_text(text)
    text = count_vect.transform([text]).toarray()
    if classificator.predict(text):
        print('positive')
    else:
        print('negative')
        

In [327]:
sampletext = '''An incredible journey that began a decade ago finally arrives at its close with David Yates' "Harry Potter and the Deathly Hallows: Part II", as 'The Boy Who Lived' comes face to face with 'He Who Shall Not Be Named' in an epic showdown between good and evil. And what a showdown it is- tense, thrilling, breathtaking, and fitting of just about any superlative you can think of. 

Whereas the first instalment of the 'Deathly Hallows' emphasised the profound sense of loss and isolation among Harry, Ron and Hermoine, screenwriter Steve Kloves and director Yates leaves behind the moody atmosphere of the previous movie for newfound immediacy and urgency. This is all about that final battle where only one can live, and from start to finish- for once in a Harry Potter movie- the action is swift and relentless.

Part II picks up right where the previous film left off- the dark Lord Voldemort smiles in evil triumph as he steals the most powerful wand in the world, i.e. the Elder Wand, from the tomb of beloved Hogwarts headmaster Professor Dumbledore (Michael Gambon). The next shot is equally ominous- students are marched rank-and-file through a Hogwarts courtyard, watched closely by cloaked Dementors hovering over the school grounds. If there was any need of a reminder of the danger facing our three protagonists, these opening sequences should just about refresh one's memory of what is at stake.

There is precious little time to waste, and the first we get to see Harry, Ron and Hermione, they are already hatching a plan to break into Gringotts to retrieve a Horcrux. Their break-in settles upon a plan of deception that allows for some rare moments of levity in the film, as Helena Bonham Carter gets to ham it up as a polyjuice-disguised Hermoine impersonating Bellatrix Lestrange. This being the first 'Harry Potter' movie in 3D, Yates caters for some distinctive thrills in the additional dimension with a roller-coaster ride through the vault, culminating in a daring escape on the back of a dragon.

But as readers of the book will tell you, the last stand happens back at Hogwarts, and true enough, after this thrilling early set-piece at Gringotts, the trio head back to the School of Witchcraft and Wizardry to confront their foes. It is also where the last Horcruxes are supposed to be, and Harry's return to the once sunny and cheery grounds now besieged by darkness and doom becomes a true test of allegiance. 

Fans will be glad that Kloves gives room for otherwise supporting characters to step into the limelight- in particular, Neville Longbottom (Matthew Lewis) emerges as one of the unlikeliest but also truest heroes on the side of good. The Hogwarts stalwarts also get a chance to show off their magic, and Yates gives each largely enough screen time for the heroic send-off they deserve.

Yet he reserves the most emotional moment in the film for Severus Snape's (Alan Rickman) vindication, long thought to be the Judas Iscariot-equivalent in the Order and the one who pushed Dumbledore to his death. Yates delivers a truly poignant and deeply heartfelt revelation of Snape's true colours, and it is a farewell that even those who have read the book and can expect what is to come will be overwhelmed by its sheer emotional muscle. While Part II was always meant to be an action-packed spectacle, it is to Yates' credit that there is still as much heart as before in the storytelling.

Though brief, this revelation also works brilliantly as a catalyst that propels Harry to come to terms with the sacrifice he has to make. Harry's realization of this leads up perfectly to the ultimate duel between him and Voldemort, one that is fierce, ferocious and- thanks to Yates' imagination- more exhilarating than reading it off the page. 

The outcome of that battle shouldn't be a secret by now, and when the 'happily-ever-after' coda in Rowling's book set 19 years later is also faithfully adapted here, you can't quite help but be moved by how it so properly gives the series closure.

They are of course no longer kids here- Daniel Radcliffe, Rupert Grint and Emma Watson now young adults who have through the film series grown up right under our eyes. While Part I had greater emphasis on Ron and Hermoine, the focus here is squarely on Harry and Radcliffe truly shines in this instalment- his usual understated performance allowing his audience to appreciate the enormities of the challenge before Harry.

That we can be so fully immersed in Harry's world is testament to the craft of each and every one of the technical team. Production designer Stuart Craig does a masterful job portraying the devastation around Hogwarts, complimented nicely by Eduardo Serra's beautiful cinematography and Mark Day's skillful editing. Alexandre Desplat's evocative score, which combines his own elegiac work with both the John Williams theme as well as Nicholas Hooper's mournful composition for the sixth movie, works magic with the visuals. And most deserving of credit is none other than director Yates himself, who has matured movie after movie to deliver a crowning achievement for the series.

Pardon us if we have also taken this opportunity to extol the merits of the 'Harry Potter' franchise- it's really hard not to considering how this is the last time we will see the Potter-world in its current incarnation. It is this to which the movie is a farewell to, and it is as beautiful a farewell as it can be, packed with visual spectacle on a scale never before seen in any of the other films and fused with the same powerful emotion as Part I and the Yates films before. All good- even great- things have to come to an end, so there is really no better way to bid adieu than with this grand and glorious final chapter.'''

In [328]:
# print('Estimating text class for text: ', sampletext)
print('Result: \nPoisson:')
text_estimate(sampletext, pnb, cv)
print('Gaussian:')
text_estimate(sampletext, gnb, cv)
print('Multinominal:')
text_estimate(sampletext, mnb, cv)

Result: 
Poisson:
positive
Gaussian:
positive
Multinominal:
positive


__#8__ Сделайте выводы, почему наивный байесовский классификатор плохо или хорошо работает для данной задачи

Наивный байесовский классификатор хорошо подходит для широкого круга задач по анализу и классификации данных, и для данной задачи в частности. Это, в первую очередь, связанно с легкостью и высокой скоростью обучения данного классификатора ввиду большого упрощения задачи в принятии независимости всех признаков по совокупности  без существенной потери в эффективности предсказания (одно из основных преимуществ данного классификатора). Во вторых, использование данного классификатора рационально для данной задачи в виду необходимости малого числа данных для его обучения, а реализуемая задача как раз располагает ограниченным количеством данных в 2000 отзывов. К минусам же можно отнести тот факт, что в случае встречи неизвестного слова будет получена 0 вероятность и классификация будет затруднена без дополнительных изменений алгоритма.  