# Text Classification in Python

## Python Implementation of Previous Chapter

In [1]:
import re
import os

In [4]:
def dict_merge_sum(d1, d2):
    """Merge two dicts, adding up the values of keys present in both.

    A key found in only one of the dicts keeps its value; the missing side
    contributes 0. Neither input is modified.
    """
    merged = {}
    for key in set(d1) | set(d2):
        merged[key] = d1.get(key, 0) + d2.get(key, 0)
    return merged

# Two small sample dicts sharing the keys 'a' and 'd'.
d1 = {'a': 4, 'b': 5, 'd': 8}
d2 = {'a': 1, 'd': 10, 'e': 9}

# Shared keys are summed, the others are carried over unchanged.
dict_merge_sum(d1, d2)

{'b': 5, 'd': 18, 'a': 5, 'e': 9}

In [3]:
class BagOfWords(object):
    """Multiset of words: maps every distinct word to its occurrence count.

    Two bags can be combined with ``+``, which sums the per-word counts.
    """

    def __init__(self):
        # Total number of add_word() calls, duplicates included.
        self.__number_of_words = 0
        # word -> number of occurrences
        self.__bag_of_words = {}

    def __add__(self, other):
        """Return a new bag holding the summed counts of both operands.

        Neither operand is modified. Returns ``NotImplemented`` when
        ``other`` is not a ``BagOfWords``.
        """
        if not isinstance(other, BagOfWords):
            return NotImplemented

        erg = BagOfWords()
        erg.__number_of_words = self.__number_of_words + other.__number_of_words
        erg.__bag_of_words = dict_merge_sum(self.__bag_of_words, other.__bag_of_words)
        # The result must be returned, otherwise ``a + b`` evaluates to None.
        return erg

    def add_word(self, word):
        """Record one more occurrence of ``word``."""
        self.__number_of_words += 1
        self.__bag_of_words[word] = self.__bag_of_words.get(word, 0) + 1

    def len(self):
        """Return the number of distinct words in the bag."""
        return len(self.__bag_of_words)

    def __len__(self):
        """Support the builtin ``len()``; same value as :meth:`len`."""
        return len(self.__bag_of_words)

    def words(self):
        """Return a view of the distinct words stored in the bag."""
        return self.__bag_of_words.keys()

    def word_freq(self, word):
        """Return how often ``word`` was added, or 0 if it never was."""
        return self.__bag_of_words.get(word, 0)

In [5]:
class Document(object):
    """A text document represented as a bag of words.

    The vocabulary is stored on the class, so every ``Document`` (and
    subclass) instance shares the bag that was passed to the most recently
    constructed instance.
    """

    _vocabulary = BagOfWords()

    def __init__(self, vocabluary):
        """Create an empty document and install ``vocabluary`` class-wide.

        The parameter keeps its original spelling so that existing keyword
        callers continue to work.
        """
        self.__name = ''
        self.__document_class = None
        self._words_and_freq = BagOfWords()

        Document._vocabulary = vocabluary

    def read_document(self, filename, learn=False):
        """Read ``filename`` and add its words to this document.

        The file is decoded as UTF-8, falling back to Latin-1 when that
        fails. The text is lower-cased and split into runs of word
        characters. When ``learn`` is true every word is also added to the
        shared vocabulary.
        """
        try:
            with open(filename, 'r', encoding='utf-8') as handle:
                text = handle.read()
        except UnicodeDecodeError:
            with open(filename, 'r', encoding='latin-1') as handle:
                text = handle.read()

        # Reset on every read; nothing in this class updates it afterwards.
        self._number_of_words = 0

        # findall(r'\w+') yields only non-empty tokens. Splitting on r'\W'
        # instead produces '' between adjacent separators, which would be
        # counted as a word.
        for word in re.findall(r'\w+', text.lower()):
            self._words_and_freq.add_word(word)
            if learn:
                Document._vocabulary.add_word(word)

    def __add__(self, other):
        """Return a new document holding the combined word counts."""
        res = Document(Document._vocabulary)
        res._words_and_freq = self._words_and_freq + other._words_and_freq

        return res

    def vocabulary_length(self):
        """Return the number of distinct words in the shared vocabulary."""
        return Document._vocabulary.len()

    def words_and_freq(self):
        """Return a new dict mapping each word of the document to its count."""
        bag = self._words_and_freq
        return {word: bag.word_freq(word) for word in bag.words()}

    def words(self):
        """Return a view of the distinct words of the document."""
        return self._words_and_freq.words()

    def word_freq(self, word):
        """Return how often ``word`` occurs in the document (0 if absent)."""
        return self._words_and_freq.word_freq(word)

    def __and__(self, other):
        """Return the list of words of ``other`` that also occur in ``self``."""
        own_words = self.words()
        return [word for word in other.words() if word in own_words]

In [6]:
class Category(Document):
    """A document class: the merged word counts of all its documents."""

    def __init__(self, vocabulary):
        """Create an empty category that uses ``vocabulary``."""
        super().__init__(vocabulary)
        self._number_of_docs = 0

    def probability(self, word):
        """Return the add-one smoothed weight of ``word`` in this category.

        Computed as ``(1 + n) / (voc_len + sum_n)`` where ``n`` is the count
        of ``word`` in this category, ``voc_len`` the number of distinct
        vocabulary words and ``sum_n`` the count of ``word`` in the shared
        vocabulary.

        Raises:
            ZeroDivisionError: if the vocabulary is empty.
        """
        voc_len = Document._vocabulary.len()
        # NOTE(review): the usual Laplace estimate divides by the total
        # number of words in the category; the original denominator is kept
        # here - confirm which one is intended.
        sum_n = Category._vocabulary.word_freq(word)
        # word_freq must be *called*; adding the bound method itself to an
        # int raises TypeError.
        n = self._words_and_freq.word_freq(word)

        return (1 + n) / (voc_len + sum_n)

    def __add__(self, other):
        """Return a new category holding the combined word counts.

        The document counter of the result starts at 0.
        """
        res = Category(self._vocabulary)
        res._words_and_freq = self._words_and_freq + other._words_and_freq

        return res

    def set_number_of_docs(self, number):
        """Store the number of documents this category was built from."""
        self._number_of_docs = number

    def number_of_documents(self):
        """Return the stored number of documents."""
        return self._number_of_docs

In [8]:
class Pool(object):
    """Collection of learned document classes sharing one vocabulary."""

    def __init__(self):
        # class name -> Category built by learn()
        self.__document_classes = {}
        self.__vocabulary = BagOfWords()

    def sum_words_in_class(self, dclass):
        """Return the total count of vocabulary words within class ``dclass``.

        Raises:
            KeyError: if ``dclass`` has not been learned.
        """
        # Fetched once; it does not change while iterating the vocabulary.
        waf = self.__document_classes[dclass].words_and_freq()

        return sum(waf[word] for word in self.__vocabulary.words() if word in waf)

    def learn(self, directory, dclass_name):
        """Learn class ``dclass_name`` from every file found in ``directory``.

        Each directory entry is read as a document with ``learn=True`` and
        merged into one category, which is stored under ``dclass_name``
        together with the number of entries read.
        """
        category = Category(self.__vocabulary)
        filenames = os.listdir(directory)

        for filename in filenames:
            document = Document(self.__vocabulary)
            document.read_document(os.path.join(directory, filename), learn=True)

            category = category + document

        category.set_number_of_docs(len(filenames))
        self.__document_classes[dclass_name] = category

    def probability(self, doc, dclass=''):
        """Score the file ``doc`` against the learned classes.

        With ``dclass`` given, return the score of ``doc`` for that class, or
        -1 when the accumulated sum is 0. Without it, return a list of
        ``[class_name, score]`` pairs for all classes, sorted by descending
        score.
        """
        if not dclass:
            prob_list = [
                [name, self.probability(doc, name)]
                for name in self.__document_classes
            ]
            prob_list.sort(key=lambda entry: entry[1], reverse=True)

            return prob_list

        sum_dclass = self.sum_words_in_class(dclass)
        target = self.__document_classes[dclass]

        d = Document(self.__vocabulary)
        d.read_document(doc)

        prob = 0

        for j, category in self.__document_classes.items():
            sum_j = self.sum_words_in_class(j)
            prod = 1

            for i in d.words():
                # Add-one smoothed counts of word i in the target class and
                # in class j.
                wf_dclass = 1 + target.word_freq(i)
                wf = 1 + category.word_freq(i)
                r = wf * sum_dclass / (wf_dclass * sum_j)
                prod *= r

            prob += prod * category.number_of_documents() / target.number_of_documents()

        if prob != 0:
            return 1 / prob
        return -1

    def document_intersection_with_classes(self, doc_name):
        """Return the share of ``doc_name``'s words found in each class.

        The result is the flat list
        ``[doc_name, class_1, ratio_1, class_2, ratio_2, ...]``. The ratio is
        0.0 for a document without any words.
        """
        res = [doc_name]

        if not self.__document_classes:
            # Nothing to compare against; the document is not read at all.
            return res

        # The document does not depend on the class, so read it only once.
        d = Document(self.__vocabulary)
        d.read_document(doc_name, learn=False)
        word_count = len(d.words())

        for dc, category in self.__document_classes.items():
            common = category & d
            intersection_ratio = len(common) / word_count if word_count else 0.0

            res += (dc, intersection_ratio)

        return res

In [None]:
# DClasses = ['clinton', 'lawyer', 'math', 'medical', 'music', 'sex']

# base = 'data/jokes/learn/'
# p = Pool()
# for dclass in DClasses:
#     p.learn(base + dclass, dclass)

# base = 'data/jokes/test/'
# results = []
# for dclass in DClasses:
#     dir = os.listdir(base + dclass)

#     for file in dir:
#         res = p.probability(base + dclass + '/' + file)
#         results.append(f'{dclass}: {file}: {str(res)}')

# print(results[:10])