In [2]:
import math
import os
import re
import string
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
corpus_path = "Materials/"
train_path = os.path.join(corpus_path,"train/")
test_path = os.path.join(corpus_path,"test/")
stopword_path = os.path.join(corpus_path,"stopwords.txt")

In [3]:
train_files_list = os.listdir(train_path)
NUM_CLASSES = len(train_files_list)
NUM_FEATURES = 10
CLASSES = []
for i in range(NUM_CLASSES):
    CLASSES.append(train_files_list[i].split('.')[0])
print("Classes are:",CLASSES)

Classes are: ['Biology', 'Chemistry', 'Physics']


In [4]:
texts = []
for file in train_files_list:
    train_file = os.path.join(train_path,file)
    print("Reading file",file)
    with open(train_file,encoding='utf8') as f:
        texts.append(f.read())

Reading file Biology.txt
Reading file Chemistry.txt
Reading file Physics.txt


In [5]:
with open(stopword_path,encoding='utf8') as s_f:
    stopwords = s_f.read().split()
print("Some stopwords we remove\n",stopwords[:20])    

Some stopwords we remove
 ['a', "a's", 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along']


In [6]:
def normalize_text(texts):
    norm_texts = []
    for text in texts:
        # Lower case
        text = text.lower()
        list_words = text.split()
        words_wo_stop = []
        # Stop word Removal
        for word in list_words:
            if word not in stopwords:
                # Lemmatization
                lemmatizer = WordNetLemmatizer()
                word = lemmatizer.lemmatize(word)
                # Stemming
                stemmer = SnowballStemmer("english")
                word = stemmer.stem(word)
                words_wo_stop.append(word)
        text = " ".join(words_wo_stop)
        # Remove punctuation and numbers
        text = re.sub('['+string.punctuation+']', ' ', text)
        text = re.sub('[·´]','',text)
        text = re.sub('[0-9]','',text)
        
        norm_texts.append(text.split())
    return norm_texts

In [7]:
normalized_texts = normalize_text(texts)

In [8]:
print(normalized_texts[0][:100])
min_length = 1000000000000
for i in range(NUM_CLASSES):
    if len(normalized_texts[i]) < min_length:
        min_length = len(normalized_texts[i])
    print("Length of Document",i,"is",len(normalized_texts[i]))
print("Min Length =",min_length)

['foreword', 'preserv', 'anim', 'plant', 'life', 'general', 'beauti', 'nature', 'foremost', 'duti', 'men', 'woman', 'to', 'day', 'imper', 'duty', 'perform', 'once', 'late', 'mean', 'preservation', 'sentimental', 'educ', 'legislative', 'must', 'employed', 'present', 'warn', 'issu', 'uncertain', 'sound', 'great', 'battl', 'preserv', 'conserv', 'won', 'gentl', 'tones', 'appeal', 'aesthet', 'instinct', 'sens', 'beauty', 'enjoy', 'nature', 'sound', 'loud', 'alarm', 'present', 'fact', 'strong', 'language', 'back', 'irrefut', 'statist', 'photograph', 'lies', 'establish', 'law', 'enforc', 'bludgeon', 'book', 'alarm', 'call', 'forc', 'page', 'remind', 'sound', 'great', 'bell', 'watch', 'tow', 'citi', 'middl', 'age', 'call', 'citizen', 'arm', 'protect', 'homes', 'liberti', 'happiness', 'undeni', 'welfar', 'happi', 'futur', 'generat', 'american', 'stake', 'battl', 'preserv', 'natur', 'selfishness', 'ignorance', 'cruelti', 'destroyers', 'longer', 'destroy', 'great', 'work']
Length of Document 0 is

In [9]:
TRUNCATE_FILES = False
if TRUNCATE_FILES:
    normalized_texts = [text[:min_length+1] for text in normalized_texts]

In [10]:
def get_counts(texts):
    counts_list = []
    for text in texts:
        uni_counts = {}
        txt_list = text
        for w in txt_list:
            if w not in uni_counts:
                uni_counts[w] = 1
            else:
                uni_counts[w] += 1
        counts_list.append(uni_counts)
    return counts_list

In [11]:
counts_dicts = get_counts(normalized_texts)

In [12]:
# Generate vocabulary
vocabulary = []
N_train = 0
counts_train = {}
for count_dict in counts_dicts:
    vocabulary = vocabulary + list(count_dict.keys())
    print("Length of doc:",sum(list(count_dict.values())))
    N_train += sum(list(count_dict.values()))
    for w in count_dict:
        if w not in counts_train:
            counts_train[w] = count_dict[w]
        else:
            counts_train[w] += count_dict[w]
vocabulary = set(vocabulary)
print(len(vocabulary))
print(N_train)
print(len(counts_train))

Length of doc: 88903
Length of doc: 57676
Length of doc: 42308
14150
188887
14150


In [13]:
def generate_probs(train_1,vocabulary,smooth=False,alpha=0.0,N=None,V=None):
    #vocabs = []
    probs_1 = {}
    assert N is not None
    if smooth:
        assert V is not None
    for word in vocabulary:
        if word in train_1:
    #for word in train_1:
            if not smooth:
                #probs_1 = {k:v/N for k,v in train_1.items()}
                probs_1[word] = train_1[word] / N
            else:
                probs_1[word] = (train_1[word] + alpha) / (N + alpha * V)
        else:
            if not smooth:
            #probs_1 = {k:v/N for k,v in train_1.items()}
                probs_1[word] = 0.0
            else:
                probs_1[word] = (0.0 + alpha) / (N + alpha * V)
    return probs_1

In [14]:
# PMI Computation
pmi_list = []
train_prob_dicts = []
for i in range(NUM_CLASSES):
    pmi_dict = {}
    # Get counts
    count_dict = counts_dicts[i]
    N = sum(list(count_dict.values()))
    # Generate the probabilities and store them
    prob_dict = generate_probs(count_dict,list(count_dict.keys()),N=N,smooth=False)
    #prob_dict = generate_probs(count_dict,vocabulary,N=N,smooth=True,V=len(vocabulary),alpha=0.1)
    train_prob_dicts.append(prob_dict)
    for word in prob_dict:
        try:
            pmi_dict[word] = math.log(prob_dict[word] / (counts_train[word] / N_train))
        except:
            # Should never come here
            print(word,prob_dict[word],counts_train[word],N_train)
    pmi_list.append(pmi_dict)

In [15]:
# Arrange by class
class_term_pmis = []
for i in range(1,NUM_CLASSES+1):
    class_term_pmis.append([])
for word in vocabulary:
    max_val = -10000000
    max_term = None
    max_class = None
    for k in range(1,NUM_CLASSES+1):
        pmi = pmi_list[k-1]
        if word in pmi:
            if pmi[word] > max_val:
                max_val = pmi[word]
                max_term = word
                max_class = k
    class_term_pmis[max_class-1].append((max_term,max_val))

In [16]:
# Top 10 of each
pmi_features_per_class = []
for c in range(1,NUM_CLASSES+1):
    pmis = class_term_pmis[c-1]
    print(train_files_list[c-1])
    sorted_by_second = sorted(pmis, key=lambda tup: tup[1],reverse=True)
    top_10 = sorted_by_second[:NUM_FEATURES]
    #print(top_10)
    #print("*************************************************")
    list_feats = [x[0] for x in top_10]
    #pmi_features_per_class += (list_feats)
    pmi_features_per_class.append(list_feats)
print("List of features:\n",pmi_features_per_class)

Biology.txt
Chemistry.txt
Physics.txt
List of features:
 [['deck', 'barwonleigh', 'colonist', 'rupe', 'gist', 'lust', 'thomas', 'disputed', 'chestnut', 'waigiou'], ['bacl', 'arsenious', 'rutherford', 'botanist', 'reduction', 'cacl', 'coinag', 'recovered', 'ccl', 'elements'], ['accommodated', 'jointly', 'ebullition', 'quaver', 'inexplicable', 'venus', 'benefactor', 'abxv', 'jupit', 'conchoid']]


In [17]:
# MI Computation
P_class = 1 / NUM_CLASSES
mi_list = []
for i in range(NUM_CLASSES):
    mi_dict = {}
    # Get counts
    count_dict = counts_dicts[i]
    N = sum(list(count_dict.values()))
    prob_dict = train_prob_dicts[i]
    for word in prob_dict:
        try:
            # Computation changes!!!
            mi_dict[word] = (prob_dict[word] * P_class) * (math.log((prob_dict[word]) / (counts_train[word] / N_train)))
            #mi_dict[word] = (prob_dict[word]) * (math.log((prob_dict[word]) / (counts_train[word] / N_train)))
        except:
            # Should never come here
            print(word,prob_dict[word],counts_train[word],N_train)
    mi_list.append(mi_dict)

In [18]:
#mi_list[1]

In [19]:
# Arrange by class
class_term_mis = []
for i in range(1,NUM_CLASSES+1):
    class_term_mis.append([])
for word in vocabulary:
    max_val = -10000000
    max_term = None
    max_class = None
    for k in range(1,NUM_CLASSES+1):
        mi = mi_list[k-1]
        if word in mi:
            if mi[word] > max_val:
                max_val = mi[word]
                max_term = word
                max_class = k
    class_term_mis[max_class-1].append((max_term,max_val))

In [20]:
# Top 10 of each
mi_features_per_class = []
for c in range(1,NUM_CLASSES+1):
    mis = class_term_mis[c-1]
    print(train_files_list[c-1])
    sorted_by_second = sorted(mis, key=lambda tup: tup[1],reverse=True)
    top_10 = sorted_by_second[:NUM_FEATURES]
    #print(top_10)
    #print("*************************************************")
    list_feats = [x[0] for x in top_10]
    #mi_features_per_class += (list_feats)
    mi_features_per_class.append(list_feats)
print("List of features:",mi_features_per_class)

Biology.txt
Chemistry.txt
Physics.txt
List of features: [['game', 'bird', 'wild', 'state', 'kill', 'law', 'year', 'protect', 'deer', 'life'], ['acid', 'form', 'h', 'oxygen', 'water', 'o', 'hydrogen', 'carbon', 'gas', 'element'], ['colour', 'light', 'ray', 'refract', 'd', 'prism', 'glass', 'red', 'part', 'distanc']]


In [21]:
# Read test data
test_files_list = os.listdir(test_path)
test_texts = []
for file in test_files_list:
    test_file = os.path.join(test_path,file)
    print("Reading file",test_file)
    with open(test_file,encoding='utf8') as f:
        test_texts.append(f.read())

Reading file Materials/test/test1.txt
Reading file Materials/test/test2.txt
Reading file Materials/test/test3.txt
Reading file Materials/test/test4.txt
Reading file Materials/test/test5.txt
Reading file Materials/test/test6.txt


In [22]:
# Normalize test texts
normalized_test_texts = normalize_text(test_texts)

In [23]:
# for i in range(NUM_CLASSES):
#     print("Printing features")
#     feature = mi_features_per_class[i]
#     for f in feature:
#         print("Count of",f,"is",counts_dicts[i][f])
#         print("MLE of",f,"is",train_prob_dicts[i][f])
# print("----------------------------------------")
# for text in normalized_test_texts:
#     print("Text")
#     for i in range(NUM_CLASSES):
#         feature = mi_features_per_class[i]
#         f_sum = 0
#         for f in feature:
#             print("Feature",f,"count",text.count(f))
#             f_sum += text.count(f)
#         print("Tot Count",f_sum)

In [24]:
print(normalized_test_texts[0][:100])

['\ufeffcontent', 'eha', 'foot', 'hand', 'ii', 'bill', 'bird', 'iii', 'tail', 'iv', 'nose', 'ear', 'vi', 'tommi', 'vii', 'barn', 'owl', 'viii', 'domest', 'anim', 'ix', 'snake', 'indian', 'snake', 'charm', 'xi', 'cure', 'snake', 'bit', 'xii', 'cobra', 'bungalow', 'xiii', 'panther', 'shoot', 'xiv', 'purbhoo', 'xv', 'coconut', 'tree', 'xvi', 'betel', 'nut', 'xvii', 'hindu', 'festiv', 'xviii', 'indian', 'poverti', 'xix', 'borrow', 'indian', 'word', 'special', 'due', 'editor', 'proprietor', 'strand', 'magazine', 'pall', 'mall', 'magazine', 'time', 'india', 'courtesi', 'permit', 'reprint', 'articl', 'book', 'origin', 'appear', 'columns', 'list', 'illustr', 'half', 'ton', 'eha', 'nose', 'eleph', 'hand', 'redeem', 'mind', 'good', 'rough', 'job', 'competit', 'keen', 'rat', 'relat', 'squirrel', 'zoolog', 'person', 'gutter', 'snipe', 'tail', 'drag', 'dirti', 'rope', 'blackbird', 'starling']


In [25]:
counts_test_dicts = get_counts(normalized_test_texts)

In [26]:
alpha = 0.1
unk_prob = []
for i in range(NUM_CLASSES):
    count_dict = counts_dicts[i]
    N = sum(list(count_dict.values()))
    unk_prob.append((0.0 + alpha) / (N + alpha * len(vocabulary)))
    #unk_prob.append((0.0 + alpha) / (N + alpha * len(vocabulary)))
#print(unk_prob)

In [27]:
'''
Naive Bayes Classifier
0 --> Biology
1 --> Chemistry
2 --> Physics
'''
for features in [pmi_features_per_class,mi_features_per_class]:
    preds = []
    if features == mi_features_per_class:
        print("**********Using MI Features**********")
    else:
        print("**********Using PMI Features**********")
    #features = pmi_features_per_class
    for text in normalized_test_texts:
        #vocab_test = list(set(text))
        vocab_test = text
        #print("Test vocab size:",len(vocab_test))
        prob_class = []
        '''
        Compute log probabilities. Then max_prob = max_log(prob)
        '''
        all_features = []
        for f in features:
            all_features += f
        for i in range((NUM_CLASSES)):
            log_prob = math.log(P_class)
            #print("LOG P_class:",log_prob)
            for token in vocab_test:
                #print("Features are:",features[i])
                #if token in features[i] and token in train_prob_dicts[i]:
                if token in all_features:
                    #print("Here")
                    if token in train_prob_dicts[i]:
                        log_prob += math.log(train_prob_dicts[i][token])
                    #else:
                    #    log_prob += math.log(unk_prob[i])
                #else:
                #    log_prob += math.log(unk_prob[i])
            prob_class.append(log_prob)
        #print("Probs:",prob_class)
        preds.append(prob_class.index(max(prob_class)))

    '''
    Now print the predictions
    '''
    for i in range(len(test_files_list)):
        print("Predicted class of document is",preds[i],"that is",CLASSES[preds[i]])

**********Using PMI Features**********
Predicted class of document is 1 that is Chemistry
Predicted class of document is 0 that is Biology
Predicted class of document is 2 that is Physics
Predicted class of document is 2 that is Physics
Predicted class of document is 0 that is Biology
Predicted class of document is 0 that is Biology
**********Using MI Features**********
Predicted class of document is 1 that is Chemistry
Predicted class of document is 2 that is Physics
Predicted class of document is 2 that is Physics
Predicted class of document is 0 that is Biology
Predicted class of document is 1 that is Chemistry
Predicted class of document is 1 that is Chemistry
