### Import libraries

In [1]:
import pandas as pd
from collections import Counter
from functools import reduce
import numpy as np
import re

### Loading data

In [2]:
data_train, data_test = [], []

# Fill each list with the lines of its file, stripped of surrounding
# whitespace (blank lines are kept as empty strings).
for split_path, split_lines in (("fold436.train", data_train),
                                ("fold436.test", data_test)):
    with open(split_path) as split_file:
        split_lines.extend(raw_line.strip() for raw_line in split_file)

### Preprocessing data

#### Loading Java reserved words

In [3]:
reserved_words = []

# One entry per line of java_words.txt, stripped of surrounding whitespace
with open("java_words.txt") as words_file:
    reserved_words.extend(entry.strip() for entry in words_file)

In [4]:
def preprocess_data(data: list):
    """Drop reserved and non-alphabetic tokens from every entry of ``data``.

    Each entry is split on whitespace; a token survives only if it is not
    listed in the module-level ``reserved_words`` and ``str.isalpha()`` is
    true for it. Survivors are re-joined with single spaces.

    Returns a new list with exactly one (possibly empty) string per input
    entry, in the original order.
    """
    def is_kept(token):
        return token not in reserved_words and token.isalpha()

    return [" ".join(filter(is_kept, entry.split())) for entry in data]

# Overwrite both splits with their filtered versions
data_train = preprocess_data(data=data_train)
data_test = preprocess_data(data=data_test)

### Getting entropy values for the unigram model

In [5]:
def get_probabilities_unigram(data: list):
    """Return the relative frequency of every token occurrence in ``data``.

    All entries of ``data`` are joined and split on whitespace. The result
    has one float per token *occurrence*, in corpus order: position ``i``
    holds ``count(token_i) / total_number_of_tokens``.

    Returns an empty list when ``data`` contains no tokens.
    """
    tokens = " ".join(data).split()
    counts = Counter(tokens)
    total = len(tokens)

    # Dictionary lookup per token instead of a linear search through the
    # vocabulary for every occurrence.
    return [counts[token] / total for token in tokens]

def get_entropy_unigram(sentence: list, data: list, probabilities: list):
    """Return ``-sum(p * log2(p))`` over every token occurrence of ``sentence``.

    Parameters:
        sentence: list of strings to evaluate; joined and split on whitespace.
        data: list of strings the probabilities were estimated from.
        probabilities: one probability per token occurrence of ``data``, in
            the same order (the shape produced by
            ``get_probabilities_unigram``).

    For each evaluated token, ``p`` is the entry of ``probabilities`` at the
    position where that token first occurs in ``data``. Tokens that never
    occur in ``data`` contribute 0.
    """
    # Position of the first occurrence of every token of ``data``, built once
    # so each evaluated token costs a dictionary lookup instead of two scans.
    first_position = {}
    for position, token in enumerate(" ".join(data).split()):
        first_position.setdefault(token, position)

    entropy_values = []
    for token in " ".join(sentence).split():
        position = first_position.get(token)
        if position is None:
            # Unseen token: contributes nothing to the sum
            entropy_values.append(0)
        else:
            probability = probabilities[position]
            entropy_values.append(probability * np.log2(probability))

    return -1 * sum(entropy_values)

# Estimate unigram probabilities on the training split, then score the test split
probabilities_unigram = get_probabilities_unigram(data=data_train)
entropy_unigram = get_entropy_unigram(
    sentence=data_test,
    data=data_train,
    probabilities=probabilities_unigram,
)

In [6]:
entropy_unigram

12.24369647410284

### Getting the entropy values for the bigram model

In [26]:
def get_probabilities_bigram(data: list):
    """Return the distinct bigrams of ``data`` and one probability for each.

    Bigrams are pairs of adjacent whitespace-separated tokens taken within
    each entry of ``data`` (pairs never span two entries).

    Returns a tuple ``(keys, probabilities)`` of two parallel lists:
        keys: each distinct bigram as ``"first second"``, in order of first
            appearance.
        probabilities: ``count(bigram) / count(first)``, where
            ``count(first)`` counts every occurrence of the first word in
            ``data`` (including occurrences at the end of an entry).
    """
    pair_counts = Counter()
    token_counts = Counter()

    for items in data:
        tokens = items.split()
        token_counts.update(tokens)
        # zip over the shifted list yields each adjacent pair once
        pair_counts.update(zip(tokens, tokens[1:]))

    keys = [first + " " + second for first, second in pair_counts]
    probabilities = [
        count / token_counts[first]
        for (first, _), count in pair_counts.items()
    ]

    return keys, probabilities

def get_indices(keys: list, token: str):
    """Return the positions in ``keys`` whose first word equals ``token``.

    Every key is split on whitespace and only its first piece is compared,
    so ``"a b"`` matches token ``"a"`` while ``"ab c"`` does not.
    """
    return [
        position
        for position, key in enumerate(keys)
        if key.split()[0] == token
    ]

def get_entropy_bigram(sentence: list,
                       data: list,
                       keys: list,
                       probabilities_unigram: list,
                       probabilities_bigram: list):
    """Return the negated sum of per-bigram entropy terms over ``sentence``.

    Parameters:
        sentence: list of strings to evaluate; bigrams are adjacent token
            pairs within each string.
        data: list of strings the probabilities were estimated from.
        keys: known bigrams as ``"first second"`` strings.
        probabilities_unigram: one probability per token occurrence of
            ``data``, in corpus order.
        probabilities_bigram: probabilities parallel to ``keys``.

    For every evaluated bigram found in ``keys`` the contribution is
    ``p(first) * sum(q * log2(q))``, where ``p(first)`` is the unigram
    probability stored at the first occurrence of ``first`` in ``data`` and
    ``q`` ranges over the probabilities of all keys starting with ``first``.
    Bigrams not found in ``keys`` contribute 0.

    Raises:
        ValueError: if a matched bigram's first word does not occur in
            ``data``.
    """
    # Position of the first occurrence of every training token
    first_position = {}
    for position, token in enumerate(" ".join(data).split()):
        first_position.setdefault(token, position)

    known_bigrams = set(keys)

    # Positions in ``keys`` grouped by the first word of each key, in
    # ascending order so the summation order matches a left-to-right scan.
    positions_by_first = {}
    for position, key in enumerate(keys):
        positions_by_first.setdefault(key.split()[0], []).append(position)

    # The contribution depends only on the first word, so compute it once
    contribution_by_first = {}
    entropy_values = []

    for items in sentence:
        tokens = items.split()
        for first, second in zip(tokens, tokens[1:]):
            if first + " " + second not in known_bigrams:
                entropy_values.append(0)
                continue

            if first not in contribution_by_first:
                if first not in first_position:
                    raise ValueError(f"{first!r} is not in list")
                probability_unigram = probabilities_unigram[first_position[first]]
                probs_bigram = np.array(
                    [probabilities_bigram[position]
                     for position in positions_by_first.get(first, [])]
                )
                contribution_by_first[first] = (
                    probability_unigram * sum(probs_bigram * np.log2(probs_bigram))
                )

            entropy_values.append(contribution_by_first[first])

    return -1 * sum(entropy_values)

# Estimate bigram probabilities on the training split, then score the test split
keys_bigram, probabilities_bigram = get_probabilities_bigram(data=data_train)
entropy_bigram = get_entropy_bigram(
    sentence=data_test,
    data=data_train,
    keys=keys_bigram,
    probabilities_unigram=probabilities_unigram,
    probabilities_bigram=probabilities_bigram,
)

In [27]:
entropy_bigram

5.578933378491214

### Getting the entropy values for the trigram model

In [28]:
def get_probabilities_trigram(data: list):
    """Return the distinct trigrams of ``data`` and one probability for each.

    Trigrams are runs of three adjacent whitespace-separated tokens taken
    within each entry of ``data`` (they never span two entries).

    Returns a tuple ``(keys, probabilities)`` of two parallel lists:
        keys: each distinct trigram as ``"first second third"``, in order
            of first appearance.
        probabilities: ``count(trigram) / count(first)``.

    NOTE(review): the denominator is the count of the trigram's first word
    alone, not of its two-word prefix, so the value is not the usual
    ``P(third | first second)`` estimate. Kept as-is because downstream
    results depend on it -- confirm whether this is intended.
    """
    triple_counts = Counter()
    token_counts = Counter()

    for items in data:
        tokens = items.split()
        token_counts.update(tokens)
        # zip over two shifted copies yields each adjacent triple once
        triple_counts.update(zip(tokens, tokens[1:], tokens[2:]))

    keys = [" ".join(triple) for triple in triple_counts]
    probabilities = [
        count / token_counts[triple[0]]
        for triple, count in triple_counts.items()
    ]

    return keys, probabilities


def get_entropy_trigram(sentence: list,
                       data: list,
                       keys: list,
                       keys_bigram: list,
                       probabilities_unigram: list,
                       probabilities_bigram: list,
                       probabilities_trigram: list):
    """Return the negated sum of per-trigram entropy terms over ``sentence``.

    Parameters:
        sentence: list of strings to evaluate; trigrams are runs of three
            adjacent tokens within each string.
        data: list of strings the probabilities were estimated from.
        keys: known trigrams as ``"first second third"`` strings.
        keys_bigram: known bigrams as ``"first second"`` strings.
        probabilities_unigram: one probability per token occurrence of
            ``data``, in corpus order.
        probabilities_bigram: probabilities parallel to ``keys_bigram``.
        probabilities_trigram: probabilities parallel to ``keys``.

    For every evaluated trigram found in ``keys`` the contribution is
    ``p(first) * p(first second) * sum(q * log2(q))``, where ``q`` ranges
    over the probabilities of all keys starting with ``first``. Trigrams
    not found in ``keys`` contribute 0.

    NOTE(review): ``q`` is selected by the first word only, not by the
    two-word prefix -- kept as-is to preserve results; confirm intent.

    Raises:
        ValueError: if a matched trigram's first word does not occur in
            ``data`` or its leading bigram is not in ``keys_bigram``.
    """
    # Position of the first occurrence of every training token
    first_position = {}
    for position, token in enumerate(" ".join(data).split()):
        first_position.setdefault(token, position)

    # Position of the first occurrence of every bigram key
    bigram_position = {}
    for position, key in enumerate(keys_bigram):
        bigram_position.setdefault(key, position)

    known_trigrams = set(keys)

    # Positions in ``keys`` grouped by the first word of each key, in
    # ascending order so the summation order matches a left-to-right scan.
    positions_by_first = {}
    for position, key in enumerate(keys):
        positions_by_first.setdefault(key.split()[0], []).append(position)

    # sum(q * log2(q)) depends only on the first word, so compute it once
    log_sum_by_first = {}
    entropy_values = []

    for items in sentence:
        tokens = items.split()
        for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
            if first + " " + second + " " + third not in known_trigrams:
                entropy_values.append(0)
                continue

            bigram = first + " " + second
            if first not in first_position:
                raise ValueError(f"{first!r} is not in list")
            if bigram not in bigram_position:
                raise ValueError(f"{bigram!r} is not in list")
            probability_unigram = probabilities_unigram[first_position[first]]
            probability_bigram = probabilities_bigram[bigram_position[bigram]]

            if first not in log_sum_by_first:
                probs_trigram = np.array(
                    [probabilities_trigram[position]
                     for position in positions_by_first.get(first, [])]
                )
                log_sum_by_first[first] = sum(probs_trigram * np.log2(probs_trigram))

            partial_result = probability_unigram * probability_bigram * log_sum_by_first[first]
            entropy_values.append(partial_result)

    return -1 * sum(entropy_values)

# Estimate trigram probabilities on the training split, then score the test split
keys_trigram, probabilities_trigram = get_probabilities_trigram(data=data_train)
entropy_trigram = get_entropy_trigram(
    sentence=data_test,
    data=data_train,
    keys=keys_trigram,
    keys_bigram=keys_bigram,
    probabilities_unigram=probabilities_unigram,
    probabilities_bigram=probabilities_bigram,
    probabilities_trigram=probabilities_trigram,
)

In [30]:
entropy_trigram

1.8464322432160059