In [1]:
import math
import pandas as pd

### Reading the files 

In [2]:
def readFile(fileName):
    """Parse a whitespace-delimited text file of integers into a list of rows.

    Each line of the file becomes one inner list holding that line's
    whitespace-separated tokens converted with ``int``.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A list with one list of ints per line, in file order. A line that
        contains no tokens yields an empty inner list.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a token is not a valid integer literal.
    """
    # The context manager closes the handle on every exit path, including
    # when int() raises part-way through; the handle used to be left open.
    with open(fileName, "r") as file:
        # str.split() with no arguments already ignores leading/trailing
        # whitespace, so no separate strip() is needed.
        return [[int(token) for token in line.split()] for line in file]

In [3]:
# Each call yields a list with one list of ints per line of the file.
X = readFile("X.txt")  # rows indexed as X[i][j] by the functions below
y = readFile("Y.txt")  # only the first entry of each row (y[i][0]) is read

### Parameters and Initialization

In [4]:
n = 23          # number of entries p[j] / X[i][j] the loops read per row
T = 267         # number of rows of X and y the loops iterate over
max_iter = 256  # number of EM iterations to run
p = [0.05 for _ in range(n)]  # every parameter starts at the same value

### Log-likelihood, EM Update and Error functions 

In [5]:
def log_likelihood(p,X,y):
    """Return the average log-likelihood of the labels under a noisy-OR model.

    For example ``i`` the probability of label 0 is
    ``prod_j (1 - p[j]) ** X[i][j]`` and the probability of label 1 is one
    minus that product.

    Args:
        p: Sequence of per-input probabilities.
        X: Sequence of examples; ``X[i][j]`` is used as the exponent for
            input ``j`` of example ``i``.
        y: Sequence of rows whose first element is the label. A label equal
            to 1 uses the label-1 probability; any other value uses the
            label-0 probability.

    Returns:
        The mean per-example log-likelihood rounded to 5 decimal places, or
        ``float("-inf")`` when some example has probability exactly zero.

    Raises:
        ValueError: If ``X`` is empty, or (from ``math.log``) if a
            probability is negative because ``p`` is outside ``[0, 1]``.
    """
    # Sizes are taken from the arguments instead of the notebook-level
    # ``T`` and ``n`` so the result is right for any data set passed in.
    num_examples = len(X)
    if num_examples == 0:
        raise ValueError("log_likelihood requires at least one example")

    total = 0.0
    for i in range(num_examples):
        row = X[i]
        prob_zero = 1.0
        for j, p_j in enumerate(p):
            prob_zero *= pow(1 - p_j, row[j])

        likelihood = 1 - prob_zero if y[i][0] == 1 else prob_zero
        if likelihood == 0:
            # math.log(0) raises; one impossible example makes the whole
            # data set's log-likelihood negative infinity.
            return float("-inf")
        total += math.log(likelihood)

    return round(total / num_examples, 5)

In [6]:
def mistakes(p,X,y):
    """Count the examples that the noisy-OR model misclassifies.

    The probability of label 1 for example ``i`` is
    ``1 - prod_j (1 - p[j]) ** X[i][j]``.

    Args:
        p: Sequence of per-input probabilities.
        X: Sequence of examples; ``X[i][j]`` is used as the exponent for
            input ``j`` of example ``i``.
        y: Sequence of rows whose first element is the label (0 or 1).

    Returns:
        The number of examples with label 0 and probability >= 0.5, plus
        the number with label 1 and probability <= 0.5.
    """
    errors = 0
    # Sizes are taken from the arguments instead of the notebook-level
    # ``T`` and ``n`` so the count is right for any data set passed in.
    for i in range(len(X)):
        row = X[i]
        prob_zero = 1.0
        for j, p_j in enumerate(p):
            prob_zero *= pow(1 - p_j, row[j])
        prob_one = 1 - prob_zero

        label = y[i][0]
        # A probability of exactly 0.5 counts as a mistake for either label.
        if (label == 0 and prob_one >= 0.5) or (label == 1 and prob_one <= 0.5):
            errors += 1
    return errors

In [7]:
def emUpdate(p,X,y,max_iter,res,output_iter=(1,2,4,8,16,32,64,128,256)):
    """Run EM updates for the noisy-OR model, recording metrics at checkpoints.

    Each iteration replaces ``p[k]`` with the sum over examples of
    ``X[j][k] * y[j][0] * p[k] / (1 - prod_m (1 - p[m]) ** X[j][m])``
    divided by the column total ``sum_j X[j][k]``.

    Args:
        p: Initial per-input probabilities. The caller's sequence is not
            modified; a new list is built on every iteration.
        X: Sequence of examples; ``X[j][k]`` is the value of input ``k``.
        y: Sequence of rows whose first element is the label.
        max_iter: Number of EM iterations to run.
        res: List that receives one ``[iteration, mistakes, log_likelihood]``
            entry per checkpoint. It is appended to in place.
        output_iter: 1-based iteration numbers at which metrics are
            recorded. Defaults to the powers of two from 1 to 256.

    Returns:
        The same ``res`` list that was passed in.
    """
    num_inputs = len(p)
    num_examples = len(X)

    # Column totals depend only on X, so they are computed once instead of
    # being rebuilt on every iteration.
    active_counts = [0] * num_inputs
    for row in X:
        for k in range(num_inputs):
            active_counts[k] += row[k]

    for i in range(max_iter):
        posterior_sums = [0.0] * num_inputs
        for j in range(num_examples):
            label = y[j][0]
            if label == 0:
                # Every term of this example is multiplied by the label.
                continue

            row = X[j]
            prob_zero = 1.0
            for k in range(num_inputs):
                prob_zero *= pow(1 - p[k], row[k])

            denom = 1 - prob_zero
            if denom == 0:
                # The model gives this example zero probability of being
                # positive (e.g. no input is active); dividing would raise
                # ZeroDivisionError, so the example is skipped.
                continue

            for k in range(num_inputs):
                posterior_sums[k] += (row[k] * label * p[k]) / denom

        # An input that is never active has no terms to average, so its
        # parameter keeps the previous value instead of dividing by zero.
        p = [
            posterior_sums[k] / active_counts[k] if active_counts[k] else p[k]
            for k in range(num_inputs)
        ]

        if i + 1 in output_iter:
            ll = log_likelihood(p, X, y)
            m = mistakes(p, X, y)
            res.append([i + 1, m, ll])
    return res

### Performance at iteration 0

In [8]:
# Error count for the starting parameters, before any EM update.
m = mistakes(p, X, y)
print(f"Initial mistakes: {m}")

Initial mistakes: 175


In [9]:
# Average log-likelihood for the starting parameters, before any EM update.
ll = log_likelihood(p, X, y)
print(f"Initial log-likelihood: {ll}")

Initial log-likelihood: -0.95809


In [10]:
# First row of the results: [iteration, mistakes, log-likelihood] at iteration 0.
res = [
    [0, m, ll],
]

### Model Training

In [11]:
# emUpdate appends to the list it is given and returns it. Passing a copy
# keeps `res` at its iteration-0 row, so re-running this cell does not pile
# up duplicate rows. The iteration count comes from the `max_iter` constant
# (256) instead of a repeated literal.
ans = emUpdate(p, X, y, max_iter, list(res))

### Result

In [12]:
# One row per recorded checkpoint; as the cell's last expression the frame
# is displayed as the cell output.
pd.DataFrame(
    data=ans,
    columns=["Iteration","Number of Mistakes","Log-Likelihood"],
)

Unnamed: 0,Iteration,Number of Mistakes,Log-Likelihood
0,0,175,-0.95809
1,1,56,-0.49592
2,2,43,-0.40822
3,4,42,-0.36461
4,8,44,-0.3475
5,16,40,-0.33462
6,32,37,-0.32258
7,64,37,-0.31483
8,128,36,-0.31116
9,256,36,-0.31016
