In [20]:
import random
import cv2
import math
import numpy as np
import pandas as pd
from konlpy.tag import Twitter


## Set up the initial tables for the EM algorithm

In [21]:
def get_em_idx(parameters, sentence):
    """Encode which keyword groups occur in ``sentence`` as an integer bitmask.

    Each entry of ``parameters`` is a group of keywords. A group contributes a
    1 bit when at least one of its keywords is found in ``sentence`` (tested
    with the ``in`` operator) and a 0 bit otherwise. The first group ends up
    in the most significant bit, the last group in the least significant bit.

    Returns:
        int: the bitmask, a value in ``range(2 ** len(parameters))``.
    """
    index = 0
    for keyword_group in parameters:
        hit = any(keyword in sentence for keyword in keyword_group)
        # Shift the bits collected so far and append this group's flag.
        index = index * 2 + (1 if hit else 0)
    return index

In [22]:
def make_em_tables(parameters, training_sample):
    """Build the initial EM tables from a labelled training file.

    Args:
        parameters: list of keyword groups (lists of strings). The first
            keyword of each group is used as that feature's name/column.
        training_sample: path to a tab-separated file that has a
            ``document`` column and a ``label`` column.

    Returns:
        tuple: ``(probability_init, ptable_init)``.

        * ``probability_init`` has one dict per keyword group with keys
          ``condition``, ``probability_from_H`` and ``probability_from_NH``;
          both probabilities are drawn at random from [0.4, 0.6).
        * ``ptable_init`` has one dict per feature bit pattern
          (``2 ** len(parameters)`` rows, row position == the index returned
          by ``get_em_idx``) holding the 0/1 flag of every feature, the
          number of training rows with label 1 (``counts_H``) and with any
          other label (``counts_NH``), and the resulting label frequencies
          (``probability_to_H`` / ``probability_to_NH``; 0.5 each when the
          pattern never occurs).

    Side effects:
        Writes ``probability_init.csv`` and ``ptable_init.csv`` to the
        current working directory.
    """
    n_params = len(parameters)

    ## probability_init.csv
    probability_init = [
        {
            "condition": parameter[0],
            "probability_from_H": 0.5 + (random.random() - 0.5) / 5,
            "probability_from_NH": 0.5 + (random.random() - 0.5) / 5,
        }
        for parameter in parameters
    ]
    pd.DataFrame(probability_init).to_csv("probability_init.csv")

    ## ptable_init.csv
    ptable_init = []
    for idx in range(2 ** n_params):
        row = {}
        for position, parameter in enumerate(parameters):
            # Same bit layout as get_em_idx: the first keyword group is the
            # most significant bit of the row index.
            row[parameter[0]] = (idx >> (n_params - 1 - position)) & 1
        row["counts_H"] = 0
        row["counts_NH"] = 0
        row["probability_to_H"] = 0.5
        row["probability_to_NH"] = 0.5
        ptable_init.append(row)

    ## Update information of training data to ptable_init.csv
    # `sep` must be passed by keyword: pandas 2.x rejects it positionally.
    sample = pd.read_csv(training_sample, sep="\t")

    for document, label in zip(sample["document"], sample["label"]):
        # Missing documents (NaN) are counted under pattern 0 (no keyword).
        num = get_em_idx(parameters, document) if pd.notna(document) else 0
        if label == 1:
            ptable_init[num]["counts_H"] += 1
        else:
            ptable_init[num]["counts_NH"] += 1

    for row in ptable_init:
        psum = row["counts_H"] + row["counts_NH"]
        if psum == 0:
            # Pattern never seen in training: keep the neutral 0.5 / 0.5.
            continue
        row["probability_to_H"] = row["counts_H"] / psum
        row["probability_to_NH"] = row["counts_NH"] / psum

    pd.DataFrame(ptable_init).to_csv("ptable_init.csv")

    return probability_init, ptable_init

In [23]:
def train_em_model(parameters, probability, ptable, iteration,
                   validation_sample="./ratings_data/ratings_valid.txt"):
    """Iteratively refine the per-feature and per-pattern probabilities.

    Args:
        parameters: list of keyword groups; the first keyword of each group
            is the feature's column name in ``ptable``.
        probability: list of dicts (one per keyword group, same order as
            ``parameters``) with ``probability_from_H`` / ``probability_from_NH``.
        ptable: list of dicts, one per feature bit pattern, where the row
            position equals the index produced by ``get_em_idx``. Each row
            holds the feature flags, ``counts_H``, ``probability_to_H`` and
            ``probability_to_NH``.
        iteration: number of update rounds to run.
        validation_sample: tab-separated file scored after every round;
            defaults to the path that used to be hard-coded here.

    Returns:
        tuple: ``(probability, ptable)`` after training. These are updated
        copies; the lists passed in are left untouched so the calling cell
        can be re-run from the same starting point.

    Side effects:
        Prints the validation score after each round and writes
        ``ptable_trained.csv`` and ``probability_trained.csv`` to the
        current working directory.
    """
    n_params = len(parameters)

    # Work on copies so the caller's initial tables are not modified in place.
    probability = [dict(row) for row in probability]
    ptable = [dict(row) for row in ptable]

    # Feature flags and counts never change during training: build them once.
    ptable_lookup = pd.DataFrame(ptable)
    ptable_counts = ptable_lookup["counts_H"]
    # NOTE(review): the weights are counts_H only and the pattern update below
    # applies no class prior; both are kept as in the original -- confirm intent.

    for it in range(iteration):
        # update ptable
        for idx in range(len(ptable)):
            hsum = 1
            nhsum = 1
            for i in range(n_params):
                # Parameter 0 is the MOST significant bit of the row index
                # (see get_em_idx), so read bit (n_params - 1 - i) for it.
                present = (idx >> (n_params - 1 - i)) & 1
                p_h = probability[i]['probability_from_H']
                p_nh = probability[i]['probability_from_NH']
                if present:
                    hsum *= p_h
                    nhsum *= p_nh
                else:
                    hsum *= 1 - p_h
                    nhsum *= 1 - p_nh
            total = hsum + nhsum
            # Both products zero: the split is undefined, fall back to 0.5.
            p_to_h = 0.5 if total == 0 else hsum / total
            ptable[idx]['probability_to_H'] = p_to_h
            ptable[idx]['probability_to_NH'] = 1 - p_to_h

        # update probability
        ptable_now = pd.DataFrame(ptable)
        ptable_p = ptable_now["probability_to_H"]
        ptable_np = ptable_now["probability_to_NH"]

        for idx, parameter in enumerate(parameters):
            # Row positions of the patterns in which this feature is set.
            idxs = np.where(ptable_lookup[parameter[0]] == 1)[0].astype(int)
            weights = ptable_counts.iloc[idxs]
            weight_sum = weights.sum()
            if weight_sum == 0:
                # No weighted observation with this feature: keep the
                # previous estimate instead of producing NaN.
                continue
            probability[idx]['probability_from_H'] = (weights * ptable_p.iloc[idxs]).sum() / weight_sum
            probability[idx]['probability_from_NH'] = (weights * ptable_np.iloc[idxs]).sum() / weight_sum

        # scoring to know when to stop training
        print("iteration : " + str(it) + "\tscore: " + str(score_em_model(parameters, ptable_p, validation_sample)))

    ptable_pd = pd.DataFrame(ptable)
    ptable_pd.to_csv("ptable_trained.csv")
    probability_pd = pd.DataFrame(probability)
    probability_pd.to_csv("probability_trained.csv")
    return probability, ptable

In [24]:
def score_em_model(parameters, ptable_p, validation_sample):
    """Return the classification accuracy (in percent) on a validation file.

    Args:
        parameters: list of keyword groups passed on to ``get_em_idx``.
        ptable_p: sequence indexable by the ``get_em_idx`` result that gives
            ``probability_to_H`` for each feature bit pattern.
        validation_sample: path to a tab-separated file that has a
            ``document`` column and a ``label`` column.

    Returns:
        float: percentage of rows whose label equals the prediction. A row
        is predicted 0 when its pattern's probability is below 0.5 and 1
        otherwise; rows with a missing document are always predicted 1.
        Returns 0.0 when the file has no rows.
    """
    # load validation data
    # `sep` must be passed by keyword: pandas 2.x rejects it positionally.
    sample = pd.read_csv(validation_sample, sep="\t")
    documents = sample['document']
    labels = sample['label']

    total = len(labels)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    # scoring
    correct = 0
    for sentence, expected in zip(documents, labels):
        predicted = 1
        if pd.notna(sentence) and ptable_p[get_em_idx(parameters, sentence)] < 0.5:
            predicted = 0
        if expected == predicted:
            correct += 1

    return correct * 100 / total

In [None]:
# Keyword groups: each inner list is one binary feature that is set when any of
# its keywords occurs in a document; the first keyword names the feature column.
parameters=[
    ["재미", "재밋", "재밌", "꿀잼"], 
    ["감동", "슬프", "슬퍼", "슬픔", "멋", "멋지", "멋짐", "멋져"],
    ["욕", "ㅅㅂ", "노잼", "별로", "아니"]
]
# Tab-separated training file; make_em_tables reads its 'document' and 'label' columns.
training_sample = "./ratings_data/ratings_train.txt"

# Build the initial tables (also written to probability_init.csv and ptable_init.csv).
probability, ptable = make_em_tables(parameters, training_sample)

In [None]:
# Run 10 training iterations starting from the initial tables; the validation
# score is printed after every iteration.
trained_probability, trained_ptable = train_em_model(parameters, probability, ptable, 10)

In [None]:
# Score a pattern table against the validation set.
# NOTE(review): this loads probability_to_H from ptable_init.csv (the table
# written by make_em_tables), not ptable_trained.csv -- confirm that scoring
# the initial table is what is intended here.
validation_sample = "./ratings_data/ratings_valid.txt"
ptable_p = pd.read_csv("ptable_init.csv")['probability_to_H']

score = score_em_model(parameters, ptable_p, validation_sample)
print(score)