In [2]:
# !pip install hmmlearn
# !pip install scikit-learn



In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from hmmlearn import hmm
import os
import glob
import pickle

In [4]:
##reading from the text file for observation sequnce. Only using 4.0 for now

def readInputFromFile(file_path):
    with open(file_path, "r") as file:
        lines = file.readlines()
        seqs = lines[0].split("K")
        words = " ".join(seqs).split(" ")
        string_id = {word: i for i, word in enumerate(set(words))}

        seqs = [s.split(" ") for s in seqs]
        ##convert words in seqs to numbers
        seqs = [[string_id[w] for w in s] for s in seqs]
        # lines = file.readlines()
        # print(len(lines))
        # string_to_id = {string: i for i, string in enumerate(set(lines[0]))}
        # raw_input = lines[0].split("K")
        # print(len(raw_input))
        # raw_string_pre_pad = [s.split(" ") for s in raw_input ]
        
        max_seq = max(len(s) for s in seqs)
        hmm_input = [np.pad(s, (0, max_seq - len(s)), constant_values=9999) for s in seqs]
        # print(hmm_input[0])
    return hmm_input

score_set = []

for s in ["3.5", "4.0", "0.5", "6.0"]:
    train_input = readInputFromFile("train/" + s + ".txt")
    test_input = readInputFromFile("test/" + s + ".txt")
    score_set.append((s, train_input, test_input))


In [5]:
def findBestModel(n_fits, N, train_input, test_input):
    best_score = best_model = None
    np.random.seed(13)

    for idx in range(n_fits):
        model = hmm.CategoricalHMM(
        n_components=N, random_state=idx,
        init_params='se')  # don't init transition, set it below
    # we need to initialize with random transition matrix probabilities
    # because the default is an even likelihood transition
    # we know transitions are rare (otherwise the casino would get caught!)
    # so let's have an Dirichlet random prior with an alpha value of
    # (0.1, 0.9) to enforce our assumption transitions happen roughly 10%
    # of the time

    ##A 
        epsilon = 1e-6
        transmat = np.full((N, N), 1 / N) + epsilon
        transmat /= transmat.sum(axis=1, keepdims=True)
    # print(A_matrix)
    # model.transmat_ = np.array([np.random.dirichlet([0.9, 0.1]),
    #                             np.random.dirichlet([0.1, 0.9])])
        model.transmat_ = transmat
        model.fit(train_input)
        score = model.score(test_input)
        print(f'Model #{idx}\tScore: {score}')
        if best_score is None or score > best_score:
            best_model = model
            best_score = score
    return best_score, best_model



In [24]:
# import concurrent.futures
# def findBestModel(n_fits, N, train_input, test_input):
#     num_workers = 4
#     best_score = best_model = None
#     np.random.seed(13)
#     with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
#         for idx in range(n_fits):
#             model = hmm.CategoricalHMM(
#             n_components=N, random_state=idx,
#             init_params='se') 
#             arguments = [train_input] * num_workers
#             futures = [executor.submit(model.fit, arg) for arg in arguments]
#             concurrent.futures.wait(futures)
#             result = [future.result() for future in futures]
#             epsilon = 1e-6
#             transmat = np.full((N, N), 1 / N) + epsilon
#             transmat /= transmat.sum(axis=1, keepdims=True)
#     # print(A_matrix)
#     # model.transmat_ = np.array([np.random.dirichlet([0.9, 0.1]),
#     #                             np.random.dirichlet([0.1, 0.9])])
#             model.transmat_ = transmat
#             model.fit(train_input)
#             score = model.score(test_input)
#             print(f'Model #{idx}\tScore: {score}')
#             if best_score is None or score > best_score:
#                 best_model = model
#                 best_score = score
#     return best_score, best_model
# s, train_input, test_input = score_set[0]
# findBestModel(10,5, train_input, test_input)

Model #0	Score: -4067214.0594240604
Model #1	Score: -4346488.444985692
Model #2	Score: -4206171.797621992


In [19]:
s, train_input, test_input = score_set[0]

In [7]:
import concurrent.futures
num_workers = 4
n_fits = 20
N = 10
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    for s, train_input, test_input in score_set:
        arguments = [(n_fits, N, train_input, test_input)] * num_workers
        futures = [executor.submit(findBestModel, *arg) for arg in arguments]
        concurrent.futures.wait(futures)
        result = [future.result() for future in futures]
        print(result)
        best_result = min(result, key=lambda x: x[0])
    lowest_best_score, lowest_best_model = best_result
    with open("model/" + s + " model" + str(lowest_best_score) + ".pkl", "wb") as file:
        pickle.dump(lowest_best_model, file)

Model #0	Score: -2911818.172335366
Model #0	Score: -2911818.172335366
Model #0	Score: -2911818.172335366
Model #0	Score: -2911818.172335366
Model #1	Score: -2914912.973114216
Model #1	Score: -2914912.973114216
Model #1	Score: -2914912.973114216
Model #1	Score: -2914912.973114216
Model #2	Score: -2889247.3034791513
Model #2	Score: -2889247.3034791513
Model #2	Score: -2889247.3034791513
Model #2	Score: -2889247.3034791513
Model #3	Score: -3230960.2550703688
Model #3	Score: -3230960.2550703688
Model #3	Score: -3230960.2550703688
Model #3	Score: -3230960.2550703688
Model #4	Score: -2894357.638273493
Model #4	Score: -2894357.638273493
Model #4	Score: -2894357.638273493
Model #4	Score: -2894357.638273493
Model #5	Score: -3236777.035898587
Model #5	Score: -3236777.035898587
Model #5	Score: -3236777.035898587
Model #5	Score: -3236777.035898587
Model #6	Score: -3095666.813885435
Model #6	Score: -3095666.813885435
Model #6	Score: -3095666.813885435
Model #6	Score: -3095666.813885435
Model #7	Sco

In [None]:
import pickle
with open("model" + str(best_score) + ".pkl", "wb") as file: 
    pickle.dump(best_model, file)

In [55]:
def generateHMMScoreOnEssay(model_path, essay_folder):
    ##read model
    model_list = []
    scores = [f for f in os.listdir(model_path) if os.path.isdir(os.path.join(model_path, f))]
    result = []
    for score in scores:
        model_file = glob.glob(os.path.join(model_path, score, '*.pkl'))[0]
        with open(model_file, "rb") as file: 
            model = pickle.load(file)
            model_list.append((score,model))
    scores = [f for f in os.listdir(essay_folder) if os.path.isdir(os.path.join(essay_folder, f))]
    for score in scores:
        text_files = glob.glob(os.path.join(essay_folder, score, '*.txt'))
        for text_file in text_files:
            parts = text_file.split('/')[-1].split(".")
            essay_id = parts[0]
            # print(f"Reading essay {essay_id}")
            hmm_input = readInputFromFile(text_file)
            hmm_result = [essay_id]
            for s, model in model_list:
                # print(f"Training essay {essay_id} against model-{s}")
                hmm_score = model.score(hmm_input)
                hmm_result.append(hmm_score)    
            # hmm_result.append(score)        
            result.append(hmm_result)
    return result
    
hmm_result = generateHMMScoreOnEssay("model","essay_hmm")
hmm_result_df = pd.DataFrame(hmm_result)
col = ['id', '6.0', '4.0', '0.5', '3.5']
hmm_result_df.columns = col
##6 4 .5 3.5

In [56]:
hmm_result_df.to_csv("essay_hmm_reult.csv")

In [58]:
asdf = pd.read_csv("essay_hmm_reult.csv")
asdf.head(10)

Unnamed: 0.1,Unnamed: 0,id,6.0,4.0,0.5,3.5
0,0,1ab08776,-40362.653518,-42101.252496,-19992.670827,-48030.299251
1,1,999246d1,-19349.691629,-33899.658801,-25516.363571,-27380.0064
2,2,af2374f2,-19834.012069,-33813.217533,-25908.687236,-26396.870311
3,3,869a7835,-17898.491125,-34751.671364,-25449.233747,-26112.786653
4,4,6a46d751,-33192.94569,-26779.961371,-27650.054977,-37159.989133
5,5,8b74c1d4,-22566.61265,-38314.502648,-29158.82188,-30505.557462
6,6,5f3386f6,-22899.412621,-37941.303786,-26667.821472,-26433.691674
7,7,f27ece1e,-44266.608307,-50768.740354,-29622.653347,-42358.133139
8,8,a6d90b13,-7551.315975,-18960.328383,-13554.150626,-12806.662457
9,9,79749297,-41403.334792,-35342.197515,-34158.783949,-47158.746431


In [22]:
with open("model/0.5/0.5 model-11845.397847443934.pkl", "rb") as file: 
    read_best_model = pickle.load(file)
    print(read_best_model)

CategoricalHMM(init_params='se', n_features=10000,
               random_state=RandomState(MT19937) at 0x7F6885B64640)


In [32]:

read_best_model.score(test_input)

[array([  36,   21,   21, ..., 9999, 9999, 9999]),
 array([   0,   36,   21, ..., 9999, 9999, 9999]),
 array([   0,   36,   21, ..., 9999, 9999, 9999]),
 array([   0,   36,   21, ..., 9999, 9999, 9999]),
 array([ 0, 36, 21, ..., 27, 25,  0]),
 array([   0,   36,   21, ..., 9999, 9999, 9999]),
 array([   0,   36,   21, ..., 9999, 9999, 9999]),
 array([   0,   36,   21, ..., 9999, 9999, 9999])]