# Project Details

### AIM : 
<b> To implement a language model that plays the Hangman game better than traditional models that depend on unigram, bigram, and n-gram statistics.</b>

#### Corpus Description
##### Training Corpus
<ul>
<li>Name: <b> train.txt.</b></li>
<li>Description: <b> Contains 753064 lowercase English words</b></li>
</ul>

##### Testing Corpus
<ul>
<li>Name: <b> test.txt.</b></li>
<li>Description: <b> Contains 11118 lowercase English words</b></li>
</ul>


In [None]:
import random
import time
words = []
start_time = time.time()
with open("train.txt", mode="r") as myFile:
    for line in myFile:
        try:
            words.append(line.strip().encode("utf-8"))
        except:
            pass

print "Reading the file takes %s seconds" % (time.time() - start_time)

In [None]:
#### Calculating alphabet probabilities in the list of words
"""
This function returns the probability of each letter of the alphabet, calculated on the basis of
its occurrence in the word list passed to it.
"""
import operator
def count_alpha_occurence(list_of_words):
    """Return the relative frequency of each letter 'a'..'z' in the words.

    Args:
        list_of_words: iterable of strings.

    Returns:
        A list of 26 floats; entry ``ord(letter) - ord('a')`` is the number
        of occurrences of that letter divided by the total number of
        'a'..'z' characters seen.  All entries are 0.0 when no such
        character is present (including an empty word list).

    Each character occurrence is counted exactly once.  Characters outside
    'a'..'z' are ignored so they can neither raise an IndexError nor be
    credited to a wrong letter through negative indexing.
    """
    start_time = time.time()
    counts = [0] * 26
    base = ord('a')
    for word in list_of_words:
        for char in word:
            slot = ord(char) - base
            if 0 <= slot < 26:
                counts[slot] += 1

    total = sum(counts)
    if total:
        freq_dict = [float(count) / total for count in counts]
    else:
        # Nothing to normalise by; avoid dividing by zero.
        freq_dict = [0.0] * 26
    print("Calculating alphabet freuency %s seconds" % (time.time() - start_time))
    return freq_dict

In [None]:
#### Calculating the conditional probabilties of an alphabet
"""
 For a given letter 'a', it returns a dictionary where each key is a letter 'b' and the value is the
 probability of 'b' occurring in a word given that 'a' is present in the word. This exploits the fact
 that the occurrence of a letter depends on the company it keeps.
"""
import string
def conditional_prob(words):
    """Estimate P(b occurs in a word | a occurs in the word) per letter pair.

    Args:
        words: iterable of strings.

    Returns:
        A dict with one key per lowercase ASCII letter ``a``.  ``result[a]``
        maps every other character ``b`` that shares at least one word with
        ``a`` to (number of words containing both) / (number of words
        containing ``a``).  Letters that never occur map to an empty dict.

    Presence is counted once per word, so every value lies in (0, 1].
    The corpus is traversed once instead of once per letter.
    """
    start_time = time.time()
    alphabet = string.ascii_lowercase
    letter_freq = dict.fromkeys(alphabet, 0)
    prob = dict((letter, {}) for letter in alphabet)

    for word in words:
        present = set(word)  # distinct characters: presence, not multiplicity
        for letter in present:
            if letter not in prob:
                # Only lowercase letters are used as the conditioning event.
                continue
            letter_freq[letter] += 1
            row = prob[letter]
            for other in present:
                if other != letter:
                    row[other] = row.get(other, 0) + 1

    for letter, row in prob.items():
        # A non-empty row implies letter_freq[letter] >= 1, so no zero division.
        for other in row:
            row[other] = float(row[other]) / letter_freq[letter]
    print("Conditional Probability Calculation %s seconds" % (time.time() - start_time))
    return prob


In [None]:
#### Main function
"""
This function checks whether each guessed letter exists in the word and returns the number of
wrong guesses made while trying to predict the word.
"""


# Letters tried in the most recent *lost* game (see hangman()).
guesses_for_this_word = []


def hangman(secret_word, guesser, max_mistakes=8, verbose=True):
    """Play one game of Hangman and return the number of mistakes made.

    Args:
        secret_word: the word to guess; it is lower-cased before play.
        guesser: callable ``guesser(mask, guessed)`` returning the next
            guess.  ``mask`` is a list with one entry per character of the
            word ('_' for letters not yet revealed) and ``guessed`` is the
            set of guesses made so far.
        max_mistakes: the game ends once this many mistakes are reached.
        verbose: when true, progress messages are printed.

    Returns:
        The number of mistakes.  A wrong guess and a repeated guess each
        count as one mistake.  A value below ``max_mistakes`` means the
        word was completed.

    Side effect:
        The module-level ``guesses_for_this_word`` is reset to ``[]`` when
        the game starts and is filled with the guessed letters (in set
        order, not guess order) only when the game is lost.
    """
    global guesses_for_this_word
    guesses_for_this_word = []
    secret_word = secret_word.lower()
    mask = ['_'] * len(secret_word)
    guessed = set()
    # Every message is formatted into a single string so that print(...)
    # produces the same text under both Python 2 and Python 3; passing
    # several arguments would print a tuple under Python 2.
    if verbose:
        print("Starting hangman game. Target is %s length %s"
              % (' '.join(mask), len(secret_word)))

    mistakes = 0
    while mistakes < max_mistakes:
        if verbose:
            print("You have %s attempts remaining." % (max_mistakes - mistakes,))
        guess = guesser(mask, guessed)

        if verbose:
            print("Guess is %s" % (guess,))
        if guess in guessed:
            if verbose:
                print('Already guessed this before.')
            mistakes += 1
        else:
            try:
                guessed.add(guess)
            except TypeError:
                # An unhashable guess cannot be stored; report it and carry on.
                print("%s %s" % (guessed, guess))
            if guess in secret_word:
                for i, c in enumerate(secret_word):
                    if c == guess:
                        mask[i] = c
                if verbose:
                    print("Good guess: %s" % (' '.join(mask),))
            else:
                if verbose:
                    print('Sorry, try again.')
                mistakes += 1

        if '_' not in mask:
            if verbose:
                print('Congratulations, you won.')
            return mistakes

    if verbose:
        print("Out of guesses. The word was %s" % (secret_word,))
    guesses_for_this_word = list(guessed)
    return mistakes

In [None]:
def randomly(mask, guesses):
    """Baseline guesser: pick uniformly among the letters not yet guessed.

    Args:
        mask: current word mask (unused; present for the guesser interface).
        guesses: collection of letters already guessed.

    Returns:
        One lowercase ASCII letter that is not in ``guesses``.

    Raises:
        ValueError: if all 26 letters have already been guessed.

    Drawing directly from the remaining letters yields the same uniform
    distribution as re-drawing until an unused letter appears, but always
    terminates.
    """
    remaining = [letter for letter in string.ascii_lowercase
                 if letter not in guesses]
    if not remaining:
        raise ValueError("every letter has already been guessed")
    return numpy.random.choice(remaining)

In [None]:
def normalize(co_occ_dict):
    """Rescale the values of ``co_occ_dict`` in place so they sum to one.

    The dictionary passed in is modified and also returned.  An empty
    dictionary is returned unchanged.
    """
    denominator = sum(co_occ_dict.values())
    # Snapshot the items first; only values are replaced, keys never change.
    for letter, weight in list(co_occ_dict.items()):
        co_occ_dict[letter] = weight / float(denominator)
    return co_occ_dict

In [None]:
import numpy
# Letter-frequency table built from the training words: a 26-entry list
# indexed by ord(letter) - ord('a').
occurences = count_alpha_occurence(words)
# Co-occurrence table built from the training words: one dict per lowercase
# letter, keyed by the other characters found in words containing it.
co_occurences = conditional_prob(words)
# Most recent letter returned by model_1/model_2; both read it to select a
# row of co_occurences and overwrite it with every guess they return.
prev_guess = 'a'

In [None]:
import random
def model_1(mask, guesses):
    """Stochastic guesser: sample the next letter from the trained tables.

    While no letter of the word has been revealed, the letter is sampled
    from the module-level ``occurences`` frequencies.  Afterwards it is
    sampled from the ``co_occurences`` row of ``prev_guess`` (the letter
    this model or model_2 returned last), falling back to ``occurences``
    when that row has no un-guessed letter left.

    Args:
        mask: list with one entry per character of the word, '_' for
            letters that are still hidden.
        guesses: collection of letters already guessed.

    Returns:
        A letter that is not in ``guesses``.  The module-level
        ``prev_guess`` is updated to this letter.

    Raises:
        ValueError: if no un-guessed letter with a positive weight remains.
    """
    global occurences, prev_guess

    def draw(weighted):
        """Sample one un-guessed letter from (letter, weight) pairs, or None."""
        pool = [(letter, weight) for letter, weight in weighted
                if letter not in guesses and weight > 0]
        if not pool:
            return None
        total = float(sum(weight for _, weight in pool))
        # The weights must be passed as ``p=``: the third positional
        # parameter of numpy.random.choice is ``replace``, so a positional
        # weight list is silently ignored and the draw becomes uniform.
        # Restricting the pool to un-guessed letters and renormalising is
        # distribution-equivalent to re-drawing until an unused letter
        # appears, but cannot loop forever.
        index = numpy.random.choice(
            len(pool), p=[weight / total for _, weight in pool])
        return pool[index][0]

    char = None
    if len(mask) != mask.count('_'):
        # At least one letter is revealed: condition on the previous guess.
        # .get() tolerates a prev_guess that has no row in the table.
        row = normalize(co_occurences.get(prev_guess, {}))
        char = draw(row.items())
    if char is None:
        char = draw(zip(string.ascii_lowercase, occurences))
    if char is None:
        raise ValueError("no un-guessed letter left to propose")
    prev_guess = char
    return char

In [None]:
def model_2(mask, guesses):
    """Hybrid guesser: greedy on letter frequency, then co-occurrence sampling.

    While no letter of the word has been revealed, the most frequent letter
    in the module-level ``occurences`` that has not been guessed is returned
    (ties go to the alphabetically first letter).  Afterwards a letter is
    sampled from the ``co_occurences`` row of ``prev_guess``; when that row
    has no un-guessed letter left, the frequency ranking is used instead.

    Args:
        mask: list with one entry per character of the word, '_' for
            letters that are still hidden.
        guesses: collection of letters already guessed.

    Returns:
        A letter that is not in ``guesses``.  The module-level
        ``prev_guess`` is updated to this letter.

    Raises:
        ValueError: if every letter has already been guessed.
    """
    global occurences, prev_guess
    char = None

    if len(mask) != mask.count('_'):
        # .get() tolerates a prev_guess that has no row in the table.
        row = normalize(co_occurences.get(prev_guess, {}))
        pool = [(letter, weight) for letter, weight in sorted(row.items())
                if letter not in guesses and weight > 0]
        if pool:
            # Inverse-CDF draw restricted to un-guessed letters; this is
            # distribution-equivalent to re-drawing until an unused letter
            # appears, but cannot loop forever.
            threshold = random.random() * sum(weight for _, weight in pool)
            running = 0.0
            char = pool[-1][0]  # used if round-off keeps running <= threshold
            for letter, weight in pool:
                running += weight
                if threshold < running:
                    char = letter
                    break

    if char is None:
        # sorted() is stable even with reverse=True, so equal frequencies
        # keep their alphabetical order.
        ranked = sorted(range(len(occurences)),
                        key=lambda slot: occurences[slot], reverse=True)
        for slot in ranked:
            candidate = chr(slot + ord('a'))
            if candidate not in guesses:
                char = candidate
                break
        else:
            raise ValueError("every letter has already been guessed")

    prev_guess = char
    return char
# Smoke test: play a single verbose game on a fixed word using model_2.
hangman('anaconda',model_2,verbose=True)

In [None]:
test_words = []
start_time = time.time()
with open("test.txt", mode="r") as myFile:
    for line in myFile:
        try:
            test_words.append(line.strip().encode("utf-8"))
        except:
            pass

print "Reading the file takes %s seconds" % (time.time() - start_time)

In [None]:
import json
mistakes = 0
correct = 0
performance = dict()
start_time = time.time()
for word in test_words:
    temp= hangman(word,randomly,verbose=False)
    if temp < 8:
        correct += 1
    else:
        mistakes += temp
    performance[word] = guesses_for_this_word

print "Testing takes %s seconds" % (time.time() - start_time)
print "Correctly guessed words " , correct

f = open('random.json','w')
json.dump(performance,f)
f.close()

In [None]:
mistakes = 0
correct = 0
performance = dict()
start_time = time.time()
for word in test_words:
    temp= hangman(word,model_1,verbose=False)
    if temp < 8:
        correct += 1
    else:
        mistakes += temp
    performance[word] = guesses_for_this_word

print "Testing takes %s seconds" % (time.time() - start_time)
print "Correctly guessed words " , correct

f = open('model1.json','w')
json.dump(performance,f)
f.close()

In [None]:
mistakes = 0
correct = 0
performance = dict()
start_time = time.time()
for word in test_words:
    temp= hangman(word,model_2,verbose=False)
    if temp < 8:
        correct += 1
    else:
        mistakes += temp
    performance[word] = guesses_for_this_word

print "Testing takes %s seconds" % (time.time() - start_time)
print "Correctly guessed words " , correct

f = open('model_2.json','w')
json.dump(performance,f)
f.close()

### Personal Details

<ol>
    <li>Name: <b>G V Sandeep</b></li>
    <li>College: <b>BITS - Pilani, Hyderabad Campus</b></li>
    <li>Github: <a href="https://github.com/greetsandeep/">greetsandeep</a></li>
</ol>

This code is open source and can be found at: <a href="https://github.com/greetsandeep/ACM_SummerSchool/tree/master/Improvising%20Hangman">Improvising Hangman</a>