# Project 2

In [2]:
from __future__ import print_function
import time, os, json
import numpy as np
import matplotlib.pyplot as plt
import sys
import networkx as nx
import pandas
from math import lgamma

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
# Image defaults: no pixel interpolation, grayscale colormap.
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Auto-reload external modules before each cell runs, so edits to imported
# .py files are picked up without restarting the kernel.
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [3]:
# Load the recorded transitions; the columns used below are
# s (state), a (action), r (reward) and sp (next state).
smallDf = pandas.read_csv('small2.csv')

In [4]:
# Iterating a DataFrame yields its column labels, so this lists the columns.
list(smallDf)

['s', 'a', 'r', 'sp']

In [5]:
# Preview only the first rows instead of dumping the whole frame.
smallDf.head(10)

Unnamed: 0,s,a,r,sp
0,12,3,0,22
1,22,4,0,12
2,12,2,0,11
3,11,3,0,21
4,21,1,0,21
5,21,4,0,11
6,11,3,0,1
7,1,3,0,11
8,11,4,0,12
9,12,3,0,11


In [16]:
# Distinct source states. NOTE: pandas returns uniques in order of first
# appearance (not sorted), and states that only ever occur in the `sp`
# column are not included here.
states = smallDf.s.unique()
# Hard-coded action ids; presumably these match the values of column `a`
# (the preview above shows 1-4) -- TODO confirm against the full data.
actions = [1,2,3,4]

In [None]:
# Build an undirected graph with one node per state; each node carries a
# per-action counter dict and an empty `actions` dict.
# NOTE(review): this cell has no execution count and `g` is not referenced
# anywhere else in the visible notebook -- looks like a leftover experiment;
# confirm and consider removing.
g=nx.Graph()
for s in states:
    g.add_node(s, counts={1:0,2:0,3:0,4:0}, actions={})

In [63]:
# Count how many rows were observed for every (s, a, sp) triple; the result
# has one row per distinct triple with the tally in a `counts` column.
df = smallDf.groupby(['s', 'a', 'sp']).size().reset_index(name='counts')

In [119]:
def calc_T(df, states):
    """Collect transition counts into a nested dict ``T[s][a][sp] -> count``.

    Args:
        df: object whose ``itertuples()`` yields rows exposing the attributes
            ``s``, ``a``, ``sp`` and ``counts``.
        states: iterable of state ids; each one gets an (initially empty)
            entry in the result, even if no row mentions it.

    Returns:
        dict mapping state -> action -> next state -> ``counts`` value.
        Only actions that actually occur for a state get a key. If the same
        (s, a, sp) triple appears in several rows, the last row wins.

    Raises:
        KeyError: if a row's ``s`` is not contained in ``states``.
    """
    T = {state: {} for state in states}

    for rec in df.itertuples():
        by_action = T[rec.s]
        successors = by_action.setdefault(rec.a, {})
        successors[rec.sp] = rec.counts
    return T

In [120]:
def calc_T_prob(T, states, actions):
    """Normalise transition counts into maximum-likelihood probabilities.

    Args:
        T: nested dict ``T[s][a][sp] -> count`` (as produced by ``calc_T``).
            A state may lack an entry for an action that was never observed.
        states: iterable of state ids to include in the result.
        actions: iterable of action ids to include for every state.

    Returns:
        dict ``T_prob[s][a][sp] -> probability``. Every (s, a) pair from
        ``states`` x ``actions`` is present; pairs with no observations map
        to an empty dict instead of raising ``KeyError``.
    """
    T_prob = {}
    for s in states:
        T_prob[s] = {}
        for a in actions:
            # calc_T only creates keys for observed actions, so fall back to
            # an empty distribution for (s, a) pairs that never occurred.
            counts = T[s].get(a, {})
            total = sum(counts.values())
            # float() keeps this true division even under Python 2, where
            # int / int would truncate every probability to 0.
            T_prob[s][a] = {sp: n / float(total) for sp, n in counts.items()}
    return T_prob

In [121]:
# Raw transition counts T[s][a][sp], then the same structure normalised to
# probabilities for every (state, action) pair.
T = calc_T(df, states)
T_prob = calc_T_prob(T, states, actions)

In [66]:
# Keep only rows with a positive reward and count them per (r, sp) pair.
# NOTE(review): the sp values displayed below (13/22/24/33 and 68/77/79/88)
# look like the four grid neighbours of 23 and 78 on a 10-wide grid, which
# would mean the reward is tied to the *source* state `s` rather than `sp`
# -- confirm against the problem statement before relying on this grouping.
rdf = smallDf[smallDf['r'] > 0].groupby(['r', 'sp']).size().reset_index(name='counts')

In [67]:
# Display the positive-reward tallies (one row per reward / next-state pair).
rdf

Unnamed: 0,r,sp,counts
0,3,13,127
1,3,22,119
2,3,24,123
3,3,33,126
4,10,68,104
5,10,77,139
6,10,79,111
7,10,88,123


In [77]:
# Reward lookup keyed by state id: every known state starts at 0, then each
# row of rdf assigns its reward `r` to the row's `sp` entry. If an sp value
# occurs in several rows, the last row's reward is the one that is kept.
R = dict.fromkeys(states, 0)
R.update((rec.sp, rec.r) for rec in rdf.itertuples())

In [122]:
discount = 0.95


def policy_iter(iters=10, state_space=None, action_space=None,
                transitions=None, rewards=None, gamma=None):
    """Run in-place value iteration and return the greedy policy and utilities.

    Despite its name this is value iteration, not policy iteration: each
    sweep applies ``U[s] = R[s] + gamma * max_a sum_sp T(sp|s,a) * U[sp]``
    and updates ``U`` in place, so states later in a sweep already see the
    values written earlier in the same sweep.

    Args:
        iters: number of full sweeps over the states.
        state_space: iterable of state ids; defaults to the notebook-level
            ``states``.
        action_space: sequence of action ids; defaults to the notebook-level
            ``actions``.
        transitions: nested dict ``[s][a][sp] -> probability``; defaults to
            the notebook-level ``T_prob``.
        rewards: dict ``state -> reward``; defaults to the notebook-level
            ``R``.
        gamma: discount factor; defaults to the notebook-level ``discount``.

    Returns:
        ``(policy, U)`` where ``policy[s]`` is the best entry of
        ``action_space`` for state ``s`` (first one wins on ties) and
        ``U[s]`` is the estimated utility. ``policy`` is empty if
        ``iters`` is 0.
    """
    # Fall back to the notebook globals only for arguments that were omitted.
    state_space = states if state_space is None else state_space
    action_space = actions if action_space is None else action_space
    transitions = T_prob if transitions is None else transitions
    rewards = R if rewards is None else rewards
    gamma = discount if gamma is None else gamma

    policy = {}
    U = {s: 0 for s in state_space}
    for _ in range(iters):
        for s in state_space:
            # Expected next-state utility per action. Successors that never
            # occur as a source state have no entry in U; treat them as 0
            # instead of failing with KeyError.
            action_values = [
                sum(p * U.get(sp, 0) for sp, p in transitions[s][a].items())
                for a in action_space
            ]
            best = int(np.argmax(action_values))
            U[s] = rewards[s] + gamma * action_values[best]
            # Map the winning index back to the actual action id rather than
            # assuming the ids are exactly 1..n.
            policy[s] = action_space[best]
    return policy, U

In [125]:
# Run five sweeps and keep both the action choice and the utility per state.
policy, utility = policy_iter(5)

In [126]:
# Display the chosen action for every state.
policy

{1: 3,
 2: 2,
 3: 3,
 4: 1,
 5: 3,
 6: 1,
 7: 1,
 8: 2,
 9: 2,
 10: 3,
 11: 2,
 12: 2,
 13: 3,
 14: 3,
 15: 3,
 16: 3,
 17: 3,
 18: 3,
 19: 3,
 20: 3,
 21: 2,
 22: 4,
 23: 1,
 24: 3,
 25: 1,
 26: 3,
 27: 3,
 28: 3,
 29: 3,
 30: 3,
 31: 4,
 32: 4,
 33: 4,
 34: 1,
 35: 3,
 36: 3,
 37: 3,
 38: 3,
 39: 3,
 40: 3,
 41: 4,
 42: 4,
 43: 4,
 44: 2,
 45: 3,
 46: 3,
 47: 3,
 48: 3,
 49: 3,
 50: 1,
 51: 2,
 52: 2,
 53: 2,
 54: 2,
 55: 2,
 56: 3,
 57: 3,
 58: 3,
 59: 3,
 60: 1,
 61: 2,
 62: 2,
 63: 2,
 64: 2,
 65: 2,
 66: 2,
 67: 2,
 68: 2,
 69: 3,
 70: 1,
 71: 2,
 72: 4,
 73: 4,
 74: 2,
 75: 2,
 76: 2,
 77: 2,
 78: 4,
 79: 1,
 80: 1,
 81: 4,
 82: 4,
 83: 4,
 84: 2,
 85: 4,
 86: 4,
 87: 2,
 88: 4,
 89: 4,
 90: 1,
 91: 4,
 92: 4,
 93: 4,
 94: 2,
 95: 2,
 96: 2,
 97: 2,
 98: 4,
 99: 1,
 100: 4}

In [116]:
def save_policy(policy, filename):
    """Write one action per line to ``filename``, ordered by ascending state id.

    The previous version walked the notebook-level ``states`` array, which is
    in order of first appearance in the data, so line ``i`` of the file did
    not correspond to the ``i``-th state. Sorting the policy's own keys fixes
    the ordering and removes the dependency on that global.

    Args:
        policy: dict mapping state id -> chosen action.
        filename: path of the text file to (over)write.

    The file contains ``str(action)`` values joined by newlines, with no
    trailing newline.
    """
    policy_sorted = [str(policy[s]) for s in sorted(policy)]
    with open(filename, 'w') as f:
        f.write('\n'.join(policy_sorted))

In [117]:
# Persist the computed policy to disk as 'small.policy'.
save_policy(policy, 'small.policy')