# Project 2b

In [1]:
from __future__ import print_function
import time, os, json
import numpy as np
import matplotlib.pyplot as plt
import sys
import networkx as nx
import pandas
import collections
from math import lgamma

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [60]:
# Load the sampled transitions; later cells read the columns s, a, r and sp.
mediumDf = pandas.read_csv('medium3.csv')

In [78]:
# State ids are hard-coded as 0..49999 rather than derived from the data.
states = range(0,50000)
# Distinct action labels present in the data, in ascending order.
actions = sorted(mediumDf.a.unique())
# Distinct successor-state ids present in the data.
stateprimes = mediumDf.sp.unique()

In [4]:
# Distinct reward values present in the data.
rewards = mediumDf.r.unique()

In [5]:
rewards

array([  -225,  99775,    -25,  99975,      0,   -100,  99900, 100000])

In [None]:
stateprimes

In [16]:
# R = {}
# for s in states:
#     R[s] = {}
# for row in mediumDf.itertuples():
#     if row.a not in R[row.s]:
#         R[row.s][row.a] = []
#     R[row.s][row.a].append(row.r)

In [26]:
# R[25740]

{2: [-100, -100, -100],
 3: [-25, -25, -25, -25, -25, -25],
 4: [0],
 5: [-25, -25, -25],
 6: [-100, -100],
 7: [-225]}

In [87]:
# Select the rows that carry the top reward (r >= 100000) and whose source
# state id is at least 30000.
# NOTE(review): the reason for the 30000 cut-off is not visible here - confirm.
reward_mask = mediumDf['r'] >= 100000
state_mask = mediumDf['s'] >= 30000

# One row per distinct (s, a, r) triple among the selected rows, with the
# number of occurrences in 'counts', ordered by source state.
max_rewards = (
    mediumDf[reward_mask & state_mask]
    .groupby(['s', 'a', 'r'])
    .size()
    .reset_index(name='counts')
    .sort_values(by=['s'])
)

In [92]:
# Distinct source-state ids that appear in max_rewards.
max_reward_states = max_rewards.s.unique()

In [93]:
max_reward_states

array([30957, 30959, 30960, 30961, 31457, 32959, 33456])

In [89]:
# create reward dict by overwriting standard reward when we have a larger reward
def get_reward_map(max_rewards):
    """Build a nested ``{state: {action: reward}}`` table.

    Every id in the module-level ``states`` starts with the same per-action
    default rewards; each row of ``max_rewards`` then overrides the entry
    for its ``(s, a)`` pair with that row's ``r``.

    Args:
        max_rewards: object exposing ``itertuples()`` whose rows have the
            attributes ``s``, ``a`` and ``r``.

    Returns:
        dict mapping each state id to its own ``{action: reward}`` dict.

    Raises:
        KeyError: if a row's ``s`` is not one of ``states``.
    """
    default_rewards = {1:-225, 2:-100, 3:-25, 4:0, 5:-25, 6:-100, 7:-225}

    # Copy the defaults per state so an override never leaks to other states.
    table = {state_id: dict(default_rewards) for state_id in states}

    for entry in max_rewards.itertuples():
        table[entry.s][entry.a] = entry.r
    return table

In [102]:
# Default reward for each action label.
action_reward_map = {1:-225, 2:-100, 3:-25, 4:0, 5:-25, 6:-100, 7:-225}
def get_reward(state, action):
    """Return the reward used for taking ``action`` in ``state``.

    States listed in the module-level ``max_reward_states`` yield a flat
    100000 whatever the action; every other state yields the action's entry
    in ``action_reward_map``.

    Raises:
        KeyError: if ``state`` is not a max-reward state and ``action`` has
            no entry in ``action_reward_map``.
    """
    # NOTE(review): the data also contains rewards such as 99775 and 99900,
    # which suggests the bonus may be added to the action cost rather than
    # replacing it - confirm whether the flat 100000 is intended.
    if state not in max_reward_states:
        return action_reward_map[action]
    return 100000

In [20]:
actions

[1, 2, 3, 4, 5, 6, 7]

In [35]:
# Count the occurrences of every (s, a, sp) triple; one row per triple with
# the count stored in the 'counts' column.
sasp_counts = mediumDf.groupby(['s', 'a', 'sp']).size().reset_index(name='counts')

In [97]:
def calc_T(df, states, actions):
    """Collect transition counts into a nested dict ``T[s][a][sp] = count``.

    Args:
        df: object exposing ``itertuples()`` whose rows have the attributes
            ``s``, ``a``, ``sp`` and ``counts``.
        states: iterable of state ids; every one gets an entry in the
            result, empty when the state never occurs in ``df``.
        actions: accepted for signature symmetry with ``calc_T_prob``; not
            used here.

    Returns:
        dict of dicts of dicts holding the count of each observed
        ``(s, a, sp)`` triple. Actions never seen for a state are absent
        from that state's inner dict.

    Raises:
        KeyError: if a row's ``s`` is not one of ``states``.
    """
    counts_by_state = {state_id: {} for state_id in states}

    for entry in df.itertuples():
        # Create the per-action dict on first sight of this (s, a) pair.
        successors = counts_by_state[entry.s].setdefault(entry.a, {})
        successors[entry.sp] = entry.counts
    return counts_by_state

In [98]:

def calc_T_prob(T, states, actions):
    """Normalise transition counts into per-(state, action) distributions.

    Args:
        T: nested dict ``T[s][a][sp] = count``; must contain every id in
            ``states``.
        states: iterable of state ids to include in the result.
        actions: iterable of action labels; each one gets an entry for
            every state.

    Returns:
        dict ``T_prob[s][a]`` -> ``defaultdict`` mapping ``sp`` to
        ``count / total`` for that ``(s, a)`` pair. Successors that were
        never observed read as 0, and an action with no recorded counts in
        state ``s`` gets an empty distribution.
    """
    # NOTE(review): the ratio below relies on true division; if this ever
    # runs under Python 2, `from __future__ import division` is needed.
    T_prob = {}
    for s in states:
        per_action = {}
        for a in actions:
            distribution = collections.defaultdict(int)
            per_action[a] = distribution
            # Unobserved (s, a) pairs keep the empty distribution; an earlier
            # experiment that borrowed counts from neighbouring state ids
            # was abandoned.
            if a not in T[s]:
                continue

            observed = T[s][a]
            total = sum(observed.values())
            for sp, count in observed.items():
                distribution[sp] = count/total
        T_prob[s] = per_action
    return T_prob

In [99]:
# Nested transition counts T[s][a][sp], then the same structure normalised
# so that the values for each (s, a) pair are relative frequencies.
T = calc_T(sasp_counts, states, actions)
T_prob = calc_T_prob(T, states, actions)

In [39]:
# Occurrences of each (reward, successor state) pair among the rows with a
# positive reward; the count is stored in the 'counts' column.
pos_reward_df = mediumDf[mediumDf['r'] > 0].groupby(['r', 'sp']).size().reset_index(name='counts')

In [119]:
def policy_iter(states, actions, T_prob, iters=10, discount=1.0, reward_fn=None):
    """Run in-place value-iteration sweeps and return the greedy policy.

    Despite its name, this performs value-iteration style backups: on every
    sweep each state's utility is replaced by the best one-step lookahead
    value, using utilities already updated earlier in the same sweep.

    Args:
        states: re-iterable collection of state ids (iterated once per
            sweep, so a one-shot generator will not work).
        actions: iterable of action labels; the policy stores these labels
            themselves, so they need not be the integers 1..n.
        T_prob: nested mapping where ``T_prob[s][a]`` maps each successor
            state ``sp`` to its transition probability. An entry must exist
            for every ``(s, a)`` pair.
        iters: number of sweeps over ``states``.
        discount: factor applied to the expected utility of the successor
            states. The default of 1.0 leaves results numerically unchanged
            from the undiscounted computation; a value below 1 keeps the
            utilities bounded over many sweeps.
        reward_fn: callable ``(state, action) -> reward``. Defaults to the
            module-level ``get_reward``.

    Returns:
        ``(policy, U)`` where ``policy[s]`` is the best action label found
        for ``s`` and ``U[s]`` is its utility. ``policy`` is empty when
        ``iters`` is 0.

    Raises:
        ValueError: if ``actions`` is empty while there are states to update.
    """
    if reward_fn is None:
        reward_fn = get_reward

    policy = {}
    U = {s: 0 for s in states}
    for _ in range(iters):
        for s in states:
            best_action = None
            best_value = None
            for a in actions:
                # Successors outside `states` have no utility entry and are
                # treated as contributing nothing.
                expected_next = sum(
                    prob * U[sp]
                    for sp, prob in T_prob[s][a].items()
                    if sp in U
                )
                value = reward_fn(s, a) + discount * expected_next
                # Strict comparison keeps the first of several tied actions.
                if best_value is None or value > best_value:
                    best_action = a
                    best_value = value
            if best_value is None:
                raise ValueError("policy_iter requires at least one action")
            U[s] = best_value
            # Store the action label itself, not its position in `actions`.
            policy[s] = best_action
    return policy, U

In [120]:
# Run policy_iter for a single iteration (iters=1) over all states.
policy, utility = policy_iter(states, actions, T_prob, 1)

In [121]:
len(policy.keys())

50000

In [106]:
len(utility.keys())

50000

In [107]:
len(states)

50000

In [122]:
def save_policy(policy, filename):
    """Write the policy to ``filename``, one action per line.

    The action for each id in the module-level ``states`` is written in
    iteration order, followed by one extra final line containing ``4``.

    Args:
        policy: mapping from state id to action; must contain every id in
            ``states``.
        filename: path of the text file to create or overwrite.

    Raises:
        KeyError: if ``policy`` lacks an entry for one of ``states``.
    """
    lines = [str(policy[state_id]) for state_id in states]
    with open(filename, 'w') as out:
        out.write('\n'.join(lines))
        # NOTE(review): presumably a default action for one state id that
        # `states` does not cover - confirm against the expected file length.
        out.write('\n4')

In [123]:
# Write the computed policy to disk.
# NOTE(review): the input file is 'medium3.csv' but the output is named
# 'medium2.policy' - confirm the file name is intended.
save_policy(policy, 'medium2.policy')