In [10]:
import gym
import time
import numpy as np
import pandas as pd
from data_structure import TrainingData
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")


## Training:

In [54]:
NUM_TRIES = 200

td = TrainingData()

env = gym.make('CartPole-v1')
env.action_space.seed(42)

observation, info = env.reset(seed=42, return_info=True)

reward_sum = 0

start = time.time()

_try = 0
pbar = tqdm(total=NUM_TRIES)
while _try < NUM_TRIES:
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action=action)
    reward_sum += reward

    td.add_step(observation=observation, action=action)
    
    if done:
        observation, info = env.reset(return_info=True)
        td.close_try(reward_sum, _try)
        reward_sum = 0
        _try += 1
        pbar.update(1)
pbar.close()
env.close()

print(f"Elapsed data gathering time: {time.time() - start}")

100%|██████████| 200/200 [00:01<00:00, 130.89it/s]

Elapsed data gathering time: 1.5360040664672852





In [55]:
data = td.get_training_data(as_dataframe=True)

# Model

In [56]:
from sklearn.neural_network import MLPClassifier

In [76]:
model = MLPClassifier(hidden_layer_sizes=(100, 100), random_state=42)

In [81]:
for _try in tqdm(range(50)):
    chunk = data[data['Try Index'] == _try]
    observation_arr = []
    action_arr = []
    for i in range(len(chunk)-1):
        observation_arr.append(list(chunk.iloc[i][['Cart Position', 'Cart Velocity', 'Pole Angle', 'Pole Angular Velocity']]))
        error_now = np.abs(chunk.iloc[i]['Pole Angle'])
        error_after = np.abs(chunk.iloc[i+1]['Pole Angle'])
        action = chunk.iloc[i]['Action']
        if error_after > error_now:
            action = 1.0 if action == 0.0 else 0.0
        action_arr.append(action)
    model.fit(observation_arr, action_arr)

100%|██████████| 50/50 [02:12<00:00,  2.64s/it]


In [83]:
td2 = TrainingData()

env = gym.make('CartPole-v1')
env.action_space.seed(42)

observation, info = env.reset(seed=42, return_info=True)

reward_sum = 0

start = time.time()

done = False
t = 0
while t < 1000 or not done:
    action = int(model.predict([observation])[0])
    observation, reward, done, info = env.step(action=action)
    reward_sum += reward

    td2.add_step(observation=observation, action=action)
    
    if done:
        observation, info = env.reset(return_info=True)
        td2.close_try(reward_sum, _try)
        reward_sum = 0

    t += 1
        
env.close()

print(f"Elapsed data gathering time: {time.time() - start}")

Elapsed data gathering time: 0.5280027389526367


In [84]:
result = td2.get_training_data(as_dataframe=True)

In [85]:
result

Unnamed: 0,Cart Position,Cart Velocity,Pole Angle,Pole Angular Velocity,Action,Reward,Try Index
0,0.027273,0.188478,0.036255,-0.261420,1.0,35.0,49.0
1,0.031043,0.383064,0.031026,-0.542451,1.0,35.0,49.0
2,0.038704,0.577736,0.020177,-0.825199,1.0,35.0,49.0
3,0.050259,0.382344,0.003673,-0.526239,0.0,35.0,49.0
4,0.057906,0.187171,-0.006852,-0.232401,0.0,35.0,49.0
...,...,...,...,...,...,...,...
24766,0.125764,0.633944,-0.131339,-1.108228,1.0,19.0,49.0
24767,0.138443,0.440770,-0.153503,-0.859465,0.0,19.0,49.0
24768,0.147258,0.637612,-0.170693,-1.196209,1.0,19.0,49.0
24769,0.160011,0.445060,-0.194617,-0.961519,0.0,19.0,49.0


In [86]:
result['Reward'].describe()

count    24771.000000
mean        21.197691
std          5.233579
min         13.000000
25%         17.000000
50%         19.000000
75%         22.000000
max         38.000000
Name: Reward, dtype: float64