In [83]:
import pandas as pd
import numpy as np
import csv
import random 
import seaborn as sns
import matplotlib.pyplot as plt

In [162]:
# --- Price grid and time horizon ---------------------------------------
MIN_PRICE = 100          # lowest ticket price on the grid
MAX_PRICE = 3000         # highest ticket price on the grid
PRICE_INCREMENT = 10     # spacing between adjacent grid prices
NUM_DAYS = 270           # number of days in one price path

# --- Size of the MDP ---------------------------------------------------
NUM_ACTIONS = 2    # buy (b) or wait (w)
# Count the grid points from MIN_PRICE to MAX_PRICE, both ends included.
POSSIBLE_PRICES = len(range(MIN_PRICE, MAX_PRICE + 1, PRICE_INCREMENT))
# One state per (day, price) pair, plus a single absorbing state at the end.
NUM_STATES = NUM_DAYS * POSSIBLE_PRICES + 1

# --- Q-learning hyperparameters ----------------------------------------
EPSILON = 0.99           # probability of picking a random action at each step
DISCOUNT_FACTOR = 0.99   # weight on the best next-state Q-value
LEARNING_RATE = 0.1      # step size of the Q-value update

# --- Training schedule -------------------------------------------------
NUM_ITERATIONS = 10000        # price paths (rows of the data) replayed per start day
NUM_MEGA_ITERATIONS = 270     # outer sweeps, one per start day (same value as NUM_DAYS)

In [163]:
# Q-table: one row per state (the last row is the absorbing state),
# one column per action; every entry starts at zero.
Q = np.zeros(shape=(NUM_STATES, NUM_ACTIONS))

Made changes to these functions! I think the main bug was that there were no parentheses around `(p - MIN_PRICE)` before the integer division, so because `//` binds tighter than `-`, the integer division was being applied to `MIN_PRICE` and `PRICE_INCREMENT` only. Also, there are 291 possible prices (counting both 100 and 3000 inclusively), so I changed that variable.

In [164]:
def encode_state(d, p):
    """Map a (day, price) pair to its row index in the Q-table.

    Days are 0-based, matching how QLearning indexes the price columns
    (its first start day is 0).  The previous ``(d - 1)`` offset turned
    day 0 into a negative index, which NumPy silently wraps around to the
    end of the table -- including onto the absorbing-state row.

    Args:
        d: day index; values in ``0 .. NUM_DAYS - 1`` map to regular states.
        p: ticket price; floored onto the PRICE_INCREMENT grid that starts
            at MIN_PRICE.  No range check is performed on ``p``.

    Returns:
        ``d * POSSIBLE_PRICES + price_slot`` converted with ``int`` for
        ``d < NUM_DAYS``; otherwise ``NUM_STATES - 1`` (the absorbing state).
    """
    if d < NUM_DAYS:
        price_slot = (p - MIN_PRICE) // PRICE_INCREMENT
        # int() so that a float price (e.g. read from a DataFrame) still
        # yields a value usable directly as an array index.
        return int(d * POSSIBLE_PRICES + price_slot)
    else:
        return NUM_STATES - 1   # Absorbing state -- last state


def decode_state(s):
    """Inverse of encode_state: recover (day, price) from a state index.

    Args:
        s: state index.

    Returns:
        ``(d, p)`` with a 0-based day ``d`` and grid price ``p`` when
        ``s < NUM_STATES - 1``; otherwise the string ``"Absorbing state"``.
    """
    if s < NUM_STATES - 1:  ## Not absorbing state
        # Quotient is the day, remainder is the slot on the price grid.
        d, price_slot = divmod(s, POSSIBLE_PRICES)
        p = price_slot * PRICE_INCREMENT + MIN_PRICE
        return d, p
    else:   ## Absorbing state
        return "Absorbing state"

In [165]:
def get_reward(s, a):
    """Immediate reward for taking action ``a`` in state ``s``.

    Args:
        s: state index; ``NUM_STATES - 1`` is the absorbing state.
        a: action, where ``0`` means buy and any other value means wait.

    Returns:
        * ``0`` in the absorbing state;
        * ``(MAX_PRICE - price) // PRICE_INCREMENT`` when buying, so a
          lower price gives a higher reward;
        * ``-MAX_PRICE`` when waiting on the day equal to ``NUM_DAYS - 1``
          (large penalty for not buying a ticket by the last day);
        * ``-1`` for any other wait.
    """
    # Nothing is earned once the episode has ended.
    if s == NUM_STATES - 1:
        return 0

    day, price = decode_state(s)

    # Buying: the reward is the saving relative to MAX_PRICE, in grid steps.
    if a == 0:
        return (MAX_PRICE - price) // (PRICE_INCREMENT)

    # Waiting on the final day means the ticket was never bought.
    if day == NUM_DAYS - 1:
        return -MAX_PRICE

    # Any other wait carries a small constant cost.
    return -1

To avoid the problem of never seeing later price/day combinations (which happens if the agent always buys early), I added this outer "mega iterations" loop, which lets each price simulation start from every day.

In [166]:
def QLearning(df):
    """Run tabular Q-learning over the price paths and export the greedy policy.

    Each row of ``df`` is treated as one price path, with column ``k``
    holding the price on day ``k``.  For every start day ``j`` in
    ``range(NUM_MEGA_ITERATIONS)`` and every row ``i`` in
    ``range(NUM_ITERATIONS)``, one episode is played from day ``j`` of
    row ``i`` until the absorbing state is reached.  Actions are chosen
    epsilon-greedily and the standard Q-learning update is applied after
    every step.

    Side effects:
        * updates the module-level ``Q`` table in place;
        * prints one progress line per start day;
        * writes the greedy policy (``Day``, ``Price``, ``Action`` columns,
          ``B`` = buy / ``W`` = wait) to ``3ticket_buying_policy.csv`` in
          the current working directory.

    Args:
        df: pandas DataFrame of prices.  Rows ``0 .. NUM_ITERATIONS - 1``
            and the columns for every visited day must exist, otherwise
            indexing raises ``IndexError``.

    Returns:
        None.
    """
    # Convert once up front: building a one-row DataFrame per episode and
    # doing scalar .iloc lookups per step dominated the running time.
    price_table = df.to_numpy()
    absorbing_state = NUM_STATES - 1

    for j in range(NUM_MEGA_ITERATIONS):
        print(f"On iteration {j}")

        for i in range(NUM_ITERATIONS):
            prices = price_table[i]   ## prices over all days from row i (therefore iteration i)

            ## start state: day j with the ticket price on that day
            state = int(encode_state(j, prices[j]))

            while state != absorbing_state:  # Continue until absorbing state
                current_day, _ = decode_state(state)

                ## Epsilon greedy strategy for action selection:
                if np.random.rand() < EPSILON:
                    action = np.random.choice(NUM_ACTIONS)    # Explore: random action
                else:
                    action = np.argmax(Q[state, :])   # Exploit: best action by Q-value

                reward = get_reward(state, action)
                if action == 0 or current_day == NUM_DAYS - 1:  # Ticket bought or last day
                    next_state = absorbing_state
                else:
                    next_price = prices[current_day + 1]
                    next_state = int(encode_state(current_day + 1, next_price))

                ## Q-Learning update:
                best_next_value = np.max(Q[next_state, :])
                Q[state, action] = (1 - LEARNING_RATE) * Q[state, action] + LEARNING_RATE * (
                    reward + DISCOUNT_FACTOR * best_next_value
                )

                state = next_state

    # Greedy action for every non-absorbing state; ties resolve to the
    # lowest action index, exactly as a per-row np.argmax does.
    greedy_actions = np.argmax(Q[:absorbing_state], axis=1)
    decoded_policy = []
    for s, a in enumerate(greedy_actions):
        d, p = decode_state(s)
        decoded_policy.append((d, p, "B" if a == 0 else "W"))

    csv_file_path = '3ticket_buying_policy.csv'
    with open(csv_file_path, mode='w', newline='') as file:
        writer = csv.writer(file)

        # Header first, then one row per (day, price) state.
        writer.writerow(['Day', 'Price', 'Action'])
        writer.writerows(decoded_policy)

    print(f"Policy saved to {csv_file_path}")

In [167]:
def get_data(fp):
    """Load the price table from a CSV file.

    Args:
        fp: path to the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        The DataFrame produced by ``pd.read_csv(fp)`` with default options.
    """
    # The statements that used to follow the return (column listing and a
    # "row_0.csv" export) could never execute and have been removed.
    return pd.read_csv(fp)

In [168]:
# Location of the price CSV that is passed to get_data().
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after pointing it at a local copy of price_data.csv.
fp = "/Users/Emily/Desktop/CS238FinalProject/price_data.csv"

In [169]:
# Load the price table from fp, then train on it.  QLearning prints one
# "On iteration j" line per outer iteration and finishes by writing the
# policy CSV.
df = get_data(fp)
QLearning(df)

On iteration 0
On iteration 1
On iteration 2
On iteration 3
On iteration 4
On iteration 5
On iteration 6
On iteration 7
On iteration 8
On iteration 9
On iteration 10
On iteration 11
On iteration 12
On iteration 13
On iteration 14
On iteration 15
On iteration 16
On iteration 17
On iteration 18
On iteration 19
On iteration 20
On iteration 21
On iteration 22
On iteration 23
On iteration 24
On iteration 25
On iteration 26
On iteration 27
On iteration 28
On iteration 29
On iteration 30
On iteration 31
On iteration 32
On iteration 33
On iteration 34
On iteration 35
On iteration 36
On iteration 37
On iteration 38
On iteration 39
On iteration 40
On iteration 41
On iteration 42
On iteration 43
On iteration 44
On iteration 45
On iteration 46
On iteration 47
On iteration 48
On iteration 49
On iteration 50
On iteration 51
On iteration 52
On iteration 53
On iteration 54
On iteration 55
On iteration 56
On iteration 57
On iteration 58
On iteration 59
On iteration 60
On iteration 61
On iteration 62
On