### Guide to this File:

This file is intended to be run from other notebooks. It only returns the time series for one hero, to reduce workload and make troubleshooting easier.

This file will create the following objects:

df : match data [match_id, hero_id, gold_t]
df_no_match : match data without match_id [hero_id, gold_t]

missing_indices : indices that had missing gold_t values and were removed, use with print() to view
df.loc[missing_indices, ['hero_id','gold_t']] : to be used to view missing_indices

zero_length_indices : the indices containing zero length gold_t values and were removed
zero_length_records : the records containing zero length gold_t values and were removed




In [2]:
import json
import pandas as pd
import requests
from numpy import array
import numpy as np

import torch
import torch.nn
import torch.optim as optim

# Used in LTSMModel Class Instantiation
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Local Load

#file_path = "C:\\Users\\dcrai\\source\\repos\\DATA698\\Code\\Data\\data.json"
#file_path_hero = "C:\\Users\\dcrai\\source\\repos\\DATA698\\Code\\Data\\hero_id_table.csv"
#file_csv = "C:\\Users\\dcrai\\source\\repos\\DATA698\\Code\\Data\\iter_1.csv" - different match data

#with open(file_path, 'r') as file:
    #data = json.load(file)

# Github Load

# Remote locations of the raw match records (JSON) and the hero lookup table (CSV).
file_path = "https://raw.githubusercontent.com/d-ev-craig/DATA698/main/Code/Data/data.json"
file_path_hero = "https://raw.githubusercontent.com/d-ev-craig/DATA698/main/Code/Data/hero_id_table.csv"

# Fail fast on a hung connection or an HTTP error status. Without these, an
# error page would be handed to the JSON parser and surface as a decode error.
response = requests.get(file_path, timeout=60)
response.raise_for_status()
data = json.loads(response.text)

# Create heroes dataframe
heroes= pd.read_csv(file_path_hero)

# Pull one list per field out of the records in 'data'; every list has one
# entry per record, in record order.
match_ids = [element['match_id'] for element in data]
hero_ids = [element['hero_id'] for element in data]
gold_t_values = [element['gold_t'] for element in data]
radiant_team = [element['radiant_team'] for element in data]
dire_team = [element['dire_team'] for element in data]
# NOTE(review): this is read from the record's 'win' key but stored as
# 'radiant_win' -- confirm 'win' really is the radiant result and not the
# individual player's result.
radiant_win = [element['win'] for element in data]
lane = [element['lane'] for element in data]
lane_roles = [element['lane_role'] for element in data]

# Create match data dataframes: df holds only the time series per hero/match,
# df2 additionally carries lane, team and win information.
df = pd.DataFrame({'match_id': match_ids, 'hero_id': hero_ids, 'gold_t': gold_t_values})
df2 = pd.DataFrame({'match_id': match_ids, 'hero_id': hero_ids, 'lane' : lane, 'lane_role': lane_roles, 'gold_t': gold_t_values, 'radiant_team' : radiant_team, 'dire_team' : dire_team, 'radiant_win' : radiant_win})



In [4]:
#print(df['match_id'].tolist.unqiue)

In [5]:
#def add_positions_column(df):
#  unique_matches = df['match_id'].unique().tolist()


In [6]:
#data

In [7]:
#df2[df2['match_id'] == 7517376613]

In [8]:
# Match ID example
#7517376613

# Checking zero length tensors

#df.iloc[2470]
#df.iloc[2511]

### Missing Data Drop

In [9]:
def view_null_indexes(df):
  """Return the 'hero_id' and 'gold_t' columns of the rows whose 'gold_t' is null.

  The returned frame keeps the original index labels of those rows, so it can
  be used to see which records are about to be dropped.
  """
  has_no_gold = df['gold_t'].isna()
  null_labels = df.index[has_no_gold]
  return df.loc[null_labels, ['hero_id', 'gold_t']]

# Show the rows with a missing 'gold_t', then remove them from df.
print(view_null_indexes(df))

# Renumber after the drop so no issues arise when index locations are later
# used to drop 0 length tensors.
df = df.dropna(subset=['gold_t']).reset_index(drop=True)


### df2 missing drop
print(view_null_indexes(df2))

df2 = df2.dropna(subset=['gold_t']).reset_index(drop=True)

      hero_id gold_t
2270       20   None
2271      120   None
2272       69   None
2273       73   None
2274       26   None
...       ...    ...
6915      136   None
6916      106   None
6917      137   None
6918       14   None
6919      112   None

[76 rows x 2 columns]
      hero_id gold_t
2270       20   None
2271      120   None
2272       69   None
2273       73   None
2274       26   None
...       ...    ...
6915      136   None
6916      106   None
6917      137   None
6918       14   None
6919      112   None

[76 rows x 2 columns]


### Zero Length Tensors Drop

If we don't drop the zero length tensors, when we call `model(hero_ids,time_series)` in the training loop, it will error out when attempting to run pad_packed_sequence

We also drop any game shorter than 11 minutes, since the model is trained using a 10-minute lookback window.


### Games <15 minutes

Since the model performs drastically better on games lasting more than 10 minutes, we remove games shorter than 11 minutes. Because we will be creating a multi-step prediction with 5 steps, we further restrict games to be at least 15 minutes long.

In [10]:
def drop_zero_length_indices(df, min_length=16):
  """Drop rows whose 'gold_t' series is too short and renumber the result.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a 'gold_t' column whose entries support ``len()``.
      Missing values must already have been removed.
  min_length : int, default 16
      Rows whose 'gold_t' has fewer than this many entries are removed.
      This includes empty series.

  Returns
  -------
  pandas.DataFrame
      A new frame holding only the sufficiently long rows, with a fresh
      0..n-1 index. The input frame is not modified.
  """
  short_positions = []
  keep_positions = []

  # Work with row positions rather than index labels, so the function gives
  # the same result whether or not the caller reset the index first.
  for position, time_series in enumerate(df['gold_t']):
      if len(time_series) < min_length:
          short_positions.append(position)
      else:
          keep_positions.append(position)

  if short_positions:
      print(f"Found {len(short_positions)} series shorter than {min_length} at indices: {short_positions}")
  else:
      print(f"No series shorter than {min_length} found.")

  return df.iloc[keep_positions].reset_index(drop=True)


# Replace both frames with their filtered versions; each call prints which
# rows it removed.
df = drop_zero_length_indices(df)
df2 = drop_zero_length_indices(df2)

Found 194 tensors with length 0 at indices: [230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1390, 1391, 1392, 1393, 1394, 1395, 1396, 1397, 1398, 1399, 2460, 2501, 2512, 2573, 2574, 2575, 2576, 2577, 2578, 2579, 2580, 2581, 2582, 2583, 2834, 2835, 2836, 2837, 2838, 2839, 2840, 2841, 2842, 2843, 3074, 3075, 3076, 3077, 3078, 3079, 3080, 3081, 3082, 3083, 3144, 3145, 3146, 3147, 3148, 3149, 3150, 3151, 3152, 3153, 3374, 3375, 3376, 3377, 3378, 3379, 3380, 3381, 3382, 3383, 3494, 3495, 3496, 3497, 3498, 3499, 3500, 3501, 3502, 3503, 3514, 3515, 3516, 3517, 3518, 3519, 3520, 3521, 3522, 3523, 3574, 3575, 3576, 3577, 3578, 3579, 3580, 3581, 3582, 3583, 3664, 3665, 3666, 3667, 3668, 3669, 3670, 3671, 3672, 3673, 4364, 4365, 4366, 4367, 4368, 4369, 4370, 4371, 4372, 4373, 5334, 5335, 5336, 5337, 5338, 5339, 5340, 5341, 5342, 5343, 5414, 5415,

In [39]:
# Number of matches available per hero, one row per hero_id.
match_count_df2 = (
    df2.groupby('hero_id')
    .size()
    .reset_index(name='match_count')
)
# 70% of each hero's match count, rounded to a whole number of matches.
match_count_df2['count_70pct'] = (match_count_df2['match_count'] * 0.70).round().astype(int)
# Display the heroes from fewest to most matches.
match_count_df2.sort_values(by='match_count')

Unnamed: 0,hero_id,match_count,count_70pct
97,99,4,3
32,34,6,4
106,108,6,4
109,111,13,9
75,77,15,10
...,...,...,...
104,106,229,160
20,21,230,161
84,86,241,169
24,26,263,184


In [50]:
# Fixed seed so the 70% split is identical on every run of the notebook.
SPLIT_SEED = 42

# Sample each hero's 70% share of matches. The samples keep their original df2
# row labels so that exactly these rows can be removed from df2 below.
sampled_parts = [
    df2[df2['hero_id'] == hero_id].sample(n=num_matches, random_state=SPLIT_SEED)
    for hero_id, num_matches in zip(match_count_df2['hero_id'], match_count_df2['count_70pct'])
]
# pd.concat raises on an empty list, so fall back to an empty slice of df2.
sampled_rows = pd.concat(sampled_parts) if sampled_parts else df2.iloc[0:0]

# Get the index of the selected rows (labels in df2, not positions in df2_70)
selected_index = sampled_rows.index
# Get the rows that were not selected
df2_70_remain = df2.drop(selected_index)

# The sampled rows, renumbered 0..n-1 now that the labels have been used.
df2_70 = sampled_rows.reset_index(drop=True)

print(len(df2_70))
print(len(df2_70_remain))



6924
2976


In [12]:
def build_hero_ts_df(df, list_length='Longest'):
    """Pick one representative match per hero and split it off from the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'hero_id' and 'gold_t' columns. 'gold_t' entries must
        support ``len()``. Index labels are assumed to be unique.
    list_length : {'Longest', 'Average'}, default 'Longest'
        'Longest' picks, for each hero, the row with the longest 'gold_t'.
        'Average' picks the row whose 'gold_t' length is closest to that
        hero's rounded mean length.

    Returns
    -------
    df_allhero : pandas.DataFrame
        One row per hero, sorted by 'hero_id', index 0..n-1. Columns are
        'match_id', 'hero_id', 'gold_t' followed by any other columns of
        ``df`` in their original order.
    hero_ids : list
        Sorted unique hero ids found in ``df``.
    df_remain : pandas.DataFrame
        ``df`` without the picked rows, keeping its original index labels.

    Raises
    ------
    ValueError
        If ``list_length`` is neither 'Longest' nor 'Average'.
    """
    if list_length not in ('Longest', 'Average'):
        raise ValueError("Invalid list_length parameter. Choose either 'Longest' or 'Average'.")

    hero_ids = sorted(df['hero_id'].unique().tolist())
    # Compute every series length once instead of once per hero.
    lengths = df['gold_t'].apply(len)

    picked_labels = []
    for hero_id in hero_ids:
        hero_lengths = lengths[df['hero_id'] == hero_id]
        if list_length == 'Longest':
            idx = hero_lengths.idxmax()
        else:
            idx = (hero_lengths - hero_lengths.mean().round()).abs().idxmin()
        picked_labels.append(idx)

    # idxmax/idxmin return index LABELS, so the rows must be fetched with .loc
    # (label-based) and not .iloc (position-based).
    leading_cols = ['match_id', 'hero_id', 'gold_t']
    ordered_cols = leading_cols + [col for col in df.columns if col not in leading_cols]
    df_allhero = (
        df.loc[picked_labels]
        .reindex(columns=ordered_cols)
        .sort_values(by='hero_id', ascending=True)
        .reset_index(drop=True)
    )
    df_remain = df.drop(picked_labels)

    return df_allhero, hero_ids, df_remain

# Build the per-hero representative set for the slim frame (df), once using the
# longest match per hero and once using the match closest to the average length.
# Note that hero_ids is rebound here from the raw per-record list to the sorted
# unique ids returned by build_hero_ts_df.
df_allhero, hero_ids, df_all_remain = build_hero_ts_df(df)
df_allhero_avglen, hero_ids_avglen, df_avg_remain = build_hero_ts_df(df, 'Average')

# Same two selections for the extended frame (df2).
df2_allhero, hero_ids2, df2_all_remain = build_hero_ts_df(df2)
df2_allhero_avglen, hero_ids2_avglen, df2_avg_remain = build_hero_ts_df(df2, 'Average')

In [13]:
# Google Colab only: mount the user's Google Drive at /content/drive.
# This import is unavailable outside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
import os
# Switch the working directory to the output folder on the mounted drive, so the
# relative-path CSV writes further down land there.
# NOTE(review): hard-coded absolute path -- only valid in the author's Drive layout.
os.chdir("/content/drive/MyDrive/Colab Notebooks/698/data/10step_5horizon")

In [15]:
# Wrap the hero id collections in single-column DataFrames so they can be
# written with to_csv. This rebinds hero_ids / hero_ids2 from list to DataFrame.
hero_ids = pd.DataFrame(hero_ids, columns = ['hero_id'])
hero_ids2 = pd.DataFrame(hero_ids2, columns = ['hero_id'])

In [16]:
# Write every prepared frame to the current working directory, without the
# index column. The dict preserves the original write order.
csv_outputs = {
    'df_allhero.csv': df_allhero,
    'hero_ids.csv': hero_ids,
    'df_all_remain.csv': df_all_remain,
    'df2_allhero.csv': df2_allhero,
    'hero_ids2.csv': hero_ids2,
    'df2_all_remain.csv': df2_all_remain,
}
for csv_name, frame_to_write in csv_outputs.items():
    frame_to_write.to_csv(csv_name, index=False)

Unnamed: 0,match_id,hero_id,gold_t,lane,lane_role,radiant_team,dire_team,radiant_win
0,7524401509,1,"[60, 308, 617, 973, 1441, 1682, 1932, 2216, 25...",3.0,1.0,"[13, 10, 100, 30, 138]","[1, 25, 21, 112, 20]",False
1,7548252414,2,"[0, 205, 448, 831, 1235, 1491, 1780, 2272, 265...",3.0,3.0,"[67, 2, 22, 20, 40]","[8, 97, 43, 27, 121]",True
2,7559833511,3,"[0, 170, 285, 375, 505, 595, 706, 967, 1147, 1...",3.0,1.0,"[48, 76, 97, 65, 110]","[25, 13, 102, 26, 3]",False
3,7600037606,4,"[0, 287, 666, 1028, 1313, 1533, 1794, 2239, 25...",1.0,1.0,"[4, 129, 31, 126, 27]","[110, 86, 41, 34, 96]",True
4,7611209103,5,"[0, 302, 437, 652, 806, 1027, 1389, 1670, 1805...",3.0,1.0,"[44, 62, 28, 121, 85]","[109, 120, 101, 5, 129]",False
...,...,...,...,...,...,...,...,...
119,7632649002,129,"[0, 207, 592, 943, 1275, 1787, 2078, 2336, 279...",1.0,3.0,"[110, 19, 137, 41, 121]","[13, 70, 129, 9, 79]",False
120,7525812113,135,"[62, 343, 699, 898, 1185, 1517, 1855, 2670, 31...",1.0,3.0,"[35, 69, 26, 83, 22]","[6, 136, 135, 90, 20]",True
121,7519118314,136,"[0, 230, 320, 410, 592, 859, 1012, 1102, 1603,...",3.0,3.0,"[53, 51, 136, 20, 10]","[80, 112, 83, 23, 48]",False
122,7611956203,137,"[0, 323, 584, 912, 1203, 1529, 1799, 1979, 215...",2.0,2.0,"[28, 105, 20, 113, 120]","[72, 137, 63, 23, 121]",True


### Hero_ID Filter

In [17]:
#df_no_match['hero_id'].value_counts()

In [18]:
#hero_filter_value = df_no_match['hero_id'].value_counts().idxmax()
#.value_counts() - counts the number of instances each value of hero_id, sorts descending by default
# Important to note that .value_counts replaces the index with the value of the hero_id, meaning we need to pull that index value for our hero to filter by
#.idxmax() - selects the index value that contains the highest value

#df_hero_filtered = df_no_match[df_no_match['hero_id'] == hero_filter_value].reset_index(drop=True)
#df_20 = df_hero_filtered['gold_t']

In [19]:
#df_hero_filtered.iloc[0][['gold_t']]
#df_20

In [20]:
#type(df_20[0])

# Convert each row from a list to a pandas Series
#df_20_series = df_20.apply(lambda x: pd.Series(x))
#df_20_series

In [21]:
#df_20_series.iloc[:, 9:13]

The cell below determines the maximum gold value seen across all games, to be used as the constant to scale by. We do this so that the data is scaled relative to what is expected in a new game. The current scaling uses the complete time series from the match, but in a live scenario we won't have the entire time series of the match, so instead we scale according to the largest value we've seen across matches.

In [22]:
# Expand each match's gold list into one row of a wide frame, one column per
# time step.
all_heroes_ts = df['gold_t'].apply(pd.Series)

# Constant scaling bounds: zero at the bottom, and the largest gold value found
# in any match at any time step at the top.
min_gold = 0
max_gold = all_heroes_ts.max(axis=0).max()

def ConstantMinMaxScaler(series,min_gold,max_gold):
    """Min-max scale ``series`` using fixed bounds.

    ``min_gold`` maps to 0 and ``max_gold`` maps to 1. Values outside the
    bounds are scaled by the same formula and are not clipped.
    """
    gold_range = max_gold - min_gold
    shifted = series - min_gold
    return shifted / gold_range

def ConstantUnScaler(series, min_gold, max_gold):
    """Invert the fixed-bounds min-max scaling and round to whole numbers.

    Maps 0 back to ``min_gold`` and 1 back to ``max_gold``, then applies
    ``np.round`` to the result.
    """
    gold_range = max_gold - min_gold
    restored = (series * gold_range) + min_gold
    return np.round(restored)

Scaling Data

In [23]:
#scaled_df20 = (df_20_series - min_gold)/(max_gold - min_gold)

#scaled_df20

In [24]:
n_steps = 9
predict_steps = 2
# The slice `:n_steps` selects column positions 0 .. n_steps-1, i.e. the first
# n_steps time steps (positions 0-8 when n_steps = 9) -- these become X.
# The targets y start at position n_steps (9 here) and span predict_steps columns.
# Only n_steps therefore has to be declared to place the X / y boundary.
# NOTE(review): the notebook text talks about a 10-minute lookback and a 5-step
# prediction, while these values give 9 input steps and 2 target steps -- confirm
# which is intended.

def split_sequence(sequence, n_steps, predict_steps):
    """Split a wide time-series frame column-wise into inputs and targets.

    Returns a tuple ``(X, y)``. ``X`` holds the first ``n_steps`` columns of
    ``sequence`` and ``y`` holds the next ``predict_steps`` columns, both
    selected by position for every row.
    """
    target_end = n_steps + predict_steps
    input_cols = slice(None, n_steps)
    target_cols = slice(n_steps, target_end)
    return sequence.iloc[:, input_cols], sequence.iloc[:, target_cols]

#X,y = split_sequence(df_20_series,n_steps, predict_steps)
#X_scaled,y_scaled = split_sequence(scaled_df20, n_steps, predict_steps)

#print(X)
#print(y)

In [25]:
#X_scaled

In [26]:
#y_scaled

In [27]:
#import matplotlib.pyplot as plt
#import pandas as pd

#df_first_10 = df_20_series.iloc[:10, :10]

#fig, ax = plt.subplots(figsize=(10, 6))

#for i in range(len(df_first_10)):
#    ax.plot(df_first_10.columns, df_first_10.iloc[i], label=f'Row {i}')
#plt.show()

### Normalizing Data

In [28]:
# data = df_subset.gold_t

# from pandas import Series
# from sklearn.preprocessing import MinMaxScaler

# # define contrived series
# series = data.apply(pd.Series)
# print(series)

# # prepare data for normalization
# values = series.values


# scaler = MinMaxScaler(feature_range=(0, 1))
# scaler = scaler.fit(values)
# print('Min: %f, Max: %f' % (scaler.data_min_, scaler.data_max_))

# #values = values.reshape((len(values), 1))
