### Guide to this File:

This file is intended to be run from other notebooks. It creates the following objects:

df : match data [match_id, hero_id, gold_t]
df_no_match : match data without match_id [hero_id, gold_t]

missing_indices : indices that had missing gold_t values and were removed, use with print() to view
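df.loc[missing_indices, ['hero_id','gold_t']] : to be used to view the records at missing_indices; this is only valid on df before the missing rows are dropped and the index is reset, because afterwards those labels no longer refer to the same rows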

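zero_length_indices : the indices of the records whose gold_t has zero length; these rows are removed from df_no_match (df itself still contains them)
zero_length_records : the records [hero_id, gold_t] found at zero_length_indices, i.e. the rows that were removed from df_no_match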




In [5]:
import json
import pandas as pd 
import requests
from numpy import array

import torch
import torch.nn
import torch.optim as optim

# Used in LTSMModel Class Instantiation
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [6]:
# Local Load
#
# NOTE(review): these are absolute paths on a single machine. To run this
# notebook elsewhere, switch to the "Github Load" section below.

file_path = "C:\\Users\\dcrai\\source\\repos\\DATA698\\Code\\Data\\data.json"
file_path_hero = "C:\\Users\\dcrai\\source\\repos\\DATA698\\Code\\Data\\hero_id_table.csv"
#file_csv = "C:\\Users\\dcrai\\source\\repos\\DATA698\\Code\\Data\\iter_1.csv" - different match data

# Decode explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which would mis-decode any non-ASCII text in the JSON.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Github Load

#file_path = "https://raw.githubusercontent.com/d-ev-craig/DATA698/main/Code/Data/data.json"
#file_path_hero = "https://raw.githubusercontent.com/d-ev-craig/DATA698/main/Code/Data/hero_id_table.csv"

#response = requests.get(file_path)
#data = json.loads(response.text)

# Create heroes dataframe
heroes = pd.read_csv(file_path_hero)

# Extract 'match_id', 'hero_id', and 'gold_t' from each element in 'data'.
# Each element is indexed by key, so a record missing one of these keys
# raises KeyError here rather than silently producing a gap.
match_ids = [element['match_id'] for element in data]
hero_ids = [element['hero_id'] for element in data]
gold_t_values = [element['gold_t'] for element in data]

# Create match data dataframe: one row per element of 'data'
df = pd.DataFrame({'match_id': match_ids, 'hero_id': hero_ids, 'gold_t': gold_t_values})




In [7]:
# Match ID example
#7517376613

In [8]:
# Checking zero length tensors

#df.iloc[2470]
#df.iloc[2511]

### Missing Data Drop

In [9]:
# Flag the rows whose gold_t value is missing and remember their index labels.
missing_mask = df['gold_t'].isnull()
missing_indices = df.index[missing_mask]
#print(missing_indices)

# Capture the affected rows *before* they are dropped. Once the index is reset
# below, missing_indices no longer refer to these rows in df, so this is the
# only point at which they can be looked up.
missing_records = df.loc[missing_indices, ['hero_id','gold_t']]

df = df.dropna(subset=['gold_t'])
df = df.reset_index(drop=True) # Reset the indexes so no issues arise with using index locations to drop 0 length tensors

### Zero Length Tensors Drop

If we don't drop the zero length tensors, when we call `model(hero_ids,time_series)` in the training loop, it will error out when attempting to run pad_packed_sequence


In [10]:
# Collect the index *labels* of rows whose gold_t series is empty.
# Labels (via Series.items()) are used rather than enumerate() positions,
# because the result is later passed to df.loc[...] and DataFrame.drop(...),
# which are both label-based. With the freshly reset index the two coincide,
# but labels stay correct even if df's index is not 0..n-1.
zero_length_indices = []

for index, time_series in df['gold_t'].items():
    if len(time_series) == 0:
        zero_length_indices.append(index)

# Report what was found so the caller can see how many rows will be excluded.
if zero_length_indices:
    print(f"Found {len(zero_length_indices)} tensors with length 0 at indices: {zero_length_indices}")
else:
    print("No tensors with length 0 found.")

Found 4 tensors with length 0 at indices: [2460, 2501, 2512, 2583]


In [11]:
# Pull out the rows flagged as having an empty gold_t series, keeping only
# the hero_id and gold_t columns, so they can be inspected later.
zero_length_records = df[['hero_id', 'gold_t']].loc[zero_length_indices]

# NOTE(review): `df_full` is not defined in the visible part of this notebook.
#df_full.iloc[2583][['hero_id','gold_t']]
#len(df_full.iloc[2460]['gold_t'])

In [12]:
# Different methods to reduce data size for testing

#df_match = df[df['match_id'] == 7517376613]
#df_match = df[:300]
#df_match

#df_subset = df_match[['hero_id', 'gold_t']].copy()
#df_subset

# Match data without match_id: take an independent copy of the hero_id and
# gold_t columns, then remove the rows whose gold_t series is empty.
# Note that df itself is left untouched.
df_no_match = (
    df.loc[:, ['hero_id', 'gold_t']]
    .copy()
    .drop(index=zero_length_indices)
)

### Normalizing Data

In [None]:
# data = df_subset.gold_t

# from pandas import Series
# from sklearn.preprocessing import MinMaxScaler

# # define contrived series
# series = data.apply(pd.Series)
# print(series)

# # prepare data for normalization
# values = series.values


# scaler = MinMaxScaler(feature_range=(0, 1))
# scaler = scaler.fit(values)
# print('Min: %f, Max: %f' % (scaler.data_min_, scaler.data_max_))

# #values = values.reshape((len(values), 1))
