In [1]:
from functools import reduce
import sqlite3
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset

In [2]:

def create_dataloader(db_file, player_limit=5000):
    """Load the match rows played by a sample of players from a SQLite database.

    Despite the name, this returns a pandas DataFrame, not a torch DataLoader.

    Args:
        db_file: Path to the SQLite database file.
        player_limit: Maximum number of distinct player ids to draw from the
            ``players`` table. Defaults to 5000.

    Returns:
        A DataFrame holding every ``match_players`` row whose ``player_id`` is
        one of the drawn ids, left-joined to ``matches`` on
        ``matches.id = match_players.match_id``.
    """
    # The player sample is expressed as a subquery so the ids never have to be
    # formatted into the SQL text by hand, and the limit is passed as a bound
    # parameter.
    # NOTE: LIMIT without ORDER BY does not guarantee which players are drawn.
    query = (
        'SELECT * FROM match_players '
        'LEFT JOIN matches m ON m.id = match_players.match_id '
        'WHERE player_id IN (SELECT DISTINCT id FROM players LIMIT ?)'
    )

    conn = sqlite3.connect(db_file)
    try:
        return pd.read_sql_query(query, conn, params=(player_limit,))
    finally:
        # Always release the database handle, even when the query raises.
        conn.close()

This is the full, very large data frame. We leave it unmodified and derive smaller working frames from it.

In [3]:
# Build the base frame once; later cells read from it rather than modifying it.
mainframe = create_dataloader(db_file='bigdata.db')

In [8]:
# List the column labels available in mainframe.
print(mainframe.columns)

Index(['id', 'player_id', 'match_id', 'opening_id', 'civilization', 'victory',
       'parser_version', 'time_parsed', 'average_elo', 'map_id', 'time',
       'patch_id', 'ladder_id', 'patch_number'],
      dtype='object')


This cell is dedicated to finding information about the data. Some of the data points have '-1' as a value for the keys. In the documentation, it states that the default value of victory is -1, along with other keys. We also have over 6000 unique openings in the data. In order to not bias the model, we should condense these into only a few general, broad openings. There are also instances where the average elo is 0, which cannot be correct because that value is not possible in game.

In [1]:
# Exploratory checks on `mainframe`; every statement in this cell is currently commented out.

# # 5207 uncertain victories
# print(mainframe.groupby(by='victory').size())


# # Civilization values
# print(mainframe.groupby(by='civilization').size())


# #Opening ids
# print(mainframe.groupby(by='opening_id').size())


# #time of match
# print(mainframe.groupby(by='time').size())


# #Elo
# print(mainframe.groupby(by='average_elo').size())


# #This is how many unique openings the data observes. 6000+
# print(len(pd.unique(mainframe['opening_id'])))

# #This is what openings are chosen
# # NOTE(review): this is the same groupby as the "Opening ids" check above.
# print(mainframe.groupby(by='opening_id').size())

# #This is how many unique maps that the data uses: 54
# print(len(pd.unique(mainframe['map_id'])))


# #This shows if all columns are of the same size: True
# if mainframe.apply(len).nunique() == 1:
#     print("The length of each column is the same.")
# else:
#     print("The length of each column is different.")


# #This is how much time was spent in the game
# # NOTE(review): the `time` column above is also described as a match time - TODO confirm what `time_parsed` holds.
# print(mainframe.groupby(by='time_parsed').size())



This cell is dedicated to cleaning up the data. In the data, there are rows with -1 values that are of no use and can be safely removed, since they don't account for a significant portion of the data. We also want to change the opening_id values so that they represent only a few different openings.

In [17]:
# Filter and trim in one pass over the untouched mainframe:
#   * keep a row only if no column holds -1 and its average_elo is not 0
#   * then discard the patch, parse-time and identifier columns
# NOTE: ladder_id is not in the drop list, so it stays in altframe.
altframe = (
    mainframe
    .loc[(mainframe != -1).all(axis=1) & (mainframe['average_elo'] != 0)]
    .drop(columns=['patch_number', 'patch_id', 'time_parsed', 'match_id', 'player_id', 'id'])
)

print(altframe.shape)

#Crunch down the opening_ids


(372943, 8)


In [2]:
# #Preprocess and normalize the data
# # NOTE(review): train_loader / val_loader are not defined in any cell visible above -
# # TODO confirm where they are created before re-enabling this cell.

# import numpy as np
# from sklearn.preprocessing import StandardScaler

# def preprocess_data(train_loader, val_loader):
#     # Get the mean and standard deviation of the training data
#     scaler = StandardScaler()
#     for x, y in train_loader:
#         scaler.partial_fit(x.numpy())
#     mean = scaler.mean_
#     std = np.sqrt(scaler.var_)
    
#     # Normalize the data using the mean and standard deviation
#     for loader in [train_loader, val_loader]:
#         for i, (x, y) in enumerate(loader):
#             x = x.numpy()
#             x = (x - mean) / std
#             # NOTE(review): `i` is the batch number, not a sample index, and this assumes the
#             # dataset exposes a writable `.data`; a TensorDataset keeps its tensors in
#             # `.tensors` - verify before re-enabling.
#             loader.dataset.data[i] = (torch.from_numpy(x), y)
    
#     return train_loader, val_loader
# train_loader, val_loader = preprocess_data(train_loader, val_loader)
# print("All done with preprocessing and normalizing the data!")

In [None]:
# #Use CART model from sklearn

# from sklearn.tree import DecisionTreeClassifier

# # Create a CART model
# model = DecisionTreeClassifier()

# # Train the model using the training DataLoader
# # NOTE(review): DecisionTreeClassifier.fit() trains from scratch on every call, so this
# # loop would leave the model fitted on the last batch only.
# for x, y in train_loader:
#     model.fit(x.numpy(), y.numpy())
    
# # Evaluate the model on the validation DataLoader
# correct = 0
# total = 0
# for x, y in val_loader:
#     y_pred = model.predict(x.numpy())
#     correct += (y_pred == y.numpy()).sum().item()
#     total += y.shape[0]
# accuracy = correct / total
# print(f"Validation accuracy: {accuracy:.3f}")