# Capstone Data Processing and Model Training

Data was obtained through the Riot API and cached on a local MongoDB server.

In [1]:
import datetime
import numpy as np
import pandas as pd
from pymongo import MongoClient
# Connect to the MongoDB cache and select the database holding the Riot data.
client = MongoClient("mongodb://MongoDB.lan:27017/")
db = client["RiotDB"]

## Obtain List of Games for Analysis

`matchListNA1` for games on NA1 server only, `matchListGlobal` for games on all global servers

In [2]:
# Filter shared by both lists: a single queue and a single game version.
baseFilter = {"queueId": 450, "gameVersion": "8.9.228.4283"}

# Distinct game keys restricted to the NA1 platform.
matchListNA1 = db.MatchStats.distinct("platformId_gameId", dict(baseFilter, platformId="NA1"))
print(len(matchListNA1))

# Distinct game keys across every platform in the cache.
matchListGlobal = db.MatchStats.distinct("platformId_gameId", baseFilter)
print(len(matchListGlobal))

7981
68385


`matchListFull` for production, `matchListTest` for development/debug

In [3]:
# Choose the dataset used by the cells below; swap the two assignments to train on NA1 only.
#matchListFull = matchListNA1
matchListFull = matchListGlobal
# Optional small random sample of games for development/debugging.
#matchListTest = list(np.random.choice(matchListFull, 5))

## Construct Features/Target Matrix (development)

Define the `$project` stage that turns each game event into signed kill, turret and inhibitor changes

In [4]:
def _signedChange(isEvent, field, threshold):
    """Return a `$cond` expression worth 0 unless `isEvent` holds.

    When `isEvent` holds, the value is 1 if `field` is greater than
    `threshold` and -1 otherwise.
    """
    return {
        "$cond": {
            "if": isEvent,
            "then": {
                "$cond": {
                    "if": {"$gt": [field, threshold]},
                    "then": 1,
                    "else": -1
                }
            },
            "else": 0
        }
    }


def _buildingKilled(buildingType):
    """Return the condition matching a BUILDING_KILL event of `buildingType`."""
    return {"$and": [{"$eq": ["$type", "BUILDING_KILL"]}, {"$eq": ["$buildingType", buildingType]}]}


# $project specification: keep the game key and timestamp, and express every
# event as a signed change in kills, turrets and inhibitors.
projectTeam = {
    "_id": 0,
    "platformId_gameId": 1,
    "timestamp": 1,
    # Sign is decided by the victim's participant id (> 5 gives +1).
    "KillChange": _signedChange({"$eq": ["$type", "CHAMPION_KILL"]}, "$victimId", 5),
    # Sign is decided by the event's teamId (> 150 gives +1).
    "TurretChange": _signedChange(_buildingKilled("TOWER_BUILDING"), "$teamId", 150),
    "InhibChange": _signedChange(_buildingKilled("INHIBITOR_BUILDING"), "$teamId", 150),
}

Retrieve game events

In [5]:
# Stage 1: keep only champion-kill and building-kill events of the selected games.
matchStage = {"$match": {"$or": [{"type": "CHAMPION_KILL"}, {"type": "BUILDING_KILL"}], "platformId_gameId": {"$in": matchListFull}}}
# Stage 2: reduce every event to its signed change columns.
projectStage = {"$project": projectTeam}
pipeline = [matchStage, projectStage]

# Materialise the cursor straight into a DataFrame (one row per event).
eventCursor = db.MatchEvents.aggregate(pipeline)
team_change_df = pd.DataFrame(list(eventCursor))
team_change_df.head()

Unnamed: 0,InhibChange,KillChange,TurretChange,platformId_gameId,timestamp
0,0,-1,0,BR1_1371887873,104581
1,0,-1,0,BR1_1371887873,202529
2,0,-1,0,BR1_1371887873,317902
3,0,-1,0,BR1_1371887873,318892
4,0,1,0,BR1_1371887873,321136


Aggregate game events to obtain cumulative state of game at each event

In [6]:
# After the cumulative sum the columns hold running differences, so rename them.
diffNames = {"KillChange": "KillDiff", "TurretChange": "TurretDiff", "InhibChange": "InhibDiff"}
team_score_df = (
    team_change_df
    .set_index(["platformId_gameId", "timestamp"])
    .sort_index()                   # order events by game, then by timestamp
    .rename(columns=diffNames)
    .groupby("platformId_gameId")
    .cumsum()                       # running total within each game
    .reset_index()
)
team_score_df.head()

Unnamed: 0,platformId_gameId,timestamp,InhibDiff,KillDiff,TurretDiff
0,BR1_1371887873,104581,0,-1,0
1,BR1_1371887873,202529,0,-2,0
2,BR1_1371887873,317902,0,-3,0
3,BR1_1371887873,318892,0,-4,0
4,BR1_1371887873,321136,0,-3,0


Retrieve game outcomes

In [7]:
# The outcome label is the `win` flag of the first entry in `participants`.
firstParticipantWin = {"$arrayElemAt": ["$participants.stats.win", 0]}
pipeline = [
    {"$match": {"platformId_gameId": {"$in": matchListFull}}},
    {"$project": {"_id": 0, "platformId_gameId": 1, "team_stats": firstParticipantWin}},
]
outcomeCursor = db.MatchStats.aggregate(pipeline)
team_win_df = pd.DataFrame(list(outcomeCursor))
team_win_df.head()

Unnamed: 0,platformId_gameId,team_stats
0,BR1_1371887873,False
1,BR1_1371888041,True
2,BR1_1371888334,True
3,BR1_1371890213,False
4,BR1_1371891246,True


Match and merge features and targets

In [8]:
# Attach the game outcome to every event row; games missing from either side are dropped.
data_df = team_score_df.merge(team_win_df, how="inner", on="platformId_gameId")
data_df.head()

Unnamed: 0,platformId_gameId,timestamp,InhibDiff,KillDiff,TurretDiff,team_stats
0,BR1_1371887873,104581,0,-1,0,False
1,BR1_1371887873,202529,0,-2,0,False
2,BR1_1371887873,317902,0,-3,0,False
3,BR1_1371887873,318892,0,-4,0,False
4,BR1_1371887873,321136,0,-3,0,False


Split features/target matrix

In [9]:
# Everything except the game key and the outcome is a model input.
nonFeatureColumns = ["platformId_gameId", "team_stats"]
features_df = data_df.drop(columns=nonFeatureColumns)
# The outcome flag becomes an integer target.
target_df = data_df["team_stats"].astype("int")
len(target_df)

5526557

## Implement Features/Target Matrix Pipeline

In [10]:
def teamData(matchList):
    """Build the features/target matrices for the given games.

    Reads champion-kill and building-kill events from `db.MatchEvents`,
    accumulates them per game into running differences, and labels every
    event row with the game's outcome read from `db.MatchStats`. Relies on
    the notebook-level `db` handle and `projectTeam` projection.

    Parameters
    ----------
    matchList : iterable of str
        `platformId_gameId` keys of the games to include. Any iterable is
        accepted; it is converted to a list before being sent to MongoDB.

    Returns
    -------
    features_df : pandas.DataFrame
        One row per event with the columns `timestamp`, `InhibDiff`,
        `KillDiff` and `TurretDiff`, always in that order.
    target_df : pandas.Series
        `team_stats` (the `win` flag of the first participant) as integers,
        sharing its index with `features_df`.

    Both results are empty when `matchList` is empty or when no matching
    events or outcomes are found.
    """
    # Fixed column order so the feature layout does not depend on the pandas
    # version or on the key order of the returned documents.
    featureColumns = ["timestamp", "InhibDiff", "KillDiff", "TurretDiff"]

    def _emptyResult():
        # Same layout as a normal result, just without rows.
        return pd.DataFrame(columns=featureColumns, dtype="int"), pd.Series(name="team_stats", dtype="int")

    # Tuples, generators, arrays etc. are not all BSON-encodable; a list is.
    matchList = list(matchList)
    if not matchList:
        # `$in: []` matches nothing, so skip the round-trips entirely.
        return _emptyResult()

    pipeline = [
        {"$match": {"$or": [{"type": "CHAMPION_KILL"}, {"type": "BUILDING_KILL"}], "platformId_gameId": {"$in": matchList}}},
        {"$project": projectTeam}
    ]
    team_change_df = pd.DataFrame(list(db.MatchEvents.aggregate(pipeline)))
    if team_change_df.empty:
        # Without rows there are no columns to index on; set_index would raise.
        return _emptyResult()

    team_score_df = (
        team_change_df
        .set_index(["platformId_gameId", "timestamp"])
        .sort_index()
        .rename(columns={"KillChange": "KillDiff", "TurretChange": "TurretDiff", "InhibChange": "InhibDiff"})
        .groupby(level="platformId_gameId")
        .cumsum()
        .reset_index()
    )

    pipeline = [
        {"$match": {"platformId_gameId": {"$in": matchList}}},
        {"$project": {
            "_id": 0,
            "platformId_gameId": 1,
            "team_stats": {"$arrayElemAt": ["$participants.stats.win", 0]}
        }}
    ]
    team_win_df = pd.DataFrame(list(db.MatchStats.aggregate(pipeline)))
    if team_win_df.empty:
        return _emptyResult()
    # A game stored more than once in MatchStats would otherwise duplicate
    # every one of its event rows in the merge below.
    team_win_df = team_win_df.drop_duplicates(subset="platformId_gameId")

    data_df = pd.merge(team_score_df, team_win_df, how="inner", on="platformId_gameId")
    features_df = data_df[featureColumns]
    target_df = data_df["team_stats"].astype("int")
    return features_df, target_df

## Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Split the list of games (not the event rows) so that every event of a game
# ends up on the same side of the split. The fixed random_state makes the
# split, and therefore everything trained and evaluated on it, reproducible.
matchList_train, matchList_test = train_test_split(matchListFull, random_state=42)
features_train, target_train = teamData(matchList_train)
print(len(target_train))
print(matchList_train[:5])
features_test, target_test = teamData(matchList_test)
print(len(target_test))
print(matchList_test[:5])

## Hyperparameter Optimization by Cross-Validation

In [None]:
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def parcv(n_est, depth, leaf):
    """Cross-validate one random-forest configuration on the training data.

    Uses the notebook-level `features_train` and `target_train`.

    Parameters
    ----------
    n_est : int
        Number of trees (`n_estimators`).
    depth : int
        Maximum tree depth (`max_depth`).
    leaf : int
        Minimum number of samples per leaf (`min_samples_leaf`).

    Returns
    -------
    dict
        The three hyperparameters plus the mean (`score_mean`) and standard
        deviation (`score_std`) of the cross-validation scores.
    """
    # Fixed random_state so that score differences between configurations
    # come from the hyperparameters, not from the forest's random draws.
    model_rfc = RandomForestClassifier(oob_score=True, max_features=None, n_estimators=n_est, max_depth=depth, min_samples_leaf=leaf, random_state=42)
    scoreArray = cross_val_score(model_rfc, features_train, target_train)
    scoreDict = {'n_est': n_est, 'depth': depth, 'leaf': leaf, 'score_mean': np.mean(scoreArray), 'score_std': np.std(scoreArray)}
    print(scoreDict)
    return scoreDict

# Search grid: n_estimators in {32, 64, 128}, max_depth in {16, 18, 20},
# min_samples_leaf in {32, 64}.
n_estArray = np.logspace(5, 7, num=3, base=2).astype(int)
depthArray = np.linspace(16, 20, num=3).astype(int)
leafArray = np.logspace(5, 6, num=2, base=2).astype(int)
cv_results = Parallel(n_jobs=-3, verbose=100)(delayed(parcv)(n, d, l) for n in n_estArray for d in depthArray for l in leafArray)
cv_df = pd.DataFrame(cv_results)
# idxmax() returns an index *label*, so it must be looked up with .loc, not .iloc.
print(cv_df.loc[cv_df['score_mean'].idxmax()])

## Train and Persist Model with Optimized Hyperparameters

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Final model. The fixed random_state makes the persisted model reproducible
# from the same training data.
# NOTE(review): max_depth is left at its default here although the grid
# search above varies it; confirm this is intentional.
model_rfc = RandomForestClassifier(oob_score=True, max_features=None, n_estimators=64, min_samples_leaf=64, random_state=42)
model_rfc.fit(features_train, target_train)

In [None]:
from joblib import dump
# Only write the NA1 model file when the model was trained on the NA1 games.
# Without this guard, a Run All with the Global dataset selected would
# overwrite the NA1 file with the Global model.
if matchListFull == matchListNA1:
    dump(model_rfc, "model_rfc_NA1_nest64_leaf64.pkl")
else:
    print("Skipped: model_rfc was not trained on matchListNA1")

In [None]:
from joblib import dump
# Only write the Global model file when the model was trained on the Global
# games, so a run on another dataset cannot overwrite it by accident.
if matchListFull == matchListGlobal:
    dump(model_rfc, "model_rfc_Global_nest64_leaf64.pkl")
else:
    print("Skipped: model_rfc was not trained on matchListGlobal")

In [None]:
from joblib import dump
# Persist the split and the derived matrices together; whoever loads the file
# must unpack the list in this same order.
modelData = [matchList_train, matchList_test, features_train, target_train, features_test, target_test]
dump(modelData, "model_data_Global.pkl")

In [12]:
# Close the MongoDB connection opened at the top of the notebook.
client.close()