In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

from sklearn import linear_model
from sklearn import neighbors
from sklearn import svm as svm_classifier
from sklearn.metrics import make_scorer
from sklearn.externals import joblib

from trueskill import BETA
from math import sqrt
from trueskill.backends import cdf

In [None]:
rank = pd.read_csv('rank_.csv')

svm = joblib.load('./ready_models/svm_picks_rates.pkl')
logreg = joblib.load('./ready_models/logreg_picks_rates.pkl')

joblib.dump(logreg, './ready_models/logreg_picks_rates.pkl')

In [None]:
def encodeHeroes(radiant_pick, dire_pick):
    onehotheroes = np.zeros(228)
    for hero in radiant_pick:
        if hero != 0:
            onehotheroes[hero] = 1
    for hero in dire_pick:
        if hero != 0:
            onehotheroes[hero + 114] = 1
    return onehotheroes

In [None]:
def scoreFunc(y, y_pred):
    if y == 1: 
        if np.log2(y_pred) < -5:
            return -5
        return np.log2(y_pred) + 1
    else:
        if np.log2(1 - y_pred) < -5:
            return -5
        return np.log2(1 - y_pred) + 1
log_score = make_scorer(scoreFunc)

In [None]:
poly_and_rates = pd.read_csv('TimeWinRatesPoly.csv')
polies = [[] for i in range(115)]
for index, line in poly_and_rates.iterrows():
    
    pol = [line[6], line[7], line[8], line[9]]
    polies[int(line[1])] = np.poly1d(pol)
    
def calcTeamWinRate(time, pick, polies):
    rates = [polies[i](time) for i in pick]
    return sum(rates)

In [None]:
def get_worth(match):
    radiant_gold_xp = [0,0]
    dire_gold_xp = [0,0]
    
#     get_pick

    radiant_pick = []
    dire_pick = []
    try:    
        radiant_players = match["scoreboard"].values[0]["radiant"]["players"]
        dire_players = match["scoreboard"].values[0]["dire"]["players"]
        time = match["scoreboard"].values[0]["duration"]
        #     print time
        for i in radiant_players:
            radiant_pick.append(i["hero_id"])
            radiant_gold_xp[0]+=i["net_worth"]
            radiant_gold_xp[1]+=i["xp_per_min"]*float(time)/60
        for i in dire_players:
            dire_pick.append(i["hero_id"])
            dire_gold_xp[0]+=i["net_worth"]
            dire_gold_xp[1]+=i["xp_per_min"]*float(time)/60
        radiant_name = match["radiant_team"].values[0]["team_name"]
        dire_name = match["dire_team"].values[0]["team_name"]    

    except:
        return -1
    return match["match_id"].values[0],radiant_gold_xp, dire_gold_xp,radiant_pick,dire_pick,radiant_name,dire_name,time

In [None]:
# XP GOLD PICKS TIME TEAMS
def get_XP_GOLD_from_moment(current_match):
    return get_worth(current_match)
#     print current_match

In [None]:
def get_information_about_one_match(df,match_id):
    current_matches = df[df.match_id == match_id]
#     print current_matches
    result = []
    for i in range (len(current_matches)):
#     i = 3
        result.append(get_XP_GOLD_from_moment(current_matches[i:i+1]))
    return result
#     print current_matches[0:1]
        

In [None]:
f = open('shanghai_live_league_games.jsonlines', 'r')
alldata = []
for line in f:
        alldata.append(json.loads(line))
df = pd.DataFrame((alldata))

In [None]:
# get_information_about_one_match(df,2176488482)
all_match_ids = df["match_id"].unique()
alldata = []
for id in all_match_ids:
    get = get_information_about_one_match(df,id)
    if (get!=-1):
        alldata.append(get)
#THE_BIGGEST_ARRAY[1]

In [None]:
timedata = []
for match in alldata:
    for timeevent in match:
        timedata.append([timeevent])

In [None]:
wins = pd.read_csv('match_results.csv')

In [None]:
def win_probability(player_rating_mu, player_rating_sigma, opponent_rating_mu, opponent_rating_sigma):
    delta_mu = player_rating_mu - opponent_rating_mu
    denom = sqrt(2 * (BETA * BETA) + pow(player_rating_sigma, 2) + pow(opponent_rating_sigma, 2))
    return cdf(delta_mu / denom)

In [341]:
data = pd.DataFrame(columns = ['logreg', 'svm', 'radiant_rank_win_prop', 'radiant_gold_percentage', 'radiant_exp_percentage', 'radiant_winrate_prop', 'winner'])

err = 0 
for m in timedata:
    match = m[0]
    try:

        heroes = encodeHeroes(match[3], match[4])
        radiant_win = wins[wins.match_id == match[0]].radiant_win.values

        logreg_pred = logreg.predict_proba(np.array(list(heroes)).reshape(1, -1))[0][1]
        svm_pred = svm.predict_proba(np.array(list(heroes)).reshape(1, -1))[0][1]
        
        if radiant_win:
            winner = 1
        else:
            winner = 0
        #if row['dire_name'] in rank['0'].values:
        dire_rank_mu = 25
        dire_rank_sigma = 8.3
        radiant_rank_mu = 25
        radiant_rank_sigma = 8.3

        if match[5] in rank['0'].values:
            radiant_rank_mu = rank[rank['0'] == match[5]]['1'].values[0]
            radiant_rank_sigma = rank[rank['0'] == match[5]]['2'].values[0]
        if match[6] in rank['0'].values:
            dire_rank = rank[rank['0'] == match[6]]['1'].values[0]
            dire_rank_sigma = rank[rank['0'] == match[6]]['1'].values[0]
            
        radiant_rank_win_prop = win_probability(radiant_rank_mu, radiant_rank_sigma, dire_rank_mu, dire_rank_sigma)
        duration = match[-1]
        
        radiant_gold = match[1][0]
        dire_gold = match[2][0]
        radiant_gold_percentage = 0.5
        if duration > 450:
            radiant_gold_percentage = 1 / (1 + np.exp(float(radiant_gold - dire_gold) / 15000))
        
        radiant_exp = match[1][1]
        dire_exp = match[2][1]
        radiant_exp_percentage = 0.5
        if duration > 450:
            radiant_exp_percentage = 1 / (1 + np.exp(float(radiant_exp - dire_exp) / 15000))
        
        radiant_winrate_prop = 0.5
        #print match[3]
        #print match[4]
        if not 0 in match[3] and not 0 in match[4]:
            #print 'asasa'
            radiant_winrate = calcTeamWinRate(duration, match[3], polies)
            dire_winrate = calcTeamWinRate(duration, match[4], polies)

            radiant_winrate_prop = 1 / (1 + np.exp((radiant_winrate - dire_winrate) * 2))
        row = [logreg_pred, svm_pred, radiant_rank_win_prop, radiant_gold_percentage, radiant_exp_percentage, radiant_winrate_prop, winner]


        data.loc[len(data)] = row
    except:
        err += 1

In [344]:
np.percentile(data.radiant_winrate_prop.values, 10)


0.3536147324643707

In [None]:
def splitData(data):
    shuffled_data = data.iloc[np.random.permutation(len(data))]
    shuffled_data.reset_index(drop=True)

    train_data = shuffled_data[:int(len(shuffled_data) * 0.7)]
    test_data = shuffled_data[int(len(shuffled_data) * 0.3):]

    
    return train_data, test_data

def prepareAllData(data):
    train_data, test_data = splitData(data)

    X_train = train_data[data.columns[:-1]]

    Y_train = train_data[data.columns[-1:]].values[:,-1].astype('float')

    X_test = test_data[data.columns[:-1]]

    Y_test = test_data[data.columns[-1:]].values[:,-1].astype('float')

    return X_train, Y_train, X_test, Y_test

In [348]:
X_train, Y_train, X_test, Y_test = prepareAllData(data)

In [None]:
from sklearn.grid_search import GridSearchCV

weights = {'logreg' : 1,
           'svm' : 1, 
           'radiant_rank_win_prop' : 1,
           'radiant_gold_percentage' : 1,
           'radiant_exp_percentage' : 1,
           'radiant_winrate_prop' : 1}

final_log_reg =  linear_model.LogisticRegression(C=1e5, class_weight=weights)
test_w = [{'logreg' : i,
           'svm' : i, 
           'radiant_rank_win_prop' : i,
           'radiant_gold_percentage' : i,
           'radiant_exp_percentage' : i,
           'radiant_winrate_prop' : i}  for i in np.linspace(0, 1, 10)]

grid = GridSearchCV(LinearSVC(), param_grid = [{'class_weight' :test_w}], scoring=log_score)

grid.fit(X_train.values.astype('float'), Y_train)


In [311]:
def countCorrect(pred, label, round_flag = False):
    bin_pred = pred #[round(i) for i in range(len(pred))]
    if round_flag:
        bin_pred = np.zeros(len(pred))
        bin_pred[pred > 0.5] = 1
    count = 0
    for i in bin_pred == label:
        if i:
            count += 1

    return float(count)/len(bin_pred)

In [349]:
fin = linear_model.LogisticRegression(C=1e5)

fin.fit(X_train, Y_train)


fin_pred = fin.predict_proba(X_test)[:, 1]

print('logreg: {0}'.format(countCorrect(fin_pred, Y_test, True)))

joblib.dump(fin, './ready_models/fin.pkl') 


logreg: 0.713748984565


['./ready_models/fin.pkl',
 './ready_models/fin.pkl_01.npy',
 './ready_models/fin.pkl_02.npy',
 './ready_models/fin.pkl_03.npy',
 './ready_models/fin.pkl_04.npy']

In [350]:
X_test[1:2] 

Unnamed: 0,logreg,svm,radiant_rank_win_prop,radiant_gold_percentage,radiant_exp_percentage,radiant_winrate_prop
12700,0.639051,0.50701,0.55568,0.870539,0.883074,0.448339


In [355]:
np.percentile(fin_pred, 90)

0.82967816723970689

In [367]:
#X_test[fin_pred > 0.90]

In [357]:
fin_pred > 0.90

array([False, False, False, ..., False, False,  True], dtype=bool)

In [359]:
superpred = []

for row in 

SyntaxError: invalid syntax (<ipython-input-359-0e4b5c099d36>, line 3)

Unnamed: 0.1,Unnamed: 0,0,1,2
201,201,Team Secret,31.076908,0.811236
