In [None]:
import pandas as pd
import numpy as np
from scipy.stats import lognorm
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn import tree

In [None]:
# Load the tab-separated runs table. The categorical columns leg_id and
# num_runs are stored as strings; the raw "leg" and "team" columns are dropped.
in_file_name = 'data/runs_ju.tsv'
runs = (
    pd.read_csv(in_file_name, sep="\t")
    .assign(leg_id=lambda frame: frame["leg"].astype(str))
    .assign(num_runs=lambda frame: frame["num_runs"].astype(str))
    .drop(["leg", "team"], axis=1)
)
runs.head()

In [None]:
# Summarise the columns of runs (dtypes, non-null counts, memory usage).
runs.info()

In [None]:
%%javascript
// Disable auto-scrolling of cell output: the notebook's scroll check is
// replaced by a function that always answers "do not scroll".
// Execute this cell before the other cells are executed.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
import seaborn as sns

sns.set(rc={})
# One panel per leg (two panels per row), coloured by year: pace against
# team_id with a second-order polynomial regression fit.
g = sns.FacetGrid(
    runs,
    col="leg_id",
    col_wrap=2,
    hue="year",
    xlim=(0, runs.team_id.max()),
    ylim=(4, 20),
    height=6,
    aspect=1,
    legend_out=False,
)
g.map(sns.regplot, "team_id", "pace", order=2, scatter_kws={'alpha': 0.1})
g.add_legend()


In [None]:
# Pace density per year, all drawn on a single (un-faceted) axes.
g = sns.FacetGrid(
    runs,
    hue="year",
    height=8,
    aspect=2,
    xlim=(5, 20),
    margin_titles=True,
    legend_out=False,
)
g.map(sns.distplot, "pace", hist=False)
g.add_legend()


In [None]:
# Pace density per leg, all drawn on a single (un-faceted) axes.
g = sns.FacetGrid(
    runs,
    hue="leg_id",
    height=8,
    aspect=2,
    xlim=(5, 20),
    margin_titles=True,
    legend_out=False,
)
g.map(sns.distplot, "pace", hist=False)
g.add_legend()

In [None]:
# Pace density per num_runs value, all drawn on a single (un-faceted) axes.
g = sns.FacetGrid(
    runs,
    hue="num_runs",
    height=8,
    aspect=2,
    xlim=(5, 20),
    ylim=(0, 0.4),
    margin_titles=True,
    legend_out=False,
)
g.map(sns.distplot, "pace", hist=False)
g.add_legend()

In [None]:
# Pace density per team country, all drawn on a single (un-faceted) axes.
g = sns.FacetGrid(
    runs,
    hue="team_country",
    height=8,
    aspect=2,
    xlim=(5, 20),
    ylim=(0, 0.5),
    margin_titles=True,
    legend_out=False,
)
g.map(sns.distplot, "pace", hist=False)
g.add_legend()

In [None]:
# Histogram of every pace value on a single axes, with scipy's lognorm
# distribution fitted to the data and drawn on top.
g = sns.FacetGrid(
    runs,
    height=8,
    aspect=2,
    xlim=(5, 25),
    margin_titles=True,
)
g.map(sns.distplot, "pace", hist=True, fit=lognorm)
g.add_legend()

In [None]:
# One row of the grid per leg; within each row one pace density per year.
g = sns.FacetGrid(
    runs,
    row="leg_id",
    hue="year",
    aspect=3,
    xlim=(5, 20),
    margin_titles=True,
    legend_out=False,
)
g.map(sns.distplot, "pace", hist=False)
g.add_legend()

In [None]:
# Full year x leg grid: each panel shows the pace histogram of one
# (year, leg) combination with a fitted lognorm curve.
g = sns.FacetGrid(
    runs,
    row="year",
    col="leg_id",
    height=6,
    xlim=(5, 20),
    margin_titles=True,
    despine=True,
)
g.map(sns.distplot, "pace", fit=lognorm, hist_kws={'alpha': 0.8})
g.add_legend()

In [None]:
# Show the rows ordered by pace, smallest value first.
runs.sort_values(by="pace")

In [None]:
# Show the rows ordered by num_runs.
# NOTE(review): num_runs was converted to str when the data was loaded, so
# this ordering is lexicographic ("10" sorts before "2"), not numeric.
runs.sort_values(by="num_runs")

In [None]:
# First name = the text before the first space of `name`. n=1 stops after the
# first split, so no wide intermediate frame (one column per word) is built.
runs["first_name"] = runs["name"].str.split(" ", n=1).str[0]

# Occurrences per first name; only names seen more than 200 times get their
# own category, every other name is pooled below.
counts = runs["first_name"].value_counts()
top_counts = counts[counts > 200]


def top_name(first_name):
    """Return `first_name` when it is one of the frequent names, else "NA".

    Frequent names are the index labels of the module-level `top_counts`
    series. Missing values are not in that index and therefore map to "NA".
    """
    return first_name if first_name in top_counts.index else "NA"


# Element-wise map over the single column; a row-wise apply(axis=1) would
# build a Series for every row just to read one field.
runs["top_first_name"] = runs["first_name"].map(top_name)
top_counts.describe()

In [None]:
# Box plot of pace for every frequent first name. `order` lists only the
# names in top_counts, so the pooled "NA" bucket is not drawn.
plt.figure(figsize=(16, 30))
ax_names = sns.boxplot(
    data=runs,
    x="pace",
    y="top_first_name",
    order=top_counts.index.tolist(),
    fliersize=0.5,
)
ax_names.set_xlim(4, 20)

In [None]:
# Occurrences per team country; only countries seen more than 200 times keep
# their own category, every other country is pooled below.
team_country_counts = runs["team_country"].value_counts()
team_country_top_counts = team_country_counts[team_country_counts > 200]


def top_country(team_country):
    """Return `team_country` when it is a frequent country, else "OTHER".

    Frequent countries are the index labels of the module-level
    `team_country_top_counts` series. Missing values are not in that index
    and therefore map to "OTHER".
    """
    return team_country if team_country in team_country_top_counts.index else "OTHER"


# Element-wise map over the single column; a row-wise apply(axis=1) would
# build a Series for every row just to read one field.
runs["top_team_country"] = runs["team_country"].map(top_country)


In [None]:
# Box plot of pace for every frequent team country. `order` lists only the
# countries in team_country_top_counts, so the pooled "OTHER" bucket is not drawn.
plt.figure(figsize=(16, 10))
ax_names = sns.boxplot(
    data=runs,
    x="pace",
    y="top_team_country",
    order=team_country_top_counts.index.tolist(),
    fliersize=0.5,
)
ax_names.set_xlim(4, 20)

In [None]:
# Pace density per pooled team country (frequent countries plus "OTHER"),
# all drawn on a single (un-faceted) axes.
g = sns.FacetGrid(
    runs,
    hue="top_team_country",
    height=8,
    aspect=2,
    xlim=(5, 20),
    ylim=(0, 0.5),
    margin_titles=True,
    legend_out=False,
)
g.map(sns.distplot, "pace", hist=False)
g.add_legend()

In [None]:
import json

# Non-linear transforms of team_id, offered to the models next to the raw value.
runs = runs.assign(
    team_id_log=np.log(runs.team_id),
    team_id_log10=np.log10(runs.team_id),
    team_id_square=np.square(runs.team_id),
)

# Feature frame: one-hot encoded categorical columns plus the team_id
# transforms. The raw team_id goes into column 0, which the scatter plot
# further down reads back as x_test[:, 0].
first_names = pd.get_dummies(runs[["top_first_name", "leg_id", "num_runs", "top_team_country"]])
first_names[["team_id_log", "team_id_log10", "team_id_square"]] = runs[["team_id_log", "team_id_log10", "team_id_square"]]
first_names.insert(0, "team_id", runs["team_id"])

# Persist the column order of the feature matrix.
with open("data/unknown_runners_feature_columns.json", 'w') as outfile:
    json.dump(first_names.columns.tolist(), outfile)

# Force a plain float matrix: dummy columns mixed with float columns can
# otherwise come out of `.values` as an object-dtype array.
x = first_names.astype(float).values
# Target as a column vector of shape (n_samples, 1).
y = runs.pace.values.reshape(-1, 1)

In [None]:
# Dimensions of the feature matrix x.
x.shape

In [None]:
# Hold out 33% of the rows as a test set; the fixed random_state keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=2019)

In [None]:
# Shapes of the training features, training targets and test features.
display(x_train.shape, y_train.shape, x_test.shape)

In [None]:
# Linear-regression baseline fitted on the training split.
regr = linear_model.LinearRegression()
regr.fit(x_train, y_train)


In [None]:
# Pair every feature column with its fitted coefficient. y is a column
# vector, so regr.coef_ has a single row, selected with [0]. The
# 'top_first_name_' prefix is stripped to leave the bare first name.
coefs = pd.DataFrame(
    {'feature': first_names.columns, 'coef': regr.coef_[0]}
).assign(
    feature=lambda frame: frame['feature'].str.replace('top_first_name_', '')
)
# First the 20 smallest coefficients, then the 20 largest.
display(
    coefs.sort_values(by="coef").head(20),
    coefs.sort_values(by="coef", ascending=False).head(20),
)

In [None]:
# Make predictions using the testing set
y_pred = regr.predict(x_test)

print("Mean squared error: %.3f" % mean_squared_error(y_test, y_pred))
# r2_score is the coefficient of determination (1 is a perfect prediction);
# it is a different metric from sklearn's explained_variance_score, so it is
# labelled as R^2 here.
print("Coefficient of determination (R^2): %.3f" % r2_score(y_test, y_pred))
y_pred

# Results recorded from earlier runs (the "Explained variance score" figures
# below are values of r2_score):
# Simple linear: Mean squared error: 6.44 Explained variance score: 0.30
# log + square = Mean squared error: 6.21 Explained variance score: 0.320
# 100 first names + leg_id = Mean squared error: 7.842 Explained variance score: 0.101
# 452 first names + leg_id + log + square = Mean squared error: 5.387 Explained variance score: 0.382

In [None]:
# Actual (red) and linear-model predicted (blue) pace against team_id, which
# is column 0 of the feature matrix; plotted as in the scikit-learn example
# at http://scikit-learn.org/
for pace_values, colour in ((y_test, 'red'), (y_pred, 'blue')):
    plt.scatter(x_test[:, 0], pace_values, color=colour, alpha=0.01)
plt.ylim(4, 20)
plt.show()

In [None]:
%%time
# Random forest regressor with fixed hyper-parameters.
# max_features=None uses every feature at each split, which is what "auto"
# meant for a RandomForestRegressor; the "auto" alias itself is rejected by
# newer scikit-learn releases.
rf = RandomForestRegressor(
    n_estimators=500,
    random_state=0,
    verbose=1,
    n_jobs=-1,
    max_depth=31,
    max_features=None,
    max_leaf_nodes=187,
    min_impurity_decrease=0.00026892804687183225,
    min_samples_leaf=0.0027584156528699683,
    min_samples_split=21,
    min_weight_fraction_leaf=0.00837,
)
# The forest expects a 1-D target, hence ravel() on the column vector.
rf.fit(x_train, y_train.ravel())
# Make predictions using the testing set
rf_y_pred = rf.predict(x_test)

print("Mean squared error: %.3f" % mean_squared_error(y_test, rf_y_pred))
# r2_score is the coefficient of determination (1 is a perfect prediction),
# not sklearn's explained_variance_score.
print("Coefficient of determination (R^2): %.3f" % r2_score(y_test, rf_y_pred))

In [None]:
%%time
# Import the estimator directly: a bare `import sklearn` does not by itself
# guarantee that the sklearn.ensemble submodule is loaded.
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with fixed hyper-parameters.
# * `loss` is left at its default (squared error); the old spelling 'ls' is
#   rejected by newer scikit-learn releases.
# * max_features=None considers every feature, which is what 'auto' meant
#   for this regressor; the 'auto' alias has likewise been removed.
gbr = GradientBoostingRegressor(
    n_estimators=110,
    criterion='friedman_mse',
    learning_rate=0.10927990420965396,
    max_depth=1,
    max_features=None,
    max_leaf_nodes=156,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    random_state=0,
    subsample=0.8209381840043655,
    verbose=1,
)

# The estimator expects a 1-D target, hence ravel() on the column vector.
gbr.fit(x_train, y_train.ravel())
y_gbr_pred = gbr.predict(x_test)
print("Mean squared error: %.3f" % mean_squared_error(y_test, y_gbr_pred))
# r2_score is the coefficient of determination (1 is a perfect prediction),
# not sklearn's explained_variance_score.
print("Coefficient of determination (R^2): %.3f" % r2_score(y_test, y_gbr_pred))

# Feature importances, largest first; the 'top_first_name_' prefix is
# stripped to leave the bare first name.
gbr_features = pd.DataFrame({'feature': first_names.columns, 'importance': gbr.feature_importances_})
gbr_features['feature'] = gbr_features['feature'].str.replace('top_first_name_', '')
display(gbr_features.sort_values(by="importance", ascending=False))

In [None]:
from time import time
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from scipy.stats import norm

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# Utility function to report best scores
def report(results, n_top=3):
    """Print the best-ranked candidates of a parameter search.

    `results` is indexed by the keys 'rank_test_score', 'mean_test_score',
    'std_test_score' and 'params'. For every rank from 1 to `n_top`, each
    candidate holding that rank is printed as a block of rank, mean score
    with standard deviation, and parameters, followed by an empty line.
    """
    for rank in range(1, n_top + 1):
        # Tied candidates share a rank, so one rank can match several entries.
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            summary = (
                "Model with rank: {0}".format(rank),
                "Mean validation score: {0:.4f} (std: {1:.4f})".format(
                    results['mean_test_score'][idx],
                    results['std_test_score'][idx]),
                "Parameters: {0}".format(results['params'][idx]),
                "",
            )
            print("\n".join(summary))




In [None]:
# Import the estimator directly: a bare `import sklearn` does not by itself
# guarantee that the sklearn.ensemble submodule is loaded.
from sklearn.ensemble import GradientBoostingRegressor

# Base estimator for the search; boosting stops early once 100 consecutive
# iterations fail to improve by more than `tol`.
crf = GradientBoostingRegressor(n_estimators=1000,
                                random_state=0,
                                verbose=1,
                                n_iter_no_change=100, tol=0.000001)

# Candidate pools are drawn from normal distributions. A seeded generator
# makes the pools, and therefore the whole search, repeatable.
param_rng = np.random.RandomState(2019)

param_dist_gbr = {
    "max_leaf_nodes": np.rint(np.abs(norm.rvs(loc=156, scale=10, size=1000, random_state=param_rng))).astype("int"),
    # Rounding can produce 0, which is not a valid tree depth: floor at 1.
    "max_depth": np.maximum(np.rint(np.abs(norm.rvs(loc=1, scale=1, size=1000, random_state=param_rng))), 1).astype("int"),
    # None = all features (what "auto" meant for this regressor; the "auto"
    # alias is rejected by newer scikit-learn releases).
    "max_features": [None, "sqrt"],
    "learning_rate": np.abs(norm.rvs(loc=0.11, scale=0.01, size=1000, random_state=param_rng)),
    # subsample is a fraction of the samples and must not exceed 1.0.
    "subsample": np.minimum(np.abs(norm.rvs(loc=0.83, scale=0.1, size=1000, random_state=param_rng)), 1.0),
}

# NOTE: error_score=0 gives a candidate whose fit fails a score of 0 instead
# of raising, so invalid parameter values must be kept out of the pools above.
n_iter_search = 5
random_search = RandomizedSearchCV(crf, param_distributions=param_dist_gbr, random_state=2019,
                                   n_iter=n_iter_search, cv=3, n_jobs=-1, error_score=0, verbose=1)

start = time()
# The estimator expects a 1-D target, hence ravel() on the column vector.
random_search.fit(x_train, y_train.ravel())
print("RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)
print(random_search.best_estimator_)



In [None]:
import json

# Score the best estimator found by the randomized search on the test set.
crf_y_pred = random_search.predict(x_test)
r2 = r2_score(y_test, crf_y_pred)

print("Mean squared error: %.3f" % mean_squared_error(y_test, crf_y_pred))
# r2_score is the coefficient of determination (1 is a perfect prediction),
# not sklearn's explained_variance_score.
print("Coefficient of determination (R^2): %.3f" % r2)


def _to_jsonable(value):
    """Convert a value json cannot encode: numpy scalars via .item(), anything else via str()."""
    return value.item() if hasattr(value, "item") else str(value)


# Store the winning parameters as real JSON (the file has a .json suffix);
# writing str(dict) would produce a Python repr that JSON parsers reject.
with open(f"data/rf-best_estimator_{r2:.3f}.json", 'w') as outfile:
    json.dump(random_search.best_estimator_.get_params(), outfile, default=_to_jsonable, indent=2)

In [None]:
# plot it as in the example at http://scikit-learn.org/
#plt.scatter(x_test[:,0], y_test,  color='red', alpha=0.01)
#plt.scatter(x_test[:,0], rf_y_pred, color='blue', alpha=0.01)
#plt.ylim(4, 20)
#plt.show()

In [None]:
#svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1)
#y_rbf_pred = svr_rbf.fit(x_train, y_train.ravel()).predict(x_test)
#print("Mean squared error: %.3f"
#      % mean_squared_error(y_test, y_rbf_pred))
# Explained variance score: 1 is perfect prediction
#print('Explained variance score: %.3f' % r2_score(y_test, y_rbf_pred))


In [None]:
# Bayesian ridge regression on the same train/test split.
br = linear_model.BayesianRidge()
# The estimator expects a 1-D target, hence ravel() on the column vector.
br.fit(x_train, y_train.ravel())
y_br_pred = br.predict(x_test)
print("Mean squared error: %.3f" % mean_squared_error(y_test, y_br_pred))
# r2_score is the coefficient of determination (1 is a perfect prediction),
# not sklearn's explained_variance_score.
print("Coefficient of determination (R^2): %.3f" % r2_score(y_test, y_br_pred))




In [None]:
# Robustly fit linear model with RANSAC algorithm.
# RANSAC works on randomly drawn subsets of the data, so it is seeded here
# to make the reported scores repeatable between runs.
ransac = linear_model.RANSACRegressor(random_state=0)
ransac.fit(x_train, y_train.ravel())
y_ransac_pred = ransac.predict(x_test)
print("Mean squared error: %.3f" % mean_squared_error(y_test, y_ransac_pred))
# r2_score is the coefficient of determination (1 is a perfect prediction),
# not sklearn's explained_variance_score.
print("Coefficient of determination (R^2): %.3f" % r2_score(y_test, y_ransac_pred))


In [None]:
# Scratch calculation: base-2 logarithm of 17.
np.log2(17)