In [1]:
import sys
import os
import numpy as np
import renders as rs
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from IPython.display import display # Allows the use of display() for DataFrames
from sklearn.externals import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.cross_validation import StratifiedKFold
from sklearn.svm import SVC
import matplotlib.cm as cm
# Show matplotlib plots inline (nicely formatted in the notebook)
%matplotlib inline

# Might need to change the path of the included libraries.
sys.path.append('/Users/senzari/Machine_Learning/stats/src/stats/match_stats.py')
sys.path.append('/Users/senzari/Machine_Learning/stats/src/stats/form_model.py')
sys.path.append('/anaconda/envs/stats/lib/python3.5/site-packages')
sys.path.append('/Users/senzari/Machine_Learning/stats/src/stats/model_libs.py')
sys.path.append('/Users/senzari/Machine_Learning/stats/src/stats/form_data.py')
sys.path.append('/Users/senzari/Machine_Learning/stats')
sys.path.append('/Users/senzari/Machine_Learning/stats/src/stats')
# print(sys.path)
os.chdir('/Users/senzari/Machine_Learning/stats/src')
#print(os.getcwd())

from stats import form_data, match_stats, model_libs, form_model, predict_matches

# Variables
round_number = 27 # for MLS only
target_col = 'points'
ignore_cols = ['match_id', 'team_id', 'team_name', 'opp_id', 'opp_name', 'scheduled', 'games_played', 'round']
sub_cols = ['current_formation', 'avg_goals_against', 'goal_diff', 'win_percentage', 'sos',
           'opp_win_percentage', 'opp_sos', 'current_team_yellow_cards', 'current_team_corner_kicks', 'current_team_first_half_goals', 'current_team_sec_half_goals', 
           'opp_team_yellow_cards', 'opp_team_corner_kicks', 'opp_team_first_half_goals', 'opp_team_sec_half_goals']

all_models = ['log', 'svc', 'gmm', 'knn', 'gnb', 'randomForest']

""" this variable 'testing' should be false if using CSV's and not pulling from the database. """
testing = False

INITIALIZED...


In [None]:
data_csv = 'raw_data.csv'

if testing:
    raw_data = form_data.run_data()
    raw_data.to_csv(data_csv)
    print("Raw Data Saved to CSV")
else:
    #Reading in a CSV adds the first index column
    raw_data = pd.read_csv(data_csv)
    raw_data = raw_data.drop(raw_data.columns[[0]], axis=1)

pd.set_option("display.max_columns", 85)
print('Data Loaded...')
print("Dataset size :: {}".format(raw_data.shape))
display(raw_data.head())

## FORMATTING

In [None]:
# Helper Function - Removes Columns to Ignore and Splits the Target Column
def split_target(data):
    td = model_libs._clone_and_drop(data, ignore_cols)
    (y, X) = model_libs._extract_target(td, target_col)
    return X, y

""" Need to do some formatting of the Data before we run the models"""
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)

rankings_data = raw_data.copy()

""" Setting the RPI Quartiles on the raw data """
'''leagues = ["USA", "ENG", "DEU", "ESP", "FRA"]
teams = form_data.get_teams()              
rankings_data = model_libs.convert_sos_rpi(leagues, rankings_data, teams)
rankings_data = rankings_data.drop(['rpi', 'opp_rpi'], axis=1)
display(rankings_data.head(1))'''

rankings_data["offensive_ranking"] = pd.Series(None, index=rankings_data.index)
rankings_data["opp_defensive_ranking"] = pd.Series(None, index=rankings_data.index)

leagues = model_libs.get_leagues_country_codes()
#leagues = { "epl": 'ENG' }
teams = form_data.get_teams()
league_rounds = model_libs.get_leagues_rounds()
test = False
if test:
    """ Going through each League"""
    for key, value in leagues.iteritems():
        print(key)
        country_code = leagues[key]
        round_num = league_rounds[key]
        #round_num = 6
        teams_in_league = teams[teams["country_code"] == country_code]
        """ Looping through the Rounds """
        for i in range(4, round_num):
            print("ROUND :: {} ".format(i))
            offensive_rankings = form_data.get_rankings(teams_in_league, i, "offensive", False)
            rankings = model_libs.quartile_list(offensive_rankings, True)
            offensive_rankings["offensive_rankings_quartiled"] = rankings
            print("Finished with Offensive Rankings")
            #print(offensive_rankings)

            defensive_rankings = form_data.get_rankings(teams_in_league, i, "defensive", False)
            rankings = model_libs.quartile_list(defensive_rankings, False)
            defensive_rankings["defensive_rankings_quartiled"] = rankings
            print("Finished with Defensive Rankings")
            #print(defensive_rankings)

            """ Loop through each Team in the League for that round and assign an Offensive Rank """
            for key, team in teams_in_league.iterrows():
                
                ''' If the team is the team_id then put in their offensive ranking for that game '''
                offensive_rank = offensive_rankings.loc[offensive_rankings[0] == team['id'], "offensive_rankings_quartiled"]
                idx = rankings_data.loc[(rankings_data["team_id"] == team["id"]) 
                        & (rankings_data["round"] == (i)), "offensive_ranking"].index

                rankings_data.set_value(idx, "offensive_ranking", offensive_rank.values[0])
                ''' If the team is the opp then put in their defensive ranking for that game '''
                defensive_rank = defensive_rankings.loc[defensive_rankings[0] == team['id'], "defensive_rankings_quartiled"]
                opp_idx = rankings_data.loc[(rankings_data["opp_id"] == team["id"]) 
                        & (rankings_data["round"] == (i))].index

                rankings_data.set_value(opp_idx, "opp_defensive_ranking", defensive_rank.values[0])
                
                rankings_data.to_csv('rankings_data.csv')
                
else:
    
    rankings_data = pd.read_csv('rankings_data.csv')
    rankings_data = rankings_data.drop(rankings_data.columns[[0]], axis=1)
    
print('Data Loaded...')
                          
""" Formatting data to convert goals scored to the correct category"""
# Not using points as a target for this version, using goals
rankings_data = rankings_data.drop('goals', 1)

#rankings_data['converted_goals'] = rankings_data.apply(lambda row: model_libs.set_group(row['goals']), axis=1)

rankings_data = rankings_data.drop(ignore_cols + ['current_formation'], 1)
display(rankings_data.head())

In [None]:
rankings_data.to_csv('rankings_fully_formatted.csv')

In [2]:
rankings_data = pd.read_csv('rankings_fully_formatted.csv')
rankings_data = rankings_data.drop(rankings_data.columns[[0]], axis=1)

### RUNNING CLASSIFICATION MODEL

In [3]:
def run_features(data, drop_data, target, models):
    
    new_data = data.drop(drop_data, axis=1)
    
    #display(new_data.head())
    
    (y, X) = model_libs._extract_target(new_data, target)
    
    models = form_model.train_models(round_number, X, y, models)
    
    return models

rankings_data = rankings_data.drop(['rpi', 'opp_rpi'], 1)

#### Running ALL Features 
models_test_1 = run_features(rankings_data, [], 'points', ["knn"])

(rankings_y, rankings_X) = model_libs._extract_target(rankings_data, 'points')

def check_accuracy(model, data_X):
    actual_y = pd.DataFrame(rankings_y.values, columns=['actual'])
    predictions = pd.DataFrame(model.predict(data_X), columns=['predictions'])
    preds = pd.concat([predictions, actual_y], axis=1)
    preds['diff'] = preds.apply(lambda r: model_libs.predictions_diff(r['predictions'], r['actual']), axis=1)
    accuracy = np.divide(preds['diff'].sum(), float(len(preds['diff'])))
    print(accuracy)

for m in models_test_1:
    check_accuracy(m, rankings_X)

-----------------------------------
Training K Neighbors Classifier Model
KNN Score on Training Set :: 0.694280078895
KNN Score on Test Set:: 0.338582677165
Finished K-Means Modeling
0.623028391167


In [None]:
def train_tuned_models(round_num, X, y):
    for i in all_models:
        models = form_model.build_tuned_model(X, y, i)
        
    return models
        
tuned_models = train_tuned_models(round_number, classifier_X, classifier_y)

In [None]:
log_model = form_model.build_tuned_model(rankings_X, rankings_y, 'log')

for m in log_model:
    print(m)
    check_accuracy(m, rankings_X)

In [None]:
random_forest_model = form_model.build_tuned_model(rankings_X, rankings_y, 'randomForest')

for m in random_forest_model:
    print(m)
    check_accuracy(m, rankings_X)

In [None]:
knn_model = form_model.build_tuned_model(rankings_X, rankings_y, 'knn')

for m in knn_model:
    check_accuracy(m, rankings_X)

In [None]:
gnb_model = form_model.build_tuned_model(rankings_X, rankings_y, 'gnb')

for m in gnb_model:
    print(m)
    check_accuracy(m, rankings_X)

In [10]:
svc_model = form_model.build_tuned_model(rankings_X, rankings_y, 'svc')

display(rankings_X.columns)
for m in svc_model:
    print(m)
    check_accuracy(m, rankings_X)

Training and Tuning SVC Model
[ 0.36538462  0.38461538  0.48076923  0.46153846  0.41176471  0.54901961
  0.44        0.36734694  0.51020408  0.55102041]
Accuracy: 0.45 (+/- 0.13)
Finished SVC Modeling


Index([u'is_home', u'goals_for', u'goals_allowed', u'opp_goals_for',
       u'opp_goals_allowed', u'goal_efficiency',
       u'opp_defensive_goal_efficiency', u'ratio_of_attacks',
       u'opp_ratio_of_attacks', u'ratio_ball_safe_to_dangerous_attacks',
       u'opp_ratio_ball_safe_to_dangerous_attacks', u'offensive_ranking',
       u'opp_defensive_ranking'],
      dtype='object')

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.08, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'kernel': ['rbf'], 'C': [1, 10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)
0.572555205047


In [None]:
gmm_model = form_model.build_tuned_model(rankings_X, rankings_y, 'gmm')

for m in gmm_model:
    print(m)
    check_accuracy(m, rankings_X)

Cross Validating the SVC model with the PCA data to help prevent overfitting

In [5]:
prediction_models = form_model.load_models(['knn', 'svc', 'randomForest', 'gnb', 'gmm', 'log'])

Success :: Loaded - knn
Success :: Loaded - svc
Success :: Loaded - randomForest
Success :: Loaded - gnb
Success :: Loaded - gmm
Success :: Loaded - log


In [6]:
print('Upcoming matches')
#upcoming_matches, match_details = predict_matches.get_upcoming_matches()
#upcoming_matches.to_csv('upcoming_matches.csv')
upcoming_matches = pd.read_csv('upcoming_matches.csv')
upcoming_matches = upcoming_matches.drop(upcoming_matches.columns[[0]], axis=1)
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
upcoming_data = predict_matches.predictions(upcoming_matches)
print('Data Loaded and Predicted...')

""" Setting the RPI Quartiles on the raw data """
"""leagues = ["USA", "ENG", "DEU", "ESP", "FRA"]
teams = form_data.get_teams()              
upcoming_data = model_libs.convert_sos_rpi(leagues, upcoming_data, teams)
upcoming_data = upcoming_data.drop(['rpi', 'opp_rpi'], axis=1)"""

upcoming_data["offensive_ranking"] = pd.Series(None, index=rankings_data.index)
upcoming_data["opp_defensive_ranking"] = pd.Series(None, index=rankings_data.index)

leagues = model_libs.get_leagues_country_codes()
teams = form_data.get_teams()
league_rounds = model_libs.get_leagues_rounds()
test = False
if test:
    """ Going through each League"""
    for key, value in leagues.iteritems():
        print(key)
        country_code = leagues[key]
        round_num = league_rounds[key]
        teams_in_league = teams[teams["country_code"] == country_code]
        
        print("ROUND :: {} ".format(round_num))
        offensive_rankings = form_data.get_rankings(teams_in_league, round_num, "offensive", True)
        rankings = model_libs.quartile_list(offensive_rankings, True)
        offensive_rankings["offensive_rankings_quartiled"] = rankings
        #print(offensive_rankings)
        print("Finished with Offensive Rankings")

        defensive_rankings = form_data.get_rankings(teams_in_league, round_num, "defensive", True)
        rankings = model_libs.quartile_list(defensive_rankings, False)
        defensive_rankings["defensive_rankings_quartiled"] = rankings
        #print(defensive_rankings)
        print("Finished with Defensive Rankings")

        """ Loop through each Team in the League for that round and assign an Offensive Rank """
        for key, team in teams_in_league.iterrows():
                
            ''' If the team is the team_id then put in their offensive ranking for that game '''
            offensive_rank = offensive_rankings.loc[offensive_rankings[0] == team['id'], "offensive_rankings_quartiled"]
            idx = upcoming_data.loc[(upcoming_data["team_id"] == team["id"]) 
                    & (upcoming_data["round"] == (round_num)), "offensive_ranking"].index
            upcoming_data.set_value(idx, "offensive_ranking", offensive_rank.values[0])
            
            ''' If the team is the opp then put in their defensive ranking for that game '''
            defensive_rank = defensive_rankings.loc[defensive_rankings[0] == team['id'], "defensive_rankings_quartiled"]
            opp_idx = upcoming_data.loc[(upcoming_data["opp_id"] == team["id"]) 
                    & (upcoming_data["round"] == (round_num))].index

            upcoming_data.set_value(opp_idx, "opp_defensive_ranking", defensive_rank.values[0])
                
        upcoming_data.to_csv('upcoming_formatted_matches.csv')
                
else:
    
    upcoming_data = pd.read_csv('upcoming_formatted_matches.csv')
    upcoming_data = upcoming_data.drop(upcoming_data.columns[[0]], axis=1)



print('Added Rankings to Upcoming Matches')

Upcoming matches
Data Loaded and Predicted...
Added Rankings to Upcoming Matches


In [7]:
""" Need to remove the same columns from the data the same way we did on the raw data """
upcoming_formatted_data = upcoming_data.drop(['goals', 'rpi', 'opp_rpi'], 1)

upcoming_formatted_data = upcoming_formatted_data.drop(ignore_cols + ['current_formation', 'points'], 1)
display(upcoming_formatted_data.columns)
display(upcoming_formatted_data.head())

Index([u'is_home', u'goals_for', u'goals_allowed', u'opp_goals_for',
       u'opp_goals_allowed', u'goal_efficiency',
       u'opp_defensive_goal_efficiency', u'ratio_of_attacks',
       u'opp_ratio_of_attacks', u'ratio_ball_safe_to_dangerous_attacks',
       u'opp_ratio_ball_safe_to_dangerous_attacks', u'offensive_ranking',
       u'opp_defensive_ranking'],
      dtype='object')

Unnamed: 0,is_home,goals_for,goals_allowed,opp_goals_for,opp_goals_allowed,goal_efficiency,opp_defensive_goal_efficiency,ratio_of_attacks,opp_ratio_of_attacks,ratio_ball_safe_to_dangerous_attacks,opp_ratio_ball_safe_to_dangerous_attacks,offensive_ranking,opp_defensive_ranking
0,0,5,4,3,5,0.128205,0.782609,0.541528,0.396947,1.264706,0.97037,1.0,1.0
1,0,1,2,3,3,0.041667,0.896552,0.482993,0.591973,1.05,1.15,0.0,0.66666
2,1,3,5,5,4,0.088235,0.8,0.561934,0.480263,1.15331,0.924012,0.33333,1.0
3,1,6,7,6,4,0.214286,0.888889,0.480263,0.490506,0.924012,1.029316,0.33333,0.33333
4,0,2,6,5,5,0.064516,0.791667,0.422713,0.505119,1.096886,1.042705,0.33333,1.0


In [43]:
import math

# I've only implemented the linear and rbf kernels
def kernel(params, sv, X):
    if params["kernel"] == 'linear':
        return [np.dot(vi, X) for vi in sv]
    elif params["kernel"] == 'rbf':
        return [math.exp(-params['gamma'] * np.vdot((vi - X).T, vi - X)) for vi in sv]

# This replicates clf.decision_function(X)
def decision_function(params, sv, nv, a, b, X):
    # calculate the kernels
    k = kernel(params, sv, X)
    
    # define the start and end index for support vectors for each class
    start = [sum(nv[:i]) for i in range(len(nv))]
    end = [start[i] + nv[i] for i in range(len(nv))]

    # calculate: sum(a_p * k(x_p, x)) between every 2 classes
    c = [ sum(a[ i ][p] * k[p] for p in range(start[j], end[j])) +
          sum(a[j-1][p] * k[p] for p in range(start[i], end[i]))
                for i in range(len(nv)) for j in range(i+1,len(nv))]

    # add the intercept
    return [sum(x) for x in zip(c, b)]

# This replicates clf.predict(X)
def predict(params, sv, nv, a, b, cs, X):
    ''' params = model parameters
        sv = support vectors
        nv = # of support vectors per class
        a  = dual coefficients
        b  = intercepts 
        cs = list of class names
        X  = feature to predict       
    '''
    decision = decision_function(params, sv, nv, a, b, X)
    votes = [(i if decision[p] > 0 else j) for p,(i,j) in enumerate((i,j) 
                                           for i in range(len(cs))
                                           for j in range(i+1,len(cs)))]

    return cs[max(set(votes), key=votes.count)]

In [44]:
from sklearn.svm import SVC
# Create model
#clf = SVC(gamma=0.001, C=100.)

# Fit model using features, X, and labels, Y.
#clf.fit(X, y)

svc = prediction_models[1].best_estimator_

# Get parameters from model
params = svc.get_params()
sv = svc.support_vectors_
nv = svc.n_support_
a  = svc.dual_coef_
b  = svc._intercept_
cs = svc.classes_

'''for vi in sv:
    temp = np.array(vi - upcoming_formatted_data)
    print(temp.shape)
    print(type(temp))
    np.vdot(temp, temp)
    np.dot(temp.T, temp)np.dot(temp.T, temp)'''

#print(params)
#print(sv)
#print(nv)
#print(a)
#print(b)
#print(cs)

# Use the functions to predict
print(predict(params, sv, nv, a, b, cs, upcoming_formatted_data))

# Compare with the builtin predict
print(svc.predict(upcoming_formatted_data))

3.0
[ 0.  0.  3.  1.  0.  0.  0.  3.  1.  0.  3.  3.  3.  3.  0.  0.  3.  3.
  0.  0.  3.  1.  0.  1.  0.  3.  0.  0.  0.  0.  0.  1.  3.  3.  0.  3.
  0.  1.  0.  3.  3.  0.  0.  0.  0.  3.  3.  3.  3.  3.  1.  0.  1.  0.
  3.  3.  0.  3.  0.  0.  0.  3.  1.  3.  3.  3.  1.  3.  0.  0.  0.  3.
  0.  1.  0.  0.  0.  0.  3.  0.  0.  3.  0.  0.  3.  3.  3.  0.  3.  0.
  3.  0.  3.  0.  3.  0.  0.  0.]


In [None]:
""" Models we'll use to predict on upcoming matches """
# pca_svc_model, knn_model, random_forest_model

# This is all the X values
upcoming_formatted_data

svc_preds = prediction_models[1].predict(upcoming_formatted_data)
svc_decsions = prediction_models[1].decision_function(upcoming_formatted_data)
svc_probs = prediction_models[1].predict_proba(upcoming_formatted_data)

print(svc_decsions.shape)
decisions = pd.DataFrame(svc_decsions)
display(decisions.head(1))
probs = pd.DataFrame(svc_probs)
probs = probs.rename(columns={0: "A", 1: "B", 2: "C"})
display(probs.head(1))
#display(decisions)
#print(svc_decsions)

rf_preds = prediction_models[2].predict(upcoming_formatted_data)
print(rf_preds)
knn_preds = prediction_models[0].predict(upcoming_formatted_data)
print(knn_preds)

gmm_preds = prediction_models[3].predict(upcoming_formatted_data)
print(gmm_preds)

gnb_preds = prediction_models[4].predict(upcoming_formatted_data)
print(gnb_preds)

log_preds = prediction_models[5].predict(upcoming_formatted_data)
print(log_preds)


In [None]:
columns = ['team_name', 'opp_name', 'scheduled']
# Remove all columns except the ones above
#upcoming_matches = upcoming_data[columns]
upcoming_matches = upcoming_data

display(probs.head(1))

#random_preds = pd.Series(np.random.randint(3, size=len(upcoming_matches.index)), upcoming_matches.index)
#random_preds[random_preds == 2] = 3

# Add predictions to the end of that DF
results = pd.DataFrame({'KNN': knn_preds, 'RandomForest': rf_preds, 'GNB': gnb_preds, 'GMM': gmm_preds, 'log': log_preds, 'SVC': svc_preds})

upcoming_matches = pd.concat([upcoming_matches, results, decisions, probs], axis = 1)
display(upcoming_matches.head(1))
reordered_matches = pd.DataFrame([])

for rows in upcoming_matches.iterrows():
    for i in upcoming_matches['team_name']:
        if rows[1]['opp_name'] == i:
            reordered_matches = reordered_matches.append(rows[1])
            reordered_matches = reordered_matches.append(upcoming_matches[upcoming_matches['team_name'].isin([i])])

reordered_matches = reordered_matches.drop_duplicates() 
columns = ['scheduled', 'team_name', 'opp_name', 'goals_for', 'goals_allowed', 'opp_goals_for', 'opp_goals_allowed', 
           'goal_efficiency', 'opp_defensive_goal_efficiency', 'ratio_of_attacks', 
           'opp_ratio_of_attacks', 'ratio_ball_safe_to_dangerous_attacks', 'opp_ratio_ball_safe_to_dangerous_attacks', 
           'offensive_ranking', 'opp_defensive_ranking', 'SVC', 0, 1, 2, 'A', 'B', 'C']
reordered_matches = reordered_matches[columns]
reordered_matches.to_csv('predictions_on_upcoming.csv')
print('Prediction CSV saved')

In [None]:
actual_data = pd.read_csv('predictions_on_upcoming_no_rpi_no_weights.csv')
actual_data = actual_data.drop(actual_data.columns[[0]], axis=1)
display(actual_data.head(1))
actual_data['diff1'] = actual_data.apply(lambda r: model_libs.predictions_diff(r['actual'], r['SVC']), axis=1)
accuracy = np.divide(actual_data['diff1'].sum(), float(len(actual_data['diff1'])))
print(accuracy)

In [None]:
abbreviated_data = actual_data.drop(["KNN", "RandomForest", "GNB", "GMM", "random"], axis=1)
print(abbreviated_data.shape)
display(abbreviated_data)