Importing Libraries

In [1]:
import sys
import os
import numpy as np
import renders as rs
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from IPython.display import display # Allows the use of display() for DataFrames
from sklearn.externals import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFECV
from sklearn.cross_validation import StratifiedKFold
from sklearn.svm import SVC
# Show matplotlib plots inline (nicely formatted in the notebook)
%matplotlib inline
# Make the project importable and run the notebook from its source directory.
# NOTE: these are machine-specific absolute paths; edit PROJECT_ROOT (and the
# conda site-packages entry) to match your own checkout/environment.
PROJECT_ROOT = '/Users/senzari/Machine_Learning/stats'
SRC_DIR = os.path.join(PROJECT_ROOT, 'src')

# sys.path entries must be directories (or zip archives).  The entries that
# used to point at individual .py files were ignored by the import system, so
# only the directory entries are kept, in their original relative order.
for extra_path in ('/anaconda/envs/stats/lib/python3.5/site-packages',
                   PROJECT_ROOT,
                   os.path.join(SRC_DIR, 'stats')):
    if extra_path not in sys.path:  # keeps the cell idempotent when re-run
        sys.path.append(extra_path)

# Relative paths used later (round CSV files, models/<round>/...) resolve
# against this directory.
os.chdir(SRC_DIR)

# Variables
# Round being modelled; used to build the input CSV name, the saved model
# file names and the predictions CSV name.
round_number = 26
# Name of the column that is split off as the prediction target.
target_col = 'points'
# Columns handed to model_libs._clone_and_drop() before modelling
# (presumably identifiers/metadata that must not be used as features).
ignore_cols = ['match_id', 'team_id', 'team_name', 'opp_id', 'opp_name', 'scheduled']
# True  -> rebuild the raw data with form_data.run_data() and save it to CSV.
# False -> read the CSV previously saved for this round.
testing = False

from stats import form_data, match_stats, model_libs, form_model, predict_matches

INITIALIZED...


Initially used only the features already in the database, then ran PCA and tried to enhance the features that had the most influence.

The data essentially has two sources of variance: home vs. away stats, and the extended features of the current team, its opponent, the current team's previous opponents, and the opponent's opponents.

In [2]:
# Per-round CSV that caches the raw match data.
data_csv = 'round{}.csv'.format(round_number)

if not testing:
    # The saved CSV carries the old index as its first column, so reading it
    # back produces one extra leading column; discard it.
    loaded = pd.read_csv(data_csv)
    raw_data = loaded.drop(loaded.columns[[0]], axis=1)
else:
    raw_data = form_data.run_data()
    raw_data.to_csv(data_csv)
    print("Raw Data Saved to CSV")

# Drop the ignored columns, then split the target column from the features.
td = model_libs._clone_and_drop(raw_data, ignore_cols)
y, X = model_libs._extract_target(td, target_col)

pd.set_option("display.max_columns", 85)
print('Data Loaded...')

Data Loaded...


In [None]:
# Sanity check: (rows, columns) of the raw data, before any columns are dropped.
print(raw_data.shape)
#display(raw_data.head())

In [3]:
# Model keys: each one is passed to form_model.build_model() and embedded in
# the file name of the saved model.
models = ['svc', 'gmm', 'knn', 'gnb']

"""No Cross-Validation and No Tuning"""
def train_models(round_num, X, y):
    """Fit every model named in ``models`` and save each one to disk.

    The models are built with ``form_model.build_model`` as-is: no
    cross-validation and no tuning happens here.

    Parameters
    ----------
    round_num : int
        Round the models belong to; selects the ``models/<round_num>``
        directory and is part of each file name.
    X : feature matrix handed to ``form_model.build_model``.
    y : target values handed to ``form_model.build_model``.

    Returns
    -------
    tuple
        ``(svc, gmm, knn, gnb)`` -- the fitted models, in that order.

    Notes
    -----
    Paths are relative to the current working directory.  Existing model
    files for the same round are overwritten.
    """
    model_dir = os.path.join('models', str(round_num))

    # Create the round's directory when it is missing.  The previous check was
    # inverted and tested the absolute path "/models/<round>", so a new
    # round's directory was never created and the dump below failed.
    if not os.path.isdir(model_dir):
        print('Making New Directory for the Round')
        os.makedirs(model_dir)

    fitted = {}
    for name in models:
        # The ".pk1" suffix is kept as-is: load_models() reads the same names.
        model_path = os.path.join(
            model_dir, '{0}_round_{1}.pk1'.format(name, round_num))
        fitted[name] = form_model.build_model(X, y, name)
        joblib.dump(fitted[name], model_path)

    return fitted['svc'], fitted['gmm'], fitted['knn'], fitted['gnb']
        
        
def load_models(round_num):
    """Load the models previously saved for ``round_num``.

    Each file is read from ``models/<round_num>/<key>_round_<round_num>.pk1``
    (relative to the current working directory) and a confirmation line is
    printed per model key.

    Returns the tuple ``(svc, gmm, knn, gnb)``.
    """
    known_keys = ('svc', 'gmm', 'knn', 'gnb')
    loaded = {}

    for key in models:
        model_round = 'models/{0}/{1}_round_{0}.pk1'.format(round_num, key)
        if key in known_keys:
            # joblib.load unpickles the file: only load files you produced.
            loaded[key] = joblib.load(model_round)

        print("Success :: Loaded - " + str(key))

    return loaded['svc'], loaded['gmm'], loaded['knn'], loaded['gnb']


# Toggle: uncomment the first line to retrain and re-save the models for this
# round; as written, the models already saved for the round are loaded.
# svc, gmm, knn, gnb = train_models(round_number, X, y)
svc, gmm, knn, gnb = load_models(round_number)

Training SVC Model
F1 score for training set: 0.708971037196
F1 score for test set: 0.410595682856
Finished SVC Modeling
Training GMM Modeling
Silhouette Score :: 0.163906657639 for 2 Clusters
Silhouette Score :: 0.124887130696 for 3 Clusters
Silhouette Score :: 0.128291780384 for 4 Clusters
Finished GMM Modeling
Training K Neighbors Classifier Model
KNN Score :: 0.683333333333
Finished K-Means Modeling
Training Gaussian NB Model
F1 score for training set: 0.565839547885
F1 score for test set: 0.37443718228
Finished Gaussian NB Modeling


Find all matches

In [4]:
# Fetch the fixtures to predict.  Both return values are passed to
# predict_matches.predictions() in the next cell.
upcoming_matches, match_details = predict_matches.get_upcoming_matches()
print(upcoming_matches)

   match_id           scheduled  home_id  away_id               home_team  \
0       486 2016-09-01 23:00:00       24       33        New York City FC   
1       487 2016-09-03 23:00:00       28       31  Vancouver Whitecaps FC   
2       488 2016-09-03 23:30:00       36       22  New England Revolution   
3       489 2016-09-04 00:30:00       40       25            Chicago Fire   
4       490 2016-09-04 01:00:00       21       30               FC Dallas   
5       491 2016-09-04 02:30:00       26       39               LA Galaxy   

            away_team  
0           DC United  
1        NY Red Bulls  
2     Colorado Rapids  
3  Philadelphia Union  
4    Portland Timbers  
5       Columbus Crew  


In [5]:
# Run every model over the upcoming fixtures and print its predictions.
# predictions() also returns a second value on every call; the one from the
# last model run is what stays bound to `upcoming_data`.
fitted_by_key = {'svc': svc, 'gmm': gmm, 'knn': knn, 'gnb': gnb}
preds_by_key = {}

for i in models:
    if i not in fitted_by_key:
        continue
    preds_by_key[i], upcoming_data = predict_matches.predictions(
        upcoming_matches, match_details, fitted_by_key[i])
    print(preds_by_key[i])

# Names consumed by the CSV-export cell.
svc_preds = preds_by_key['svc']
gmm_preds = preds_by_key['gmm']
knn_preds = preds_by_key['knn']
gnb_preds = preds_by_key['gnb']

[ 1.  3.  1.  0.  3.  3.  1.  0.  1.  0.  0.  3.]
[3 2 3 3 3 2 3 1 3 1 3 3]
[ 3.  0.  1.  3.  0.  0.  0.  0.  1.  0.  0.  0.]
[ 1.  1.  1.  1.  1.  3.  1.  0.  1.  1.  1.  1.]


Adds list of predictions to the upcoming matches and puts them in a CSV

In [6]:
# NOTE: this cell rebinds `upcoming_matches` (originally the fixture list
# returned by predict_matches.get_upcoming_matches()), so re-run that cell
# before re-running the predictions cell.
id_columns = ['team_name', 'opp_name', 'scheduled']
# Remove all columns except the ones above
upcoming_matches = upcoming_data[id_columns]
# Add one prediction column per model (joined on the row index)
results = pd.DataFrame({'SVC': svc_preds, 'GMM': gmm_preds, 'KNN': knn_preds, 'GNB': gnb_preds})
upcoming_matches = upcoming_matches.join(results)

# Put the two sides of each fixture next to each other: every row whose
# opponent also appears as a `team_name` is followed by that opponent's rows.
# The row labels are collected first and the frame is selected once, instead
# of growing a DataFrame row by row with DataFrame.append (quadratic, loses
# the column dtypes, and removed in pandas 2.0).
team_names = upcoming_matches['team_name']
paired_order = []
for row_label, opponent in zip(upcoming_matches.index, upcoming_matches['opp_name']):
    mirror_labels = team_names.index[(team_names == opponent).values]
    if len(mirror_labels) > 0:
        paired_order.append(row_label)
        paired_order.extend(mirror_labels)

columns = ['scheduled', 'team_name', 'opp_name', 'SVC', 'KNN', 'GMM', 'GNB']
# Each fixture is reached from both of its rows, hence the de-duplication.
reordered_matches = upcoming_matches.loc[paired_order].drop_duplicates()[columns]
# upcoming_matches = upcoming_matches[(upcoming_matches['scheduled'] < '2016-08-26')]
reordered_matches.to_csv('predictions_' + str(round_number) + '.csv')
print('Prediction CSV saved')

Tune Models
Tuning doesn't really improve anything yet; I need to ask some questions to get better results. Maybe removing some features will help.

In [None]:
# Build a tuned variant of every model key; the return values are discarded,
# so only what build_tuned_model() prints or saves itself is kept.
for i in models:
    form_model.build_tuned_model(X, y, i)

Trying to remove some useless features

In [None]:
# Compare the table shape before and after form_data.variance_features()
# (presumably a variance-based feature filter -- confirm in form_data).
# NOTE: the name `reduced_data` is reused further down for PCA projections.
print(raw_data.shape)
reduced_data = form_data.variance_features(raw_data)
print(reduced_data.shape)

In [None]:
# Recursive feature elimination with 2-fold stratified cross-validation.
# The estimator gets its own name so the fitted `svc` prediction model from
# the train/load cell is not overwritten by an unfitted one.
rfecv_estimator = SVC(kernel="linear")
rfecv = RFECV(estimator=rfecv_estimator, step=1, cv=StratifiedKFold(y, 2),
              scoring='accuracy')
rfecv.fit(X, y)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score (accuracy)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

# Disabled alternative: chi2-based SelectKBest.  chi2 requires non-negative
# inputs, so the features would need min-max scaling first.  Kept as real
# comments: a trailing string literal would be echoed as the cell's output.
# norm_data = X.copy(deep=True)
# norm_data = norm_data.apply(lambda x: (x - x.min()) / (x.max() - x.min()))
# X_new = SelectKBest(chi2, k=50).fit_transform(norm_data, y)
# print(X.shape)

In [None]:
from sklearn.feature_selection import RFE

# Rank every feature with recursive feature elimination.  With
# n_features_to_select=1 and step=1 the features are removed one at a time,
# so `rfe.ranking_` assigns each feature its own rank (1 = kept the longest).
# The estimator gets its own name so the fitted `svc` prediction model is not
# overwritten by an unfitted one.
rfe_estimator = SVC(kernel="linear", C=1)
rfe = RFE(estimator=rfe_estimator, n_features_to_select=1, step=1)
rfe.fit(X, y)
print(rfe.ranking_)


In [None]:
# Reorder the feature columns from best (rank 1) to worst RFE rank.
# A stable argsort of the ranking replaces the nested O(n^2) scan, the `.ix`
# indexer (removed in pandas 1.0) and the hard-coded 448-row index: the
# result always has exactly the rows of X, so it stays aligned with y.
rank_order = np.argsort(rfe.ranking_, kind='mergesort')
ranked_features = X.iloc[:, rank_order]

display(ranked_features.describe())

Let's try to reduce the number of features to at most 25, although the optimum seems likely to be around 8 features.

In [None]:
# Retrain all models on a growing number of ranked feature columns.
# `range` replaces the Python-2-only `xrange`, which raises NameError on the
# Python 3 interpreter this notebook is set up for.
# NOTE(review): the slice starts at column 1, so column 0 of ranked_features
# (the top-ranked feature) is never included -- confirm this is intended.
# NOTE: every iteration re-saves the round's model files, so the files on disk
# end up holding the models from the last iteration.
for x in range(5, 25):
    print('X :: {}'.format(x))
    svc, gmm, knn, gnb = train_models(round_number, ranked_features.iloc[:, 1:x], y)
    print(' ============================ ')
    

Try PCA on the original training data

In [None]:
# Fit a 5-component PCA on the full feature matrix.
pca = PCA(n_components=5).fit(X)
# Generate PCA results plot
pca_results = rs.pca_results(X, pca)

In [None]:
# Project the features onto the fitted principal components and retrain all
# models on that projection.
# NOTE: train_models() re-saves the model files for this round, so the files
# on disk now hold models fitted on PCA-transformed inputs.
reduced_data = pca.transform(X)
# display(pd.DataFrame(np.round(reduced_data, 4), columns = ['D1', 'D2', 'D3', 'D4', 'D5']))
svc, gmm, knn, gnb = train_models(round_number, reduced_data, y)

In [None]:
# Try PCA on the ranked features

In [None]:
# NOTE(review): `reduced_data` is whatever an earlier cell left behind (the
# PCA projection of the full feature set when the cells run in order), not a
# projection of the ranked features -- this looks like a stale duplicate of
# the previous training call; confirm or remove.
svc, gmm, knn, gnb = train_models(round_number, reduced_data, y)

In [None]:
# Same 5-component PCA, fitted on ranked-feature columns 1..22 only.
ranked_subset = ranked_features.iloc[:, 1:23]  # 23 was just the highest value when testing
pca = PCA(n_components=5).fit(ranked_subset)
# Generate PCA results plot
pca_results = rs.pca_results(ranked_subset, pca)

In [None]:
# Project ranked-feature columns 1..22 onto the fitted principal components
# and retrain all models on that projection.
# NOTE: train_models() re-saves the model files for this round.
reduced_data = pca.transform(ranked_features.iloc[:, 1:23])
# display(pd.DataFrame(np.round(reduced_data, 4), columns = ['D1', 'D2', 'D3', 'D4', 'D5']))
svc, gmm, knn, gnb = train_models(round_number, reduced_data, y)

Let's try removing ties similar to the other project

In [None]:
# Same settings as the configuration cell, restated for this experiment.
target_col = 'points'
ignore_cols = ['match_id', 'team_id', 'team_name', 'opp_id', 'opp_name', 'scheduled']

# Keep only the rows whose `points` value is not 1 (i.e. drop the ties).
not_tied = raw_data['points'] != 1
filtered_data = raw_data[not_tied]

fd = model_libs._clone_and_drop(filtered_data, ignore_cols)
f_y, f_X = model_libs._extract_target(fd, target_col)
print(f_X.shape)

In [None]:
# Retrain on the tie-free data.  NOTE: train_models() re-saves the model
# files for this round.
svc, gmm, knn, gnb = train_models(round_number, f_X, f_y)

In [None]:
# ha = home-away
# Columns kept for every row regardless of venue (includes the `points`
# target and the `is_home` flag used to split the rows).
standard_cols = ['games_played', 'is_home', 'avg_points', 'goals_for', 'goals_against', 'avg_goals', 'margin', 'goal_diff',
                'win_percentage', 'sos', 'opp_avg_points', 'opp_avg_goals', 'opp_margin', 'opp_goal_diff', 'opp_win_percentage',
                'opp_opp_record', 'points']
# 17 each
# home_cols and away_cols are parallel lists: entry k of one is the
# counterpart of entry k of the other (team at home / opponent away versus
# team away / opponent at home).  The merging code relies on this ordering.
home_cols = ['current_team_home_possession', 'current_team_home_attacks', 'current_team_home_dangerous_attacks', 'current_team_home_yellow_card',
            'current_team_home_corner_kicks', 'current_team_home_shots_on_target', 'current_team_home_shots_total', 'current_team_home_ball_safe',
            'current_team_home_played', 'current_opp_away_attacks', 'current_opp_away_dangerous_attacks', 'current_opp_away_yellow_card',
            'current_opp_away_corner_kicks', 'current_opp_away_shots_on_target', 'current_opp_away_shots_total', 'current_opp_away_ball_safe',
            'current_opp_away_played']
away_cols = ['current_team_away_possession', 'current_team_away_attacks', 'current_team_away_dangerous_attacks', 'current_team_away_yellow_card', 
            'current_team_away_corner_kicks', 'current_team_away_shots_on_target', 'current_team_away_shots_total', 'current_team_away_ball_safe', 
            'current_team_away_played', 'current_opp_home_attacks', 'current_opp_home_dangerous_attacks', 'current_opp_home_yellow_card', 'current_opp_home_corner_kicks', 
             'current_opp_home_shots_on_target', 'current_opp_home_shots_total', 'current_opp_home_ball_safe', 'current_opp_home_played']

def rename_column(label):
    """Strip the venue marker from a column name.

    ``'_home_'`` is checked first, then ``'_away_'``; every occurrence of the
    first marker found is replaced by a single underscore, e.g.
    ``'current_team_home_attacks'`` -> ``'current_team_attacks'``.

    A label containing neither marker is returned unchanged (previously this
    case raised ``UnboundLocalError``).
    """
    for marker in ('_home_', '_away_'):
        if marker in label:
            return label.replace(marker, '_')
    return label

# Creates new column list for merged columns: the home column names with the
# venue marker stripped (away_cols is a parallel list, so one list suffices).
merged_cols = [rename_column(col) for col in home_cols]

# Creates a Home and Away Table with Standard and Home/Away Columns.  Will eventually merge together
ha_data = model_libs._clone_and_drop(raw_data, ignore_cols)
is_home = ha_data['is_home']
home_data = ha_data.loc[is_home == 1, standard_cols + home_cols]
away_data = ha_data.loc[is_home == 0, standard_cols + away_cols]

#display(away_data.describe())
#display(home_data.describe())

# Stacks the Home Table on top of the Away Table; the columns a table lacks
# are filled with NaN.  pd.concat is used because DataFrame.append is
# deprecated and was removed in pandas 2.0.
ha_data = pd.concat([home_data, away_data])

def pick_column(home, away):
    """Return whichever of the two values is present.

    ``away`` is returned when ``home`` is NaN, otherwise ``home``.  When both
    are NaN the result is NaN.  When neither is NaN ``home`` wins (previously
    the function fell through and silently returned ``None``).
    """
    return away if np.isnan(home) else home

# Combine Home/Away Columns into one for Current Team and Opponent.  Cuts down on Features
# Each row has values in only one column of a home/away pair: take the home
# value where it is present and the away value otherwise.  This is done per
# column pair with a vectorised select instead of a Python call per row.
for merged_name, home_name, away_name in zip(merged_cols, home_cols, away_cols):
    home_values = ha_data[home_name]
    ha_data[merged_name] = np.where(home_values.isnull(), ha_data[away_name], home_values)

# The venue-specific columns are now redundant.
ha_data = ha_data.drop(home_cols + away_cols, axis=1)

display(ha_data.describe())

print(ha_data.shape)

In [None]:
# NOTE: rebinds the notebook-level X / y to the merged home/away table;
# earlier cells re-run after this point will see these values.
(y, X) = model_libs._extract_target(ha_data, target_col)
print(X.shape)

In [None]:
# Retrain on the current X / y.  NOTE: train_models() re-saves the model
# files for this round.
svc, gmm, knn, gnb = train_models(round_number, X, y)

Now testing table with standard and extended features

In [None]:
# ef = extended features
# Start from the cleaned table, then remove every venue-specific (home/away)
# column; the shape is printed before and after for comparison.
ef_all = model_libs._clone_and_drop(raw_data, ignore_cols)
print(ef_all.shape)
venue_cols = home_cols + away_cols
ef_data = ef_all.drop(venue_cols, axis=1)
print(ef_data.shape)

In [None]:
# NOTE: rebinds the notebook-level X / y to the extended-feature table.
(y, X) = model_libs._extract_target(ef_data, target_col)
# The third model is bound to `knn`, the name every other training cell uses
# (it was bound to `kmeans` here, which left `knn` holding a stale model).
svc, gmm, knn, gnb = train_models(round_number, X, y)