Importing Libraries

In [1]:
import sys
import os
import numpy as np
import renders as rs
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from IPython.display import display # Allows the use of display() for DataFrames
from sklearn.externals import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFECV
from sklearn.cross_validation import StratifiedKFold
from sklearn.svm import SVC
# Show matplotlib plots inline (nicely formatted in the notebook)
%matplotlib inline
# Project locations.  Everything below is derived from PROJECT_ROOT so the
# notebook can be pointed at another checkout by editing a single line.
PROJECT_ROOT = '/Users/senzari/Machine_Learning/stats'
SRC_DIR = os.path.join(PROJECT_ROOT, 'src')

# sys.path entries have to be directories (or zip archives).  The individual
# ``.py`` file paths that were appended here before are ignored by the import
# system, so only the directory entries are kept, in their original order.
# The membership test keeps re-runs of this cell from piling up duplicates.
for extra_path in ('/anaconda/envs/stats/lib/python3.5/site-packages',
                   PROJECT_ROOT,
                   os.path.join(SRC_DIR, 'stats')):
    if extra_path not in sys.path:
        sys.path.append(extra_path)
# print(sys.path)

# Relative paths used later in the notebook ('round<N>.csv', 'models/...')
# are resolved against this directory.
os.chdir(SRC_DIR)
#print(os.getcwd())

# Variables
# Round whose CSV is loaded and whose models are trained / loaded below.
round_number = 26

from stats import form_data, match_stats, model_libs, form_model, predict_matches

INITIALIZED...


In [2]:
# Column roles: `target_col` is handed to model_libs._extract_target and
# `ignore_cols` to model_libs._clone_and_drop (presumably the label column
# and the non-feature columns respectively -- confirm in model_libs).
target_col = 'points'
ignore_cols = [
    'match_id', 'team_id', 'team_name',
    'opp_id', 'opp_name', 'scheduled',
]

# The round's CSV comes back with an extra leading index column; discard it.
training_data = pd.read_csv('round' + str(round_number) + '.csv')
training_data = training_data.drop(training_data.columns[0], axis=1)

# Split the remaining frame into target (y) and features (X).
td = model_libs._clone_and_drop(training_data, ignore_cols)
y, X = model_libs._extract_target(td, target_col)
print('Data Loaded...')

Data Loaded...


In [None]:
# Short names of the estimators used throughout the notebook.  Each name is
# passed to form_model.build_model / build_tuned_model and is also embedded
# in the file name of the persisted model.
models = ['svc', 'gmm', 'kmeans', 'gnb']

def train_models(round_num):
    """Build every model named in ``models`` and persist each one to disk.

    Each estimator comes from ``form_model.build_model(X, y, name)`` using the
    notebook-level ``X`` and ``y``, and is dumped with joblib to
    ``models/<round_num>/<name>_round_<round_num>.pk1`` relative to the
    current working directory.

    Parameters
    ----------
    round_num : int or str
        Round identifier used for the output directory and the file names.

    Returns
    -------
    tuple
        ``(svc, gmm, kmeans, gnb)`` -- the built models, in that order.
    """
    round_dir = 'models/' + str(round_num)

    # Create the round's directory only when it is missing.  The previous
    # check was inverted and looked at the absolute path '/models/<round>',
    # so the directory was never created on a first run and the dumps below
    # failed.  Using the same relative path as the dump targets also removes
    # the need to chdir through hard-coded absolute locations.
    if not os.path.isdir(round_dir):
        print('Making New Directory for the Round')
        os.makedirs(round_dir)

    built = {}
    for name in models:
        model_round = 'models/' + str(round_num) + '/' + str(name) + '_round_' + str(round_num) + '.pk1'

        # Every model type is built and saved the same way; only the name
        # handed to build_model differs.
        built[name] = form_model.build_model(X, y, name)
        joblib.dump(built[name], model_round)

    return built['svc'], built['gmm'], built['kmeans'], built['gnb']
        
        
def load_models(round_num):
    """Load the persisted models for ``round_num`` from disk.

    For every name in ``models`` the file
    ``models/<round_num>/<name>_round_<round_num>.pk1`` is read with
    ``joblib.load`` and a confirmation line is printed.

    Returns
    -------
    tuple
        ``(svc, gmm, kmeans, gnb)`` -- the loaded models, in that order.
    """
    known_names = ('svc', 'gmm', 'kmeans', 'gnb')
    loaded = {}

    for name in models:
        model_round = 'models/' + str(round_num) + '/' + str(name) + '_round_' + str(round_num) + '.pk1'

        if name in known_names:
            loaded[name] = joblib.load(model_round)

        print("Success :: Loaded - " + str(name))

    return loaded['svc'], loaded['gmm'], loaded['kmeans'], loaded['gnb']

# Uncomment the next line (and comment out the load below) to rebuild and
# re-save the models for this round instead of loading the saved ones.
# svc, gmm, kmeans, gnb = train_models(round_number)
svc, gmm, kmeans, gnb = load_models(round_number)

Find all matches

In [None]:
# Both return values are passed to predict_matches.predictions in the next
# cell.  Presumably these are the not-yet-played fixtures and their
# supporting details -- confirm in predict_matches.
upcoming_matches, match_details = predict_matches.get_upcoming_matches()
print(upcoming_matches)

In [None]:
# Run each fitted model over the upcoming matches, in the order given by
# `models`, printing every model's predictions as they are produced.
fitted_models = {'svc': svc, 'gmm': gmm, 'kmeans': kmeans, 'gnb': gnb}
preds_by_model = {}

for model_name in models:
    if model_name not in fitted_models:
        continue
    # `upcoming_data` is rebound on every pass; the value from the last
    # model is the one the following cell reads.
    preds_by_model[model_name], upcoming_data = predict_matches.predictions(
        upcoming_matches, match_details, fitted_models[model_name])
    print(preds_by_model[model_name])

# Expose the per-model predictions under the names the next cell expects.
svc_preds = preds_by_model['svc']
gmm_preds = preds_by_model['gmm']
kmeans_preds = preds_by_model['kmeans']
gnb_preds = preds_by_model['gnb']

Add each model's predictions to the upcoming matches and write the result to a CSV

In [None]:
columns = ['team_name', 'opp_name', 'scheduled']
# Keep only the columns that identify a fixture.
# NOTE(review): this rebinds `upcoming_matches`, which earlier held the frame
# returned by predict_matches.get_upcoming_matches(); re-running the
# prediction cell after this one will hand it this reduced frame instead.
upcoming_matches = upcoming_data[columns]
# One prediction column per model, aligned with the matches on the row index.
results = pd.DataFrame({'SVC': svc_preds, 'GMM': gmm_preds, 'K-Means': kmeans_preds, 'GNB': gnb_preds})
upcoming_matches = upcoming_matches.join(results)

# Put the two sides of every fixture next to each other: each row whose
# opponent also appears in `team_name` is followed by that opponent's row(s).
# Row positions are collected first and the frame is built in one step, which
# avoids growing a DataFrame row by row (quadratic, loses column dtypes, and
# DataFrame.append no longer exists in pandas 2.x).
team_names = upcoming_matches['team_name']
row_order = []
for position, opponent in enumerate(upcoming_matches['opp_name']):
    opponent_rows = np.flatnonzero((team_names == opponent).values)
    if len(opponent_rows):
        row_order.append(position)
        row_order.extend(opponent_rows.tolist())

# Rows reached more than once (as a team and as somebody's opponent) are
# collapsed, keeping the first occurrence so the pairing order is preserved.
reordered_matches = upcoming_matches.iloc[row_order].drop_duplicates()
columns = ['scheduled', 'team_name', 'opp_name', 'SVC', 'K-Means', 'GMM', 'GNB']
reordered_matches = reordered_matches[columns]
# upcoming_matches = upcoming_matches[(upcoming_matches['scheduled'] < '2016-08-26')]
reordered_matches.to_csv('predictions_' + str(round_number) + '.csv')

Tune Models
This doesn't really do anything yet; I need to ask some questions to improve it. Maybe removing some features will help.

In [None]:
# Call form_model.build_tuned_model once per model name.  The return value is
# discarded, so nothing produced by the tuning is kept in the notebook.
for i in models:
    form_model.build_tuned_model(X, y, i)

Trying to remove some useless features

In [None]:
# Print the shape before and after form_data.variance_features to see how
# much that step changes the data (presumably it drops low-variance columns
# -- confirm in form_data).
print(training_data.shape)
reduced_data = form_data.variance_features(training_data)
print(reduced_data.shape)

In [None]:
# Recursive feature elimination with 2-fold stratified cross-validation.
# The ranking estimator gets its own name so the fitted `svc` returned by
# load_models (used by the prediction cell) is not replaced by an unfitted
# one when this cell runs.
rfecv_estimator = SVC(kernel="linear")
rfecv = RFECV(estimator=rfecv_estimator, step=1, cv=StratifiedKFold(y, 2),
              scoring='accuracy')
rfecv.fit(X, y)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

# Need to normalize the data
# Disabled experiment, kept for reference.  It is written as real comments
# rather than a string literal so the cell does not echo it as output.
# norm_data = X.copy(deep=True)
# norm_data = norm_data.apply(lambda x: (x - x.min()) / (x.max() - x.min()))
# X_new = SelectKBest(chi2, k=50).fit_transform(norm_data, y)
# print(X.shape)

In [4]:
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE

# Rank every feature (column) of X with recursive feature elimination.
# Removing one feature per step until a single one is left gives each
# feature its own rank; rank 1 is the feature that survives to the end.
# The estimator is bound to its own name so the fitted `svc` loaded earlier
# is not replaced by an unfitted one.
rfe_estimator = SVC(kernel="linear", C=1)
rfe = RFE(estimator=rfe_estimator, n_features_to_select=1, step=1)
rfe.fit(X, y)
print(rfe.ranking_)


[61  2 14 34 31  9 47 42 29  3  1 17 40  8 39 22 24 46 27 68 67 43 64 21 10
 49 28  6  4 20 38 63 57 53 32 25 50 75 76 54 65 44  7 51 23 13 12 30 36 62
 60 35 33 45 37  5 56 26 66 55 41 11 59 48 74 69 16 15 71 58 70 77 19 18 73
 52 72]


In [10]:
print(type(rfe.ranking_))
columns = ['rankings']
# One row per entry of rfe.ranking_, in a single 'rankings' column.
temp = pd.DataFrame(rfe.ranking_, columns=columns)
ranked_data = training_data.copy()
# Joined on the row index, so 'rankings' becomes the last column.
# NOTE(review): rfe.ranking_ holds one value per *feature* of X, whereas the
# rows of training_data are matches, so this join pairs feature ranks with
# the first rows of the data and leaves NaN in the rest.  Confirm whether the
# ranks should instead be attached to the feature names (X.columns).
ranked_data = ranked_data.join(temp)
# set_index expects a column label; the integer 84 (the position of the new
# column) is not a label in this frame and raises KeyError.
ranked_data = ranked_data.set_index('rankings')
#print(ranked_data.shape)


<type 'numpy.ndarray'>
    rankings
0         61
1          2
2         14
3         34
4         31
5          9
6         47
7         42
8         29
9          3
10         1
11        17
12        40
13         8
14        39
15        22
16        24
17        46
18        27
19        68
20        67
21        43
22        64
23        21
24        10
25        49
26        28
27         6
28         4
29        20
..       ...
47        30
48        36
49        62
50        60
51        35
52        33
53        45
54        37
55         5
56        56
57        26
58        66
59        55
60        41
61        11
62        59
63        48
64        74
65        69
66        16
67        15
68        71
69        58
70        70
71        77
72        19
73        18
74        73
75        52
76        72

[77 rows x 1 columns]
(448, 84)
(448, 85)


Let's try to reduce the number of features to at most 25, although the optimum seems likely to be around 8 features.

In [None]:
# VarianceThreshold is not imported by the import cell at the top of the
# notebook, so it is brought into scope here; without this the cell fails
# with NameError on a fresh kernel.
from sklearn.feature_selection import VarianceThreshold

# Drop every feature whose variance is below 0.8 * (1 - 0.8) = 0.16.
# NOTE(review): this is the variance of a Bernoulli variable with p = 0.8,
# i.e. a cutoff designed for boolean features -- confirm it is meaningful for
# the columns of X.  This also rebinds `reduced_data` from the earlier
# form_data.variance_features cell.
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
reduced_data = sel.fit_transform(X)