In [1]:
#Relevant imports
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report as report

In [2]:
#Read the data from csv
data = pd.read_csv("preprocessed_datasets.csv")
data.head()

Unnamed: 0,id,PlayerName,DraftAge,Country,country_group,Height,Weight,Position,DraftYear,Overall,...,rs_PIM,rs_PlusMinus,po_GP,po_G,po_A,po_P,po_PIM,sum_7yr_GP,sum_7yr_TOI,GP_greater_than_0
0,9,David Bornhammar,18,Sweden,EURO,73,198,D,1999,192,...,44,0,0,0,0,0,0,0,0,no
1,27,Yared Hagos,18,Sweden,EURO,73,218,C,2001,70,...,24,1,10,4,1,5,4,0,0,no
2,30,Andreas Jamtin,18,Sweden,EURO,72,194,L,2001,157,...,155,2,0,0,0,0,0,0,0,no
3,58,Per Mars,19,Sweden,EURO,75,216,C,2001,87,...,60,-2,5,1,1,2,4,0,0,no
4,67,Daniel Fernholm,19,Sweden,EURO,76,229,D,2002,101,...,12,29,0,0,0,0,0,0,0,no


In [3]:
#Drop unnecessary columns
data = data.drop(['id','PlayerName','Country'], axis=1)
data.head()

Unnamed: 0,DraftAge,country_group,Height,Weight,Position,DraftYear,Overall,CSS_rank,rs_GP,rs_G,...,rs_PIM,rs_PlusMinus,po_GP,po_G,po_A,po_P,po_PIM,sum_7yr_GP,sum_7yr_TOI,GP_greater_than_0
0,18,EURO,73,198,D,1999,192,192,55,9,...,44,0,0,0,0,0,0,0,0,no
1,18,EURO,73,218,C,2001,70,24,43,11,...,24,1,10,4,1,5,4,0,0,no
2,18,EURO,72,194,L,2001,157,36,37,18,...,155,2,0,0,0,0,0,0,0,no
3,19,EURO,75,216,C,2001,87,176,29,6,...,60,-2,5,1,1,2,4,0,0,no
4,19,EURO,76,229,D,2002,101,26,12,5,...,12,29,0,0,0,0,0,0,0,no


In [4]:
data.dtypes

DraftAge              int64
country_group        object
Height                int64
Weight                int64
Position             object
DraftYear             int64
Overall               int64
CSS_rank              int64
rs_GP                 int64
rs_G                  int64
rs_A                  int64
rs_P                  int64
rs_PIM                int64
rs_PlusMinus          int64
po_GP                 int64
po_G                  int64
po_A                  int64
po_P                  int64
po_PIM                int64
sum_7yr_GP            int64
sum_7yr_TOI           int64
GP_greater_than_0    object
dtype: object

In [5]:
#Standardize the data
data_temp = data.drop(['country_group','Position','GP_greater_than_0'], axis=1)
scaler = MinMaxScaler()
scaler.fit(data_temp)
data_temp = pd.DataFrame(data=scaler.transform(data_temp), index=data_temp.index, columns=data_temp.columns)
data_temp.head()

Unnamed: 0,DraftAge,Height,Weight,DraftYear,Overall,CSS_rank,rs_GP,rs_G,rs_A,rs_P,rs_PIM,rs_PlusMinus,po_GP,po_G,po_A,po_P,po_PIM,sum_7yr_GP,sum_7yr_TOI
0,0.0,0.529412,0.396396,0.1,0.65411,0.767068,0.55,0.115385,0.054545,0.079787,0.091858,0.496403,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.529412,0.576577,0.3,0.236301,0.092369,0.43,0.141026,0.236364,0.196809,0.050104,0.503597,0.333333,0.181818,0.033333,0.108696,0.033333,0.0,0.0
2,0.0,0.470588,0.36036,0.3,0.534247,0.140562,0.37,0.230769,0.190909,0.207447,0.323591,0.510791,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.071429,0.647059,0.558559,0.3,0.294521,0.702811,0.29,0.076923,0.054545,0.06383,0.125261,0.482014,0.166667,0.045455,0.033333,0.043478,0.033333,0.0,0.0
4,0.071429,0.705882,0.675676,0.4,0.342466,0.100402,0.12,0.064103,0.118182,0.095745,0.025052,0.705036,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
datac_temp = data[['country_group','Position']]
datac_temp = pd.get_dummies(datac_temp, prefix=['CG','Position'])
datac_temp.head()

Unnamed: 0,CG_CAN,CG_EURO,CG_USA,Position_C,Position_D,Position_L,Position_R
0,0,1,0,0,1,0,0
1,0,1,0,1,0,0,0
2,0,1,0,0,0,1,0
3,0,1,0,1,0,0,0
4,0,1,0,0,1,0,0


In [7]:
def label(index):
    if index == 'no':
        return 0
    else:
        return 1

In [8]:
data_label = data['GP_greater_than_0'].apply(label)
data_label.head()

0    0
1    0
2    0
3    0
4    0
Name: GP_greater_than_0, dtype: int64

In [9]:
l1=data_temp.values.tolist()
l2=datac_temp.values.tolist()
for i in range(len(l1)):
    l1[i].extend(l2[i])

df=pd.DataFrame(l1,columns=data_temp.columns.tolist()+datac_temp.columns.tolist())
df.head()

Unnamed: 0,DraftAge,Height,Weight,DraftYear,Overall,CSS_rank,rs_GP,rs_G,rs_A,rs_P,...,po_PIM,sum_7yr_GP,sum_7yr_TOI,CG_CAN,CG_EURO,CG_USA,Position_C,Position_D,Position_L,Position_R
0,0.0,0.529412,0.396396,0.1,0.65411,0.767068,0.55,0.115385,0.054545,0.079787,...,0.0,0.0,0.0,0,1,0,0,1,0,0
1,0.0,0.529412,0.576577,0.3,0.236301,0.092369,0.43,0.141026,0.236364,0.196809,...,0.033333,0.0,0.0,0,1,0,1,0,0,0
2,0.0,0.470588,0.36036,0.3,0.534247,0.140562,0.37,0.230769,0.190909,0.207447,...,0.0,0.0,0.0,0,1,0,0,0,1,0
3,0.071429,0.647059,0.558559,0.3,0.294521,0.702811,0.29,0.076923,0.054545,0.06383,...,0.033333,0.0,0.0,0,1,0,1,0,0,0
4,0.071429,0.705882,0.675676,0.4,0.342466,0.100402,0.12,0.064103,0.118182,0.095745,...,0.0,0.0,0.0,0,1,0,0,1,0,0


In [10]:
df['label'] = data_label
df.head()

Unnamed: 0,DraftAge,Height,Weight,DraftYear,Overall,CSS_rank,rs_GP,rs_G,rs_A,rs_P,...,sum_7yr_GP,sum_7yr_TOI,CG_CAN,CG_EURO,CG_USA,Position_C,Position_D,Position_L,Position_R,label
0,0.0,0.529412,0.396396,0.1,0.65411,0.767068,0.55,0.115385,0.054545,0.079787,...,0.0,0.0,0,1,0,0,1,0,0,0
1,0.0,0.529412,0.576577,0.3,0.236301,0.092369,0.43,0.141026,0.236364,0.196809,...,0.0,0.0,0,1,0,1,0,0,0,0
2,0.0,0.470588,0.36036,0.3,0.534247,0.140562,0.37,0.230769,0.190909,0.207447,...,0.0,0.0,0,1,0,0,0,1,0,0
3,0.071429,0.647059,0.558559,0.3,0.294521,0.702811,0.29,0.076923,0.054545,0.06383,...,0.0,0.0,0,1,0,1,0,0,0,0
4,0.071429,0.705882,0.675676,0.4,0.342466,0.100402,0.12,0.064103,0.118182,0.095745,...,0.0,0.0,0,1,0,0,1,0,0,0


In [11]:
#Split the data into training and testing dataset
x_data = df.drop(['label'], axis=1)
y_data = df['label']
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3)

In [12]:
#Create feature columns for the estimator
feat_cols = []
for col in df.columns:
    if(col != 'label'):
        col = tf.feature_column.numeric_column(col)
        feat_cols.append(col)

In [13]:
#Create input function for training the model
train_input_func = tf.estimator.inputs.pandas_input_fn(x=x_train, y=y_train, batch_size=10, num_epochs=None, shuffle=True)
#Create input function for testing the model
test_input_func = tf.estimator.inputs.pandas_input_fn(x=x_test, y=y_test, batch_size=10, num_epochs=1, shuffle=False)

In [14]:
#Declare the model
model = tf.estimator.LinearClassifier(feature_columns=feat_cols, n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x108496dd0>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/var/folders/js/vq_cts194qscfxyy3tzq_lg80000gn/T/tmphvVBKO', '_global_id_in_cluster': 0, '_save_summary_steps': 100}


In [15]:
#Train the model
model.train(input_fn=train_input_func, steps=50000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/js/vq_cts194qscfxyy3tzq_lg80000gn/T/tmphvVBKO/model.ckpt.
INFO:tensorflow:loss = 6.931472, step = 1
INFO:tensorflow:global_step/sec: 203.546
INFO:tensorflow:loss = 5.1384387, step = 101 (0.495 sec)
INFO:tensorflow:global_step/sec: 246.092
INFO:tensorflow:loss = 3.9531603, step = 201 (0.405 sec)
INFO:tensorflow:global_step/sec: 243.766
INFO:tensorflow:loss = 3.2517834, step = 301 (0.410 sec)
INFO:tensorflow:global_step/sec: 260.784
INFO:tensorflow:loss = 3.5686543, step = 401 (0.386 sec)
INFO:tensorflow:global_step/sec: 267.185
INFO:tensorflow:loss = 2.9518707, step = 501 (0.373 sec)
INFO:tensorflow:global_step/sec: 278.023
INFO:tensorflow:loss = 3.6096687, step = 601 (0.360 sec)
INFO:tensorflow:

INFO:tensorflow:loss = 2.2632427, step = 8001 (0.351 sec)
INFO:tensorflow:global_step/sec: 280.786
INFO:tensorflow:loss = 2.11336, step = 8101 (0.357 sec)
INFO:tensorflow:global_step/sec: 281.199
INFO:tensorflow:loss = 2.32204, step = 8201 (0.355 sec)
INFO:tensorflow:global_step/sec: 277.375
INFO:tensorflow:loss = 1.7054172, step = 8301 (0.361 sec)
INFO:tensorflow:global_step/sec: 270.795
INFO:tensorflow:loss = 3.4270554, step = 8401 (0.368 sec)
INFO:tensorflow:global_step/sec: 287.136
INFO:tensorflow:loss = 3.7613347, step = 8501 (0.350 sec)
INFO:tensorflow:global_step/sec: 277.819
INFO:tensorflow:loss = 4.600082, step = 8601 (0.359 sec)
INFO:tensorflow:global_step/sec: 275.125
INFO:tensorflow:loss = 2.0828853, step = 8701 (0.364 sec)
INFO:tensorflow:global_step/sec: 285.929
INFO:tensorflow:loss = 1.7328603, step = 8801 (0.351 sec)
INFO:tensorflow:global_step/sec: 283.022
INFO:tensorflow:loss = 3.4274726, step = 8901 (0.351 sec)
INFO:tensorflow:global_step/sec: 280.613
INFO:tensorflow

INFO:tensorflow:global_step/sec: 212.785
INFO:tensorflow:loss = 1.6767844, step = 16301 (0.470 sec)
INFO:tensorflow:global_step/sec: 203.919
INFO:tensorflow:loss = 1.4962233, step = 16401 (0.491 sec)
INFO:tensorflow:global_step/sec: 197.336
INFO:tensorflow:loss = 1.4404689, step = 16501 (0.506 sec)
INFO:tensorflow:global_step/sec: 202.633
INFO:tensorflow:loss = 1.175769, step = 16601 (0.493 sec)
INFO:tensorflow:global_step/sec: 217.09
INFO:tensorflow:loss = 0.8036569, step = 16701 (0.465 sec)
INFO:tensorflow:global_step/sec: 187.697
INFO:tensorflow:loss = 1.0249836, step = 16801 (0.532 sec)
INFO:tensorflow:global_step/sec: 233.949
INFO:tensorflow:loss = 0.4883688, step = 16901 (0.427 sec)
INFO:tensorflow:global_step/sec: 179.16
INFO:tensorflow:loss = 2.892359, step = 17001 (0.560 sec)
INFO:tensorflow:global_step/sec: 214.177
INFO:tensorflow:loss = 5.205534, step = 17101 (0.463 sec)
INFO:tensorflow:global_step/sec: 242.277
INFO:tensorflow:loss = 1.6586653, step = 17201 (0.412 sec)
INFO:

INFO:tensorflow:loss = 1.5956633, step = 24501 (0.351 sec)
INFO:tensorflow:global_step/sec: 291.53
INFO:tensorflow:loss = 0.7866934, step = 24601 (0.343 sec)
INFO:tensorflow:global_step/sec: 277.352
INFO:tensorflow:loss = 3.5767226, step = 24701 (0.365 sec)
INFO:tensorflow:global_step/sec: 269.502
INFO:tensorflow:loss = 1.204581, step = 24801 (0.369 sec)
INFO:tensorflow:global_step/sec: 279.701
INFO:tensorflow:loss = 1.6075467, step = 24901 (0.357 sec)
INFO:tensorflow:global_step/sec: 278.708
INFO:tensorflow:loss = 1.160523, step = 25001 (0.360 sec)
INFO:tensorflow:global_step/sec: 219.162
INFO:tensorflow:loss = 1.8862835, step = 25101 (0.457 sec)
INFO:tensorflow:global_step/sec: 280.515
INFO:tensorflow:loss = 1.2257597, step = 25201 (0.354 sec)
INFO:tensorflow:global_step/sec: 238.435
INFO:tensorflow:loss = 2.9591067, step = 25301 (0.419 sec)
INFO:tensorflow:global_step/sec: 222.702
INFO:tensorflow:loss = 2.6329117, step = 25401 (0.449 sec)
INFO:tensorflow:global_step/sec: 251.622
INF

INFO:tensorflow:global_step/sec: 263.375
INFO:tensorflow:loss = 3.0957227, step = 32801 (0.382 sec)
INFO:tensorflow:global_step/sec: 271.442
INFO:tensorflow:loss = 2.071766, step = 32901 (0.370 sec)
INFO:tensorflow:global_step/sec: 259.101
INFO:tensorflow:loss = 1.027367, step = 33001 (0.382 sec)
INFO:tensorflow:global_step/sec: 275.276
INFO:tensorflow:loss = 1.0669597, step = 33101 (0.365 sec)
INFO:tensorflow:global_step/sec: 262.087
INFO:tensorflow:loss = 0.7903602, step = 33201 (0.380 sec)
INFO:tensorflow:global_step/sec: 266.287
INFO:tensorflow:loss = 3.1511831, step = 33301 (0.376 sec)
INFO:tensorflow:global_step/sec: 252.358
INFO:tensorflow:loss = 1.9218242, step = 33401 (0.396 sec)
INFO:tensorflow:global_step/sec: 203.533
INFO:tensorflow:loss = 2.9301152, step = 33501 (0.493 sec)
INFO:tensorflow:global_step/sec: 205.743
INFO:tensorflow:loss = 1.1509694, step = 33601 (0.485 sec)
INFO:tensorflow:global_step/sec: 254.171
INFO:tensorflow:loss = 1.8457675, step = 33701 (0.392 sec)
IN

INFO:tensorflow:loss = 2.1218364, step = 41001 (0.359 sec)
INFO:tensorflow:global_step/sec: 280.883
INFO:tensorflow:loss = 2.6311734, step = 41101 (0.360 sec)
INFO:tensorflow:global_step/sec: 276.683
INFO:tensorflow:loss = 0.9028235, step = 41201 (0.361 sec)
INFO:tensorflow:global_step/sec: 293.919
INFO:tensorflow:loss = 0.9736526, step = 41301 (0.340 sec)
INFO:tensorflow:global_step/sec: 277.384
INFO:tensorflow:loss = 0.79450655, step = 41401 (0.356 sec)
INFO:tensorflow:global_step/sec: 267.382
INFO:tensorflow:loss = 1.2251874, step = 41501 (0.374 sec)
INFO:tensorflow:global_step/sec: 267.953
INFO:tensorflow:loss = 1.4333739, step = 41601 (0.375 sec)
INFO:tensorflow:global_step/sec: 279.136
INFO:tensorflow:loss = 2.37776, step = 41701 (0.356 sec)
INFO:tensorflow:global_step/sec: 282.619
INFO:tensorflow:loss = 3.172154, step = 41801 (0.358 sec)
INFO:tensorflow:global_step/sec: 285.636
INFO:tensorflow:loss = 0.8508252, step = 41901 (0.348 sec)
INFO:tensorflow:global_step/sec: 276.421
IN

INFO:tensorflow:global_step/sec: 270.558
INFO:tensorflow:loss = 1.9003106, step = 49301 (0.369 sec)
INFO:tensorflow:global_step/sec: 265.923
INFO:tensorflow:loss = 3.3646786, step = 49401 (0.378 sec)
INFO:tensorflow:global_step/sec: 266.817
INFO:tensorflow:loss = 2.637095, step = 49501 (0.376 sec)
INFO:tensorflow:global_step/sec: 269.939
INFO:tensorflow:loss = 1.0800407, step = 49601 (0.368 sec)
INFO:tensorflow:global_step/sec: 263.131
INFO:tensorflow:loss = 3.685968, step = 49701 (0.382 sec)
INFO:tensorflow:global_step/sec: 238.993
INFO:tensorflow:loss = 1.7144134, step = 49801 (0.418 sec)
INFO:tensorflow:global_step/sec: 261.26
INFO:tensorflow:loss = 0.89663213, step = 49901 (0.380 sec)
INFO:tensorflow:Saving checkpoints for 50000 into /var/folders/js/vq_cts194qscfxyy3tzq_lg80000gn/T/tmphvVBKO/model.ckpt.
INFO:tensorflow:Loss for final step: 0.92086405.


<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x118a01950>

In [16]:
#Evaluate the model on testing dataset
model.evaluate(input_fn=test_input_func)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-05-31-17:43:32
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/js/vq_cts194qscfxyy3tzq_lg80000gn/T/tmphvVBKO/model.ckpt-50000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-05-31-17:43:33
INFO:tensorflow:Saving dict for global step 50000: accuracy = 0.8742515, accuracy_baseline = 0.5778443, auc = 0.9538088, auc_precision_recall = 0.95502615, average_loss = 0.27915654, global_step = 50000, label/mean = 0.42215568, loss = 2.7832322, precision = 1.0, prediction/mean = 0.3912886, recall = 0.70212764


{'accuracy': 0.8742515,
 'accuracy_baseline': 0.5778443,
 'auc': 0.9538088,
 'auc_precision_recall': 0.95502615,
 'average_loss': 0.27915654,
 'global_step': 50000,
 'label/mean': 0.42215568,
 'loss': 2.7832322,
 'precision': 1.0,
 'prediction/mean': 0.3912886,
 'recall': 0.70212764}

In [17]:
#Creating input function for predictions
pred_input_func = tf.estimator.inputs.pandas_input_fn(x=x_test, batch_size=10, num_epochs=1, shuffle=False)

In [18]:
#Making predictions
predictions = list (model.predict(input_fn=pred_input_func))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/js/vq_cts194qscfxyy3tzq_lg80000gn/T/tmphvVBKO/model.ckpt-50000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [19]:
predictions

[{'class_ids': array([1]),
  'classes': array(['1'], dtype=object),
  'logistic': array([0.99999964], dtype=float32),
  'logits': array([14.906405], dtype=float32),
  'probabilities': array([3.3591564e-07, 9.9999964e-01], dtype=float32)},
 {'class_ids': array([1]),
  'classes': array(['1'], dtype=object),
  'logistic': array([1.], dtype=float32),
  'logits': array([24.612413], dtype=float32),
  'probabilities': array([2.046278e-11, 1.000000e+00], dtype=float32)},
 {'class_ids': array([0]),
  'classes': array(['0'], dtype=object),
  'logistic': array([0.06434713], dtype=float32),
  'logits': array([-2.6769524], dtype=float32),
  'probabilities': array([0.93565285, 0.06434712], dtype=float32)},
 {'class_ids': array([1]),
  'classes': array(['1'], dtype=object),
  'logistic': array([1.], dtype=float32),
  'logits': array([20.46001], dtype=float32),
  'probabilities': array([1.3011588e-09, 1.0000000e+00], dtype=float32)},
 {'class_ids': array([1]),
  'classes': array(['1'], dtype=object),


In [20]:
list_predictions = []
for pred in predictions:
    list_predictions.append(pred['class_ids'][0])
#Predictions corresponding to first five records
list_predictions[:5]

[1, 1, 0, 1, 1]

In [21]:
#Classification Evaluation Metrics
print(report(y_test, list_predictions))

             precision    recall  f1-score   support

          0       0.82      1.00      0.90       386
          1       1.00      0.70      0.82       282

avg / total       0.90      0.87      0.87       668

