In [1]:
from keras.models import Model, Sequential
from keras.layers import Dense, Dropout
import numpy
import pandas

Using TensorFlow backend.


In [2]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, QuantileTransformer 

#### Prepare data

In [3]:
# Read the training matchups table; the printed tuple is (rows, columns).
TRAIN_CSV = 'kerasfiles/train.matchups.ODstats.csv'
trainmatchups = pandas.read_csv(TRAIN_CSV)
print(trainmatchups.shape)

# Array form of the table, used for positional column slicing.
traindata = trainmatchups.values

(25844, 133)


In [4]:
# `traindata` is the array extracted when the training CSV was loaded, so it
# is not extracted a second time here.
# Column 0 is the target; every remaining column is a feature.
trainY = traindata[:, 0]
trainX = traindata[:, 1:]

print(trainX.shape)
print(trainY.shape)

# Number of features per row; create_model() uses it to size the input layer.
input_shape = trainX.shape[1]

(25844, 132)
(25844,)


#### Create Model

In [5]:
def create_model(dropout=0.2, kernel_initializer='normal', input_dim=None):
    """Build and compile the dense binary classifier.

    The network stacks ReLU Dense layers of 256, 128, 64 and 16 units and
    ends with a single sigmoid unit. A Dropout layer follows each of the
    first three hidden layers; the 16-unit layer has none.

    Args:
        dropout: Rate passed to every Dropout layer.
        kernel_initializer: Initializer used for every Dense layer.
        input_dim: Number of input features. When None (the default) the
            notebook-level ``input_shape`` value is used, which matches the
            previous behaviour of this function.

    Returns:
        A Sequential model compiled with binary cross-entropy loss, the
        Adam optimizer and the accuracy metric.
    """
    if input_dim is None:
        # Fall back to the global computed when the training data was split.
        input_dim = input_shape

    model = Sequential()

    # First hidden layer also declares the input width.
    model.add(Dense(256, kernel_initializer=kernel_initializer, activation='relu', input_shape=(input_dim,)))
    model.add(Dropout(dropout))

    # Remaining hidden layers that are each followed by dropout.
    for units in (128, 64):
        model.add(Dense(units, kernel_initializer=kernel_initializer, activation='relu'))
        model.add(Dropout(dropout))

    # Last hidden layer (no dropout) and the sigmoid output unit.
    model.add(Dense(16, kernel_initializer=kernel_initializer, activation='relu'))
    model.add(Dense(1, kernel_initializer=kernel_initializer, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [6]:
# Instantiate the network with a dropout rate of 0.1 and print its layer table.
model = create_model(dropout=0.1)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               34048     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 16)                1040      
__________

#### Scale features

In [17]:
# Feature scaling with QuantileTransformer.
# Other options that were left disabled in this cell: unscaled input,
# StandardScaler, RobustScaler and PowerTransformer.
# The scaler is fitted on the training features and kept in `scaler` so the
# same mapping can be applied to other data.
scaler = QuantileTransformer().fit(trainX)
trainX_scaled = scaler.transform(trainX)

#### Train Model

In [18]:
# Train without progress output (verbose=0). The call is left as a bare
# expression, so the returned History object is shown as the cell output.
model.fit(
    trainX_scaled,
    trainY,
    epochs=50,
    batch_size=500,
    verbose=0,
)

<keras.callbacks.History at 0x7f252651b7b8>

In [19]:
# Loss and accuracy measured on the same data the model was fitted on.
scores = model.evaluate(trainX_scaled, trainY)
loss_name, acc_name = model.metrics_names[0], model.metrics_names[1]
print("\n%s: %.2f" % (loss_name, scores[0]))
# Accuracy is reported as a percentage.
print("\n%s: %.2f%%" % (acc_name, scores[1] * 100))


loss: 0.32

acc: 86.14%


#### Load test dataset

In [20]:
# Read the test matchups table and print its (rows, columns) shape.
TEST_CSV = 'kerasfiles/test.matchups.ODstats.csv'
testmatchups = pandas.read_csv(TEST_CSV)
print(testmatchups.shape)

(11390, 135)


In [21]:
# Array form of the test table; the bare last line echoes its shape as the
# cell output.
testdata = testmatchups.values
testdata.shape

(11390, 135)

In [22]:
# `testdata` is the array taken from the test table in the preceding cell, so
# it is not extracted a second time here.
# Column layout: 0 = ID, 1 = played flag, 2 = label, 3 onwards = features.
testID = testdata[:, 0]                 # ID for submission
testT = testdata[:, 1].astype(int)      # played matchups during tourneys
testY = testdata[:, 2].astype(int)      # truth (0 or 1) if played, 0 otherwise
testX = testdata[:, 3:].astype(float)   # features

print(testID.shape)
print(testT.shape)
print(testX.shape)
print(testY.shape)

(11390,)
(11390,)
(11390, 132)
(11390,)


In [23]:
# Scale the test features with the scaler that was fitted on the training
# features (transform only, no re-fit).
testX_scaled = scaler.transform(testX)

#### Predict test

In [24]:
# Predict all matchups with a single inference pass.
# Yprob holds the sigmoid outputs, one column per row of testX_scaled.
# Yhat holds hard 0/1 labels obtained by thresholding at 0.5, which is what
# Sequential.predict_classes does for a single-unit output. Using predict()
# avoids running inference twice and does not rely on the Sequential-only
# predict_classes / predict_proba helpers.
Yprob = model.predict(testX_scaled)
Yhat = (Yprob > 0.5).astype('int32')

In [25]:
# Evaluate only played matchups (rows whose testT flag equals 1).
played = testT == 1
scores = model.evaluate(testX_scaled[played, :], testY[played])
loss_name, acc_name = model.metrics_names[0], model.metrics_names[1]
print("\n%s: %.2f" % (loss_name, scores[0]))
# Accuracy is reported as a percentage.
print("\n%s: %.2f%%" % (acc_name, scores[1] * 100))


loss: 0.63

acc: 62.99%


In [26]:
# Confusion matrix for played matchups: rows are the true labels, columns
# the predicted labels.
played = testT == 1
matrix = confusion_matrix(testY[played], Yhat[played])
matrix

array([[116,  57],
       [ 67,  95]])

#### Create submission file

In [None]:
# Write one predicted probability per test ID.
# ravel() flattens the single-column prediction array whatever its row count,
# replacing the previous hard-coded reshape to 11390 rows.
submission = pandas.DataFrame({'ID': testID, 'Pred': Yprob.ravel()})
submission.to_csv('predictions/Pred_NN_Dense.csv', index=False)