In [1]:
import numpy as np

In [2]:
import os

# Transfer learning 

We start with two folders containing positive and negative samples for stadiums, due to the nature of the problem, classes are highly unbalanced. 

In [7]:
stadium_images = os.listdir('stadiums/')

In [8]:
non_stadium_images = os.listdir('non_stadiums/')

In [9]:
len(non_stadium_images)

11972

In [10]:
len(stadium_images)

47

In [11]:
import pandas as pd
stadium_df = pd.DataFrame()
stadium_df['filename'] = stadium_images
stadium_df['is_stadium'] = True

In [12]:
non_stadium_df = pd.DataFrame()
non_stadium_df['filename'] = non_stadium_images
non_stadium_df['is_stadium'] = False

In [13]:
dataset_df = pd.concat([non_stadium_df,stadium_df])

In [14]:
dataset_df = dataset_df.drop(0)

We'll use resnet50 to extract image features from the positives and negative samples. Then use those features as input
to a single layer network with dropout for classification

Let's try out feature extraction for a single image

In [16]:
from keras.applications import resnet50
from keras.applications.resnet50 import preprocess_input
from keras.preprocessing import image


In [19]:
model = resnet50.ResNet50(weights='imagenet', include_top=False)
#model.summary()

img_path = 'stadiums/Estadio Alberto J. Armando.png'
img = image.load_img(img_path, target_size=(256, 256))
img_data = image.img_to_array(img)
img_data = np.expand_dims(img_data, axis=0)
img_data = preprocess_input(img_data)

resnet_features = model.predict(img_data)
print(resnet_features)
print(resnet_features.shape)

[[[[ 0.08236177  0.05985396  0.08081593 ...,  0.16119762  0.81385475
     0.42690596]]]]
(1, 1, 1, 2048)


an image is then represented by an array of 2048 floats taken from the output of the resnet. Let's extract 
features for the rest of the stadium images

In [20]:
image_features = []
for x in stadium_images:
    if x.endswith('.png'):
        img = image.load_img('stadiums/'+x, target_size=(256, 256))
        img_data = image.img_to_array(img)
        img_data = np.expand_dims(img_data, axis=0)
        img_data = preprocess_input(img_data)
        features = model.predict(img_data)
        image_features.append(features.flatten())

In [14]:
stadium_features = image_features

as a first approach to tackle class imbalance we undersample the negative class

In [22]:
import random
non_stadium_features = []
for x in random.sample(non_stadium_images,1000):
    if x.endswith('.jpg'):
        img = image.load_img('non_stadiums//'+x, target_size=(256, 256))
        img_data = image.img_to_array(img)
        img_data = np.expand_dims(img_data, axis=0)
        img_data = preprocess_input(img_data)
        features = model.predict(img_data)
        non_stadium_features.append(features.flatten())

In [16]:
len(non_stadium_features[0])

2048

In [21]:
# simple NN to train with static features

In [75]:
from keras import models
from keras import layers
from keras import optimizers
model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_dim=2048))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer=optimizers.RMSprop(lr=2e-5),
              loss='binary_crossentropy',
              metrics=['acc'])

In [76]:
from sklearn.model_selection import train_test_split


In [77]:
features = np.concatenate((np.array(stadium_features),np.array(non_stadium_features)))
labels = np.concatenate((np.full(len(stadium_features),1),np.zeros(len(non_stadium_features))))


In [78]:
len(stadium_features)

45

In [79]:
len(features)

1043

In [80]:
#val_choices = np.random.choice(np.arange(len(features)),int(len(features)*0.2),replace=False)
#validation_features = features[val_choices]
#validation_labels = labels[val_choices]
train_features,val_features,train_labels,val_labels = train_test_split(features,labels,test_size=0.3)

In [81]:
print(len(val_features))

313


In [82]:
history = model.fit(x=train_features,y=train_labels,
                    epochs=30,
                    batch_size=20,
                    validation_split=0.5)




Train on 365 samples, validate on 365 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [83]:
sum(train_labels)

28.0

In [84]:
sum(val_labels)

17.0

In [85]:
predictions = model.predict_proba(val_features)

In [86]:
results = list(map(lambda x,y: (x,y),predictions,val_labels))

In [87]:
import pandas as pd
results_df = pd.DataFrame.from_records(results)
results_df.columns=['prediction','label']

In [88]:
from sklearn.metrics import roc_auc_score

In [89]:
from sklearn.metrics import confusion_matrix

In [90]:
confusion_matrix(y_true = results_df['label'],y_pred=results_df['prediction']>0.5,labels=[0.0,1.0])

array([[296,   0],
       [  4,  13]])

In [91]:
roc_auc_score(results_df['label'],results_df['prediction'])

0.99761526232114461

In [92]:
from sklearn.metrics import f1_score,auc

In [93]:
from sklearn.metrics import roc_curve

In [94]:
%matplotlib inline
fpr_keras, tpr_keras, thresholds_keras = roc_curve(results_df['label'],results_df['prediction'])

In [95]:
thresholds_keras

array([array([ 0.94690043], dtype=float32),
       array([ 0.35139912], dtype=float32),
       array([ 0.09173584], dtype=float32),
       array([ 0.07516082], dtype=float32),
       array([ 0.04373168], dtype=float32),
       array([ 0.04350336], dtype=float32),
       array([  1.69703731e-06], dtype=float32)], dtype=object)

In [96]:
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
plt.plot(fpr_rf, tpr_rf, label='RF (area = {:.3f})'.format(auc_rf))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC cuddrve')
plt.legend(loc='best')
plt.show()
# Zoom in view of the upper left corner.
plt.figure(2)
plt.xlim(0, 0.2)
plt.ylim(0.8, 1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
plt.plot(fpr_rf, tpr_rf, label='RF (area = {:.3f})'.format(auc_rf))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve (zoomed in at top left)')
plt.legend(loc='best')
plt.show()

TypeError: 'module' object is not callable

In [97]:
model.save('resnet+dense.hdf5')