In [1]:
import numpy as np # linear algebra
import pandas as pd # csv I/O

IMG_SIZE = 299
MODEL_NAME = 'xception'

DATA_DIR= r'D:\LICENTA\processed_data\size_{size1}x{size2}'.format(size1=IMG_SIZE, size2=IMG_SIZE)
FEATURES_PATH = DATA_DIR +  r'\train_biz_{name}_features.csv'.format(name=MODEL_NAME)

In [2]:
train_photos = pd.read_csv('train_photo_to_biz_ids.csv')
train_photo_to_biz = pd.read_csv('train_photo_to_biz_ids.csv', index_col='photo_id')

train_df = pd.read_csv(FEATURES_PATH)

X = train_df['feature_vector'].values
Y = train_df['label'].values

def convert_label_to_array(str_label):
    str_label = str_label[1:-1]
    str_label = str_label.split(',')
    return [int(x) for x in str_label if len(x) > 0]

def convert_feature_to_vector(str_feature):
    str_feature = str_feature[1:-1]
    str_feature = str_feature.split(',')
    return [float(x) for x in str_feature]

Y = np.array([convert_label_to_array(y) for y in train_df['label']])
X = np.array([convert_feature_to_vector(x) for x in train_df['feature_vector']])

In [3]:
print("X_train: ", X.shape)
print("y_train: ", Y.shape)
print("train_df:")
train_df[0:5]

X_train:  (1996, 2048)
y_train:  (1996,)
train_df:


Unnamed: 0,business,label,feature_vector
0,1000,"(1, 2, 3, 4, 5, 6, 7)","[0.98409426, 0.5011915, 0.80012208, 0.39773199..."
1,1001,"(0, 1, 6, 8)","[1.1223717, 0.39256608, 0.36077923, 1.2178262,..."
2,100,"(1, 2, 4, 5, 6, 7)","[1.0060079, 0.57109088, 0.84190941, 0.32770807..."
3,1006,"(1, 2, 4, 5, 6)","[1.1555356, 0.39558065, 0.63720721, 0.28982833..."
4,1010,"(0, 6, 8)","[0.98949295, 0.6631335, 0.60745454, 0.50947839..."


In [4]:
from sklearn import svm, preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.grid_search import GridSearchCV


mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(Y)

random_state = np.random.RandomState(0)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=random_state)

print('X_train: ',X_train.shape)
print('X_test: ', X_test.shape)
print('y_train: ', y_train.shape)
print('y_test: ', y_test.shape)

X_train:  (1596, 2048)
X_test:  (400, 2048)
y_train:  (1596, 9)
y_test:  (400, 9)




In [5]:
%%time

clf = OneVsRestClassifier(svm.SVC(kernel='rbf'), n_jobs=-1)
X_train = preprocessing.scale(X_train)
clf.fit(X_train, y_train)

X_test = preprocessing.scale(X_test)
y_pred = clf.predict(X_test)


# parameters = {
#     "estimator__C": [1,2,4,8],
#     "estimator__kernel": ["poly","rbf"],
#     "estimator__degree":[1, 2, 3, 4],
# }

# model_tunning = GridSearchCV(clf, param_grid=parameters,
#                              scoring='f1_micro', n_jobs=-1)
# model_tunning.fit(X, y)


Wall time: 34.9 s


In [6]:
clf.score(X_test, y_test)

0.3175

In [8]:
# print(model_tunning.best_score_)
# print(model_tunning.best_params_)

from sklearn.metrics import f1_score

print('F1 Score: ', f1_score(y_test, y_pred, average='micro'))
print('F1 Individual Score: ', f1_score(y_test, y_pred, average=None))

F1 Score:  0.84980820301
F1 Individual Score:  [ 0.77777778  0.81538462  0.90537084  0.70076726  0.78640777  0.89711934
  0.93333333  0.76651982  0.91803279]


In [9]:
import pickle

with open(r'D:\LICENTA\models\svm_{size1}x{size2}\svm_{name}.pkl'.format(size1=IMG_SIZE, size2=IMG_SIZE, name=MODEL_NAME), 'wb') as f:
    pickle.dump(clf, f)

In [8]:
# testing with outside photos

from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
import os

# takes around 45 seconds
model = VGG16(weights='imagenet', include_top=False, pooling='max')

TEST_DIR = r'D:\Retro Test Photos'
        
features = []

for path in list(os.listdir(TEST_DIR)):
    img = image.load_img(os.path.join(TEST_DIR, path), target_size=(IMG_SIZE, IMG_SIZE))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    features.append(model.predict(img_array).reshape(512,))
    
test = np.mean(np.array(features), axis=0)

test = test.reshape(512,)
test = preprocessing.scale(test)
test = test.reshape(1, -1)
prediction = clf.predict(test)
print(prediction)

Using TensorFlow backend.


[[0 1 1 0 0 1 1 0 0]]


