In [45]:
import numpy as np # linear algebra
import pandas as pd # csv I/O

IMG_SIZE = 299
MODEL_NAME = 'xception'

DATA_DIR= r'D:\LICENTA\processed_data\size_{size1}x{size2}'.format(size1=IMG_SIZE, size2=IMG_SIZE)
FEATURES_PATH = DATA_DIR +  r'\train_biz_{name}_features.csv'.format(name=MODEL_NAME)

In [46]:
train_photos = pd.read_csv('train_photo_to_biz_ids.csv')
train_photo_to_biz = pd.read_csv('train_photo_to_biz_ids.csv', index_col='photo_id')

train_df = pd.read_csv(FEATURES_PATH)

X = train_df['feature_vector'].values
Y = train_df['label'].values

def convert_label_to_array(str_label):
    str_label = str_label[1:-1]
    str_label = str_label.split(',')
    return [int(x) for x in str_label if len(x) > 0]

def convert_feature_to_vector(str_feature):
    str_feature = str_feature[1:-1]
    str_feature = str_feature.split(',')
    return [float(x) for x in str_feature]

Y = np.array([convert_label_to_array(y) for y in train_df['label']])
X = np.array([convert_feature_to_vector(x) for x in train_df['feature_vector']])

In [47]:
print("X_train: ", X.shape)
print("y_train: ", Y.shape)
print("train_df:")
train_df[0:5]

X_train:  (1996, 2048)
y_train:  (1996,)
train_df:


Unnamed: 0,business,label,feature_vector
0,1000,"(1, 2, 3, 4, 5, 6, 7)","[0.12353174, 0.056759115, 0.10699466, 0.045236..."
1,1001,"(0, 1, 6, 8)","[0.19606702, 0.030776627, 0.032012787, 0.21252..."
2,100,"(1, 2, 4, 5, 6, 7)","[0.13964602, 0.059296891, 0.10956762, 0.041540..."
3,1006,"(1, 2, 4, 5, 6)","[0.16115737, 0.023083402, 0.10019045, 0.051235..."
4,1010,"(0, 6, 8)","[0.12933584, 0.086438626, 0.062900275, 0.05014..."


In [48]:
from sklearn import svm, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(Y)

random_state = np.random.RandomState(0)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=random_state)

print('X_train: ',X_train.shape)
print('X_test: ', X_test.shape)
print('y_train: ', y_train.shape)
print('y_test: ', y_test.shape)

X_train:  (1596, 2048)
X_test:  (400, 2048)
y_train:  (1596, 9)
y_test:  (400, 9)


In [49]:
%%time
from sklearn.multioutput import MultiOutputClassifier

clf = MultiOutputClassifier(svm.SVC(kernel='rbf', C=2.6), n_jobs=-1)
X_train = preprocessing.scale(X_train)
clf.fit(X_train, y_train)

X_test = preprocessing.scale(X_test)
y_pred = clf.predict(X_test)

Wall time: 33.3 s


In [51]:
from sklearn.metrics import f1_score

print('F1 Score: ', f1_score(y_test, y_pred, average='micro'))
print('F1 Individual Score: ', f1_score(y_test, y_pred, average=None))

F1 Score:  0.854374633001
F1 Individual Score:  [ 0.77205882  0.81632653  0.90076336  0.71611253  0.784689    0.90909091
  0.93900185  0.7699115   0.92402464]


In [39]:
%%time
from sklearn.multioutput import ClassifierChain

X_train = preprocessing.scale(X_train)
chains = [ClassifierChain(svm.SVC(kernel='rbf', C=2.6), order='random', random_state=i)
          for i in range(1)]
for chain in chains:
    chain.fit(X_train, y_train)
    
X_test = preprocessing.scale(X_test)

y_pred_chains = np.array([chain.predict(X_test) for chain in
                          chains], dtype=object)

y_pred = y_pred_chains.mean(axis=0)
y_pred = y_pred.astype(int)

Wall time: 50.5 s


In [None]:
clf.score(X_test, y_test)

In [40]:
from sklearn.metrics import f1_score

print('F1 Score: ', f1_score(y_test, y_pred, average='micro'))
print('F1 Individual Score: ', f1_score(y_test, y_pred, average=None))

F1 Score:  0.81029496819
F1 Individual Score:  [ 0.65942029  0.78712871  0.87128713  0.64321608  0.75454545  0.85887097
  0.93333333  0.71729958  0.88612836]


In [52]:
import pickle

with open(r'D:\LICENTA\models\svm_{size1}x{size2}\svm_{name}.pkl'.format(size1=IMG_SIZE, size2=IMG_SIZE, name=MODEL_NAME), 'wb') as f:
    pickle.dump(clf, f)

In [None]:
# testing with outside photos

from keras.applications.xception import Xception
from keras.preprocessing import image
from keras.applications.xception import preprocess_input
import os

model = Xception(weights='imagenet', include_top=False, pooling='max')

TEST_DIR = r'D:\Retro Test Photos'
        
features = []

for path in list(os.listdir(TEST_DIR)):
    img = image.load_img(os.path.join(TEST_DIR, path), target_size=(IMG_SIZE, IMG_SIZE))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    features.append(model.predict(img_array).reshape(2048,))
    
test = np.mean(np.array(features), axis=0)

test = test.reshape(2048,)
test = preprocessing.scale(test)
test = test.reshape(1, -1)
prediction = clf.predict(test)
print(prediction)