In [1]:
#prepare low-quality sample list and delete them from training dataset
#准备低质量的样本清单并进行剔除

import pandas as pd

# Load the record files produced during data exploration.
train_dimensions = pd.read_csv("train_dimension.csv")
train_labels_df = pd.read_csv("train_label.csv").reindex(columns=['train_file', 'predict'])
train_labels = train_labels_df.values.tolist()

# Seed the low-quality list with images whose width is below 100, followed by
# images whose height is below 100 (a file failing both checks is listed twice).
too_narrow = train_dimensions.loc[train_dimensions['width'] < 100, 'train_file']
too_short = train_dimensions.loc[train_dimensions['height'] < 100, 'train_file']
sample_list_low_quality = too_narrow.tolist() + too_short.tolist()
#identify samples with ambiguous labelling
#识别标签模糊的图片样本
def label_train_file(train_file_name):
    """Derive the class label from a training file name.

    Returns 'cat' when the substring 'cat' occurs anywhere in
    ``train_file_name``; every other name is labelled 'dog'.
    """
    return 'cat' if 'cat' in train_file_name else 'dog'

def label_predict(predict):
    """Map a predicted class name (Chinese text) to 'dog', 'cat' or 'tbd'.

    Args:
        predict: The predicted class name. Anything that is not a string
            (for example the NaN pandas produces for a blank CSV cell) is
            treated as having no usable prediction.

    Returns:
        'dog' if the text contains any of the dog keywords below, otherwise
        'cat' if it contains '猫', otherwise 'tbd'. Dog keywords are checked
        first, so text containing both kinds of keyword is labelled 'dog'.
    """
    # Characters and breed names that identify the prediction as a dog.
    dog_keywords = (
        '犬', '狗', '㹴', '梗', '獒', '拉布拉多', '可卡', '柯基', '雪纳瑞',
        '大丹', '喜乐蒂', '博美', '大白熊', '马尔济斯', '京巴', '大麦町',
        '比格', '日本狆', '萨摩耶', '中华狼青', '藏狮', '巴西非勒',
        '奥斯卡贵宾', '郊狼', '雪贵宾',
    )

    # Non-string values have no substring search; report them as undecided
    # instead of raising AttributeError in the middle of the scan.
    if not isinstance(predict, str):
        return 'tbd'

    if any(keyword in predict for keyword in dog_keywords):
        return 'dog'
    if '猫' in predict:
        return 'cat'
    return 'tbd'

# Flag every file whose name-derived label differs from the label derived
# from its predicted class name (a 'tbd' prediction therefore always flags).
for train_file_name, predict in train_labels:
    if label_train_file(train_file_name) != label_predict(predict):
        sample_list_low_quality.append(train_file_name)

# A file can be flagged by more than one check; keep each name only once.
sample_list_low_quality = list(set(sample_list_low_quality))
low_quality_count = len(sample_list_low_quality)
print('Low quality sample number: '+str(low_quality_count))

# Load the training image file names and their integer class targets.
# NOTE(review): only 'filenames' and 'target' are read below, yet load_files
# also loads every file's content by default; load_files('train',
# load_content=False) would avoid that - confirm nothing else uses data['data'].
from sklearn.datasets import load_files
data = load_files('train')
import numpy as np
from keras.utils import np_utils
train_files = np.array(data['filenames'])
train_targets = np_utils.to_categorical(np.array(data['target']), 2)

# Exclude low quality samples from the dataset.
# Testing membership against a list scans it for every training file; a set
# gives the same answers with a constant-time lookup.
low_quality_lookup = set(sample_list_low_quality)
train_files_final = []
train_targets_final = []
for train_file, train_target in zip(train_files, train_targets):
    if train_file not in low_quality_lookup:
        train_files_final.append(train_file)
        train_targets_final.append(train_target)
print('final train file number: '+str(len(train_targets_final)))

Low quality sample number: 711


Using TensorFlow backend.


final train file number: 24289


In [1]:
#function to extract features given image path
from keras.preprocessing import image   
import numpy as np
from keras.applications.nasnet import NASNetLarge, preprocess_input

# NASNetLarge with ImageNet weights and no classification top; max pooling is
# requested on the output, and inputs are declared as 331 x 331 x 3.
pretrained_model = NASNetLarge(
    include_top=False,
    weights='imagenet',
    pooling='max',
    input_shape=(331, 331, 3),
)

#convert image to tensors
def path_to_tensor(img_path):
    """Load an image resized to 331x331 and return it with a leading axis of size 1.

    The image at ``img_path`` is converted to an array and a new first axis is
    inserted in front of the existing ones.
    """
    pixels = image.img_to_array(image.load_img(img_path, target_size=(331, 331)))
    return pixels[np.newaxis, ...]

#extract features
def extract_features(train_file):
    """Return the pretrained model's output for one image file as a plain list.

    The image is loaded with ``path_to_tensor``, passed through
    ``preprocess_input`` and then ``pretrained_model.predict``; the first
    entry of the prediction is converted to a Python list.
    """
    batch = preprocess_input(path_to_tensor(train_file))
    return pretrained_model.predict(batch)[0].tolist()

Using TensorFlow backend.


In [3]:
# Extract features for every image in the final training set, updating a
# notebook progress bar after each image.
from ipywidgets import IntProgress
from IPython.display import display
import time

progressFull = len(train_files_final)
p = IntProgress()
display(p)

train_features_final = []
for done, train_file in enumerate(train_files_final, 1):
    features = extract_features(train_file)
    train_features_final.append(features)
    # Progress is reported as a percentage; the caption shows "done/total".
    p.value = float(done)/float(progressFull)*100
    p.description = str(done)+'/'+str(progressFull)

IntProgress(value=0)

In [4]:
# Write the extracted training features and the matching targets to CSV files.
import pandas as pd
train_features_final_df = pd.DataFrame(data=train_features_final)
train_features_final_df.to_csv(path_or_buf='train_features_final.csv')
train_targets_final_df = pd.DataFrame(data=train_targets_final)
train_targets_final_df.to_csv(path_or_buf='train_targets_final.csv')


In [2]:
# Extract features for every image listed by load_files('test'), updating a
# notebook progress bar after each image.
from sklearn.datasets import load_files     
data = load_files('test')
import numpy as np
from keras.utils import np_utils
test_files = np.array(data['filenames'])

from ipywidgets import IntProgress
from IPython.display import display
import time

progressFull = len(test_files)
p = IntProgress()
display(p)

test_features = []
for done, test_file in enumerate(test_files, 1):
    features = extract_features(test_file)
    test_features.append(features)
    # Progress is reported as a percentage; the caption shows "done/total".
    p.value = float(done)/float(progressFull)*100
    p.description = str(done)+'/'+str(progressFull)

IntProgress(value=0)

In [3]:
# Write the extracted test features to a CSV file.
# NOTE(review): rows follow the order of test_features and no file names are
# written, so readers must rebuild the same file ordering - confirm downstream.
import pandas as pd
test_features_df = pd.DataFrame(data=test_features)
test_features_df.to_csv(path_or_buf='test_features.csv')