In [1]:
import numpy as np
import pandas as pd
import random
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [2]:
def load_numpy_data(file_path):
    """Load a ``.npy`` file and return all but its first column as a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of a ``.npy`` file containing a 2-D array.

    Returns
    -------
    pandas.DataFrame
        The loaded array with column 0 removed (presumably an id column --
        TODO confirm against the data description).
    """
    # allow_pickle=True is needed to read object arrays on NumPy >= 1.16.3,
    # where the default became False.
    # SECURITY: unpickling can execute arbitrary code -- only load files that
    # come from a trusted source.
    data = np.load(file_path, encoding='latin1', allow_pickle=True)
    # Keep everything except the first column.
    return pd.DataFrame(data[:, 1:])

def load_csv_data(file_path):
    """Read the CSV file at ``file_path`` into a DataFrame with pandas defaults."""
    return pd.read_csv(file_path)

def format_images(image_data):
    """Expand a column of per-row pixel vectors into a flat numeric table.

    Parameters
    ----------
    image_data : numpy.ndarray
        2-D array whose column 0 holds one 1-D pixel array per row.

    Returns
    -------
    pandas.DataFrame
        float64 frame with one row per image and one column per pixel. The
        number of columns is taken from the length of the first image.

    Raises
    ------
    IndexError
        If ``image_data`` has no rows, or an image is shorter than the first one.
    """
    num_cols_img = image_data[0, 0].shape[0]
    num_rows_data = image_data.shape[0]
    result = np.zeros((num_rows_data, num_cols_img))
    for i in range(num_rows_data):
        pixels = image_data[i, 0]
        if pixels.shape[0] < num_cols_img:
            # Explicit guard: without it a length-1 image would silently
            # broadcast across the whole row.
            raise IndexError(
                "image %d has %d values, expected at least %d"
                % (i, pixels.shape[0], num_cols_img))
        # Copy one whole image per iteration (vectorised) instead of looping
        # over every pixel in Python.
        result[i, :] = pixels[:num_cols_img]
    return pd.DataFrame(result)

def create_class_num_maps(categories):
    """Build the two lookup tables between category names and integer ids.

    Each category's id is its position in ``categories``.

    Returns
    -------
    (dict, dict)
        ``num_dict`` mapping id -> category, and ``class_dict`` mapping
        category -> id. If a category repeats, ``class_dict`` keeps its last
        position.
    """
    # Materialise once so both tables are built from a single pass of the input.
    indexed = list(enumerate(categories))
    id_to_name = dict(indexed)
    name_to_id = {name: idx for idx, name in indexed}
    return id_to_name, name_to_id

def convert_y_to_num(y, class_dict):
    """Translate category labels into their numeric ids.

    Parameters
    ----------
    y : array-like with a ``shape`` attribute
        1-D sequence of category labels.
    class_dict : dict
        Mapping from category label to its numeric id.

    Returns
    -------
    pandas.DataFrame
        Single float64 column holding the id of each label, in input order.
    """
    encoded = [class_dict[name] for name in y]
    # Same float64 vector of length y.shape[0] that a zero-filled buffer would give.
    as_floats = np.asarray(encoded, dtype=np.float64).reshape(y.shape[0])
    return pd.DataFrame(as_floats)

# Randomly partitions a dataset and its labels into train and validation sets.
# Example: X_train, y_train, X_valid, y_valid = train_valid_split(train_images.values, train_labels.values, 0.8, 0.2)
def train_valid_split(dataset, labels, train_split, valid_split, seed=None):
    """Shuffle the rows and split them into a training and a validation part.

    Parameters
    ----------
    dataset : array-like
        Samples, one per row along the first axis.
    labels : array-like
        Labels aligned row-for-row with ``dataset``.
    train_split, valid_split : float
        Non-negative fractions of the rows for each part; must sum to 1.
    seed : int, optional
        When given, the shuffle uses a private ``random.Random(seed)`` so the
        split is reproducible. When omitted, the global ``random`` generator
        is used (so ``random.seed(...)`` still controls the result).

    Returns
    -------
    (DataFrame, DataFrame, DataFrame, DataFrame)
        ``X_train, y_train, X_valid, y_valid``. The training part holds
        ``round(train_split * num_rows)`` rows; the validation part holds all
        remaining rows, so every row is used exactly once.

    Raises
    ------
    ValueError
        If the fractions are negative or do not sum to 1, or if ``dataset``
        and ``labels`` have a different number of rows.
    """
    # Tolerant comparison: an exact float `!= 1` check rejects valid fractions
    # whose sum is off by one rounding error.
    if train_split < 0 or valid_split < 0 or not math.isclose(train_split + valid_split, 1.0):
        raise ValueError("invalid size for train_split, valid_split")

    dataset = np.asarray(dataset)
    labels = np.asarray(labels)
    num_rows = dataset.shape[0]
    if labels.shape[0] != num_rows:
        raise ValueError("dataset and labels must have the same number of rows")

    # One shuffled permutation of row indices replaces repeated list.pop(index),
    # which was O(n^2) and needed two lists popped in lock-step.
    rng = random if seed is None else random.Random(seed)
    order = list(range(num_rows))
    rng.shuffle(order)

    # Use an integer row count. Comparing len() against the raw float product
    # over-drew rows whenever split * num_rows was not an exact integer and
    # then failed while drawing from the exhausted pool.
    train_size = int(round(train_split * num_rows))
    train_idx = np.array(order[:train_size], dtype=int)
    valid_idx = np.array(order[train_size:], dtype=int)

    return (pd.DataFrame(dataset[train_idx]), pd.DataFrame(labels[train_idx]),
            pd.DataFrame(dataset[valid_idx]), pd.DataFrame(labels[valid_idx]))
     
    

In [3]:
# Raw training inputs: the image array (load_numpy_data drops its first
# column) and the label CSV, both read from the local 'all/' directory.
train_images_RAW = load_numpy_data('all/train_images.npy')
train_labels_RAW = load_csv_data('all/train_labels.csv')

# Raw test images and the sample submission file, loaded the same way.
test_images_RAW = load_numpy_data('all/test_images.npy')
sample_submission = load_csv_data('all/sample_submission.csv')

In [4]:
# train_images is the final training set: one row per image, one float column
# per pixel (see format_images).
train_images = format_images(train_images_RAW.values)
# Text category of each training image, taken from the 'Category' column.
train_labels_txt = train_labels_RAW['Category']

# test_images is the final testing set, formatted the same way as train_images.
test_images = format_images(test_images_RAW.values)

In [5]:
# Every category name. A name's position in this list becomes its numeric
# class id, so the order must not change.
all_classes = [
    'sink', 'pear', 'moustache', 'nose', 'skateboard',
    'penguin', 'peanut', 'skull', 'panda', 'paintbrush',
    'nail', 'apple', 'rifle', 'mug', 'sailboat',
    'pineapple', 'spoon', 'rabbit', 'shovel', 'rollerskates',
    'screwdriver', 'scorpion', 'rhinoceros', 'pool', 'octagon',
    'pillow', 'parrot', 'squiggle', 'mouth', 'empty',
    'pencil',
]

# num_dict   : key int    -> value String  (class id -> category name)
# class_dict : key String -> value int     (category name -> class id)
num_dict, class_dict = create_class_num_maps(all_classes)

# train_labels is the numeric representation of the training categories
train_labels = convert_y_to_num(train_labels_txt.values, class_dict)

In [6]:
# Seed Python's global `random` generator, which train_valid_split draws from,
# so the 80/20 split (and every score computed from it) is identical on each
# Restart & Run All.
random.seed(0)
X_train, y_train, X_valid, y_valid = train_valid_split(train_images.values, train_labels.values, 0.8, 0.2)

In [7]:
# Fit a 100-tree random forest (max depth 100, fixed random_state) on the
# training split.
clf = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=0)
# y_train is a one-column DataFrame; ravel() flattens it to 1-D for fit().
clf.fit(X_train.values, y_train.values.ravel())
y_valid_pred = clf.predict(X_valid.values)
# Micro-averaged F1 of the predictions on the held-out validation split.
score = f1_score(y_valid, y_valid_pred, average = 'micro')
print('RandomForestClassifier:\n\tf1_score (y_valid): ' + str(score))

RandomForestClassifier:
	f1_score (y_valid): 0.048
