## Just a regression
- Data input/output pipeline
- Figure out how to submit a notebook

## Adding image properties
- Include images in pipeline
- File size as proxy for image quality

## Improve model
- Switch to a more powerful library (e.g. RAPIDS/cudf)

## Neural network image processing
- Cats vs dogs 
- Pytorch basic: image - NN -> score
- Computer vision feature extraction?
- Other?

### Other to-do
- Pre-split training data

### Misc ideas using **real** human brains!
- Portion of pet in picture
- Greenery
- Preferred pet colors/sizes

In [13]:
import pandas as pd # to use data 
import numpy as np
from sklearn.svm import SVR # switched to support vector regression
from sklearn import preprocessing

from sklearn.model_selection import KFold # Use kfolds cross validation to train model

import os # to get file attributes

In [3]:
# Modify the data directory - to upload, use the below (change this before you upload!)
data_dir = r'Q:\My Drive\General\Prize\Kaggle\pawpularity\\'

# data_dir = r'/kaggle/input/petfinder-pawpularity-score/'

df_train = pd.read_csv(data_dir + 'train.csv')
df_test = pd.read_csv(data_dir + 'test.csv')


In [4]:
# What the raw data looks like

df_train.head()


Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72


In [5]:
# Setting this up as a function in case we want to add other attributes


def add_attributes(dataset, dataframe):
    # Adding basic data attributes from images
    dataframe["file_size"] = os.path.getsize(
        f"{data_dir}/{dataset}/{dataframe['Id']}.jpg"
    )
    return dataframe


In [6]:
# Apply function to both train and test datasets

df_train = df_train.apply(lambda x: add_attributes("train", x), axis=1)
df_test = df_test.apply(lambda x: add_attributes("test", x), axis=1)


In [7]:
# Scaling new values

columns = ['file_size']

df_train[columns] = preprocessing.scale(df_train[columns])
df_test[columns] = preprocessing.scale(df_test[columns])


In [8]:
# Scaling Pawpularity to a 0 - 1 "view probability"

df_train['Pawpularity'] = df_train['Pawpularity']/100


In [9]:
# Take a look

df_train.head()


Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity,file_size
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,0.63,-0.364017
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,0.42,-1.002745
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,0.28,-0.200984
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,0.15,-1.17709
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,0.72,0.291086


In [10]:
# Breaking out predictor and outcomes

X = df_train[
    [
        "Subject Focus",
        "Eyes",
        "Face",
        "Near",
        "Action",
        "Accessory",
        "Group",
        "Collage",
        "Human",
        "Occlusion",
        "Info",
        "Blur",
        "file_size",
    ]
]
y = df_train["Pawpularity"]


In [11]:
# Set up the kfolds

k = 5
kf = KFold(n_splits=k, random_state=None)


In [15]:
# Train the SVR model with kfolds cross validation

acc_score = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]

    regr = SVR(kernel="rbf")
    regr.fit(X_train, y_train)
    pred_values = regr.predict(X_test)

    acc = np.sqrt(np.mean((pred_values * 100 - y_test * 100) ** 2.0))
    acc_score.append(acc)

    print(f"Fold accuracy: {acc}")
avg_acc_score = sum(acc_score) / k
print(f"Average accuracy : {avg_acc_score}")


Fold accuracy: 20.918991268485264
Fold accuracy: 21.093043081962836
Fold accuracy: 21.15387970727196
Fold accuracy: 20.3564080728446
Fold accuracy: 20.670875892223584
Average accuracy : 20.838639604557653


## Create the submission .csv

In [14]:
# Predict on the test dataset

pawpularity = regr.predict(df_test[['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
       'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur','file_size']])


In [15]:
# Select the ID

id = df_test.Id.values


In [17]:
# Create the final dataframe

data = {'Id': id, 'Pawpularity': pawpularity*100}
submission = pd.DataFrame(data,columns=['Id','Pawpularity'])

submission.to_csv('submission.csv', index = False)


In [18]:
# Curious?

submission.head()


Unnamed: 0,Id,Pawpularity
0,4128bae22183829d2b5fea10effdb0c3,29.975769
1,43a2262d7738e3d420d453815151079e,32.734469
2,4e429cead1848a298432a0acad014c9d,37.402044
3,80bc3ccafcc51b66303c2c263aa38486,25.4816
4,8f49844c382931444e68dffbe20228f4,21.684851
