In [107]:
import pandas as pd
import numpy as np


# Seed NumPy's global random state before anything else runs.
np.random.seed(123)

# Read the competition's train and test metadata tables from the input folder.
competition_dir = '/kaggle/input/petfinder-pawpularity-score'
train = pd.read_csv(f'{competition_dir}/train.csv')
test = pd.read_csv(f'{competition_dir}/test.csv')

# Add file path
def get_image_file_path(dir, image_id):
    """Build the path of a competition image.

    Args:
        dir: Sub-folder of the competition input directory (the callers in
            this notebook pass 'train' or 'test').
        image_id: Identifier of the image; '.jpg' is appended to it.

    Returns:
        The path string '<competition root>/<dir>/<image_id>.jpg'.
    """
    # `dir` shadows the builtin, but callers pass it by keyword, so the name stays.
    competition_root = '/kaggle/input/petfinder-pawpularity-score'
    return f'{competition_root}/{dir}/{image_id}.jpg'

# Derive each image's path from its Id and store it in a `file_path` column.
# The whole column is assigned at once rather than cell by cell through
# iterrows()/.at, which builds a Series per row and is far slower.
train['file_path'] = [
    get_image_file_path(dir='train', image_id=image_id) for image_id in train['Id']
]
test['file_path'] = [
    get_image_file_path(dir='test', image_id=image_id) for image_id in test['Id']
]

# Show the first rows of both tables to check the new column.
display(test.head())
display(train.head())

In [108]:
"""01 - naive baseline based on global mean"""
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


# Hold out 20% of the training rows to score the constant predictor.
X_train, X_test, y_train, y_test = train_test_split(
    train['Id'], train['Pawpularity'], test_size=0.2, random_state=123
)

# Predict the mean of the training split for every held-out row.
baseline_predictions = np.full(len(y_test), y_train.mean())

# np.sqrt keeps this cell independent of the `math` import, which only happens
# in a later cell; float() keeps `rmse` a plain Python float.
rmse = float(np.sqrt(mean_squared_error(y_test, baseline_predictions)))
print(f"The root mean squared error of the naive model on the validation set is {rmse:.2f}.")

# The submission uses the mean over the full training set, not only the 80% split.
global_mean = train['Pawpularity'].mean()

my_submission = pd.DataFrame({'Id': test.Id, 'Pawpularity': global_mean})
my_submission.to_csv('submission.csv', index=False)

In [80]:
"""02 - traditional baseline HOG features"""

from matplotlib import pyplot as plt
from skimage import color
from skimage.feature import hog
from skimage.transform import resize
import time
from sklearn.linear_model import LinearRegression
import math


# Read every training image, resize it to 128x64 and convert it to grayscale.
# Only the file path is needed, so iterate the column directly instead of
# building a full row Series per image with iterrows().
start = time.perf_counter()  # monotonic clock, intended for measuring intervals
train_gray = [
    color.rgb2gray(resize(plt.imread(file_path), (128, 64)))
    for file_path in train['file_path']
]
end = time.perf_counter()
print('time elapsed: {}'.format(end-start))

# Extract a HOG descriptor and its rendered visualisation for each grayscale image.
# NOTE(review): visualize=True is only needed to fill `hog_images`, which nothing
# in the visible cells reads (the imshow below is commented out) - confirm
# whether it can be dropped.
hog_features = []
hog_images = []
for gray_image in train_gray:
    descriptor, rendering = hog(
        gray_image,
        orientations=8,
        pixels_per_cell=(16, 16),
        cells_per_block=(4, 4),
        block_norm='L2',
        visualize=True,
    )
    hog_features.append(descriptor)
    hog_images.append(rendering)

# plt.imshow(hog_images[0])

# Train and evaluate a linear regression model on the HOG descriptors.
hog_features = np.array(hog_features)
labels = train['Pawpularity'].to_numpy().reshape(-1, 1)

# Append the target as the last column so features and labels are shuffled together.
data_frame = np.hstack((hog_features, labels))
# Shuffle rows with a dedicated seeded generator: the global NumPy state advances
# on every call, so re-running this cell would otherwise give a different split.
np.random.default_rng(123).shuffle(data_frame)

# First `percentage` percent of the shuffled rows train the model, the rest validate it.
percentage = 80
partition = int(len(hog_features) * percentage / 100)
x_train, x_test = data_frame[:partition, :-1], data_frame[partition:, :-1]
y_train, y_test = data_frame[:partition, -1], data_frame[partition:, -1]

linear_regression = LinearRegression()
linear_regression.fit(x_train, y_train)

predictions = linear_regression.predict(x_test)
# The square root of the MSE is reported, i.e. the RMSE.
rmse = math.sqrt(mean_squared_error(y_test, predictions))
print(f"The root mean squared error of the HOG linear regression model on the validation set is {rmse:.2f}")