In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier 
from sklearn import svm
from sklearn.neural_network import MLPClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

import nltk
import xlsxwriter
import openpyxl
import os.path
from os.path import exists


In [2]:
# Read the training table from the sibling data directory and preview it.
filename = "train_augmented_reduced.csv"
df_data = pd.read_csv(f"../data/{filename}")
df_data.head()

Unnamed: 0.1,Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,...,wild boar,window screen,window shade,wing,wire-haired fox terrier,wok,wombat,wood rabbit,wooden spoon,zebra
0,0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,...,1.370405e-07,9.120131e-08,9.62041e-08,9.886659e-09,1.51683e-06,2.543714e-07,6.532888e-09,2.143564e-08,2.199132e-08,1.804311e-08
1,1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,...,7.019173e-06,0.004317565,0.009602808,0.0001617884,7.745354e-05,1.854264e-06,3.699807e-06,0.001004158,0.002429163,0.001647422
2,2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,...,1.531006e-08,1.205947e-07,1.660079e-09,1.358542e-09,6.075466e-09,1.431959e-09,4.368674e-08,1.527701e-08,4.852483e-10,8.205064e-10
3,3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,...,8.449782e-08,7.258372e-06,9.183127e-06,5.377976e-07,0.04001648,8.849357e-07,1.589311e-06,8.007253e-06,2.567087e-06,1.721762e-06
4,4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,...,1.501322e-07,5.324189e-06,2.671657e-07,1.05259e-07,1.369713e-07,4.25742e-09,1.959747e-06,9.865511e-07,8.348145e-09,7.60818e-07


In [3]:
def scale10(n):
    """Bucket a score into ten ordinal classes, each ten points wide.

    Anything up to and including 10 is class 1, (10, 20] is class 2, and so
    on up to (80, 90] as class 9. Every value that is not <= 90 is class 10.
    """
    # Inclusive upper bounds of classes 1..9; the first bound that n does not
    # exceed decides the class.
    for label, upper in enumerate(range(10, 100, 10), start=1):
        if n <= upper:
            return label
    return 10

def scale5(n):
    """Bucket a score into five ordinal classes, each twenty points wide.

    Values <= 20 are class 1, (20, 40] class 2, (40, 60] class 3,
    (60, 80] class 4, and everything else class 5.
    """
    upper_bounds = (20, 40, 60, 80)
    # Pick the first class whose inclusive upper bound covers n; fall back to
    # the top class when none does.
    return next(
        (label for label, upper in enumerate(upper_bounds, start=1) if n <= upper),
        5,
    )

def scale4(n):
    """Bucket a score into four ordinal classes, each twenty-five points wide.

    Values <= 25 are class 1, (25, 50] class 2, (50, 75] class 3, and
    everything else class 4.
    """
    # Guards are checked in ascending order, so each one only needs the
    # inclusive upper bound of its class.
    if n <= 25:
        return 1
    if n <= 50:
        return 2
    if n <= 75:
        return 3
    return 4

def scale3(n):
    """Bucket a score into three ordinal classes.

    Values <= 34 are class 1, (34, 68] class 2, and everything else class 3.
    """
    if n <= 34:
        return 1
    if n <= 68:
        return 2
    return 3

def scale2(n):
    """Split a score into two classes: 1 for values <= 50, otherwise 2."""
    return 1 if n <= 50 else 2

In [4]:
#CREATE COLUMNS WITH DIFFERENT SCALES FOR TESTING MODELS
# Each scale only depends on Pawpularity, so map that single column instead of
# using a row-wise apply(axis=1), which materialises every (very wide) row as
# a Series just to read one field. The resulting values are the same.
for _scale_col, _scale_fn in (
    ("Scale10", scale10),
    ("Scale5", scale5),
    ("Scale4", scale4),
    ("Scale3", scale3),
    ("Scale2", scale2),
):
    df_data[_scale_col] = df_data["Pawpularity"].map(_scale_fn)

### Splitting Data

In [5]:
# Keep the column names as a plain list and display them.
columnsL = df_data.columns.tolist()
columnsL

['Unnamed: 0',
 'Id',
 'Subject Focus',
 'Eyes',
 'Face',
 'Near',
 'Action',
 'Accessory',
 'Group',
 'Collage',
 'Human',
 'Occlusion',
 'Info',
 'Blur',
 'Pawpularity',
 'Afghan hound',
 'African chameleon',
 'African crocodile',
 'African grey',
 'African hunting dog',
 'Airedale',
 'American Staffordshire terrier',
 'American alligator',
 'American black bear',
 'American chameleon',
 'Angora',
 'Appenzeller',
 'Arctic fox',
 'Australian terrier',
 'Bedlington terrier',
 'Bernese mountain dog',
 'Blenheim spaniel',
 'Border collie',
 'Border terrier',
 'Boston bull',
 'Bouvier des Flandres',
 'Brabancon griffon',
 'Brittany spaniel',
 'Cardigan',
 'Chesapeake Bay retriever',
 'Chihuahua',
 'Christmas stocking',
 'Crock Pot',
 'Dandie Dinmont',
 'Doberman',
 'Dungeness crab',
 'Egyptian cat',
 'English foxhound',
 'English setter',
 'English springer',
 'EntleBucher',
 'Eskimo dog',
 'French bulldog',
 'German shepherd',
 'German short-haired pointer',
 'Gordon setter',
 'Granny Sm

In [6]:
# Everything that is not a feature: the image id, the raw target, the derived
# scale targets and the leftover index column.
cols_to_drop = ['Id','Pawpularity', 'Scale10','Scale5','Scale4','Scale3','Scale2', 'Unnamed: 0']
df_data_X = df_data.drop(columns=cols_to_drop)
X_cols = df_data_X.columns.tolist()

In [7]:
#Get X and Y data - shuffle data.
# The permutation is drawn from a seeded generator so that the train/dev/test
# membership produced by the slicing further down is the same on every
# Restart & Run All; an unseeded shuffle makes results impossible to reproduce.
SHUFFLE_SEED = 42

X = np.array(df_data[X_cols])
Y = df_data['Pawpularity'].values[:]

id_image = df_data['Id'].values[:]

Y10 = df_data['Scale10'].values[:]
Y5 = df_data['Scale5'].values[:]
Y4 = df_data['Scale4'].values[:]
Y3 = df_data['Scale3'].values[:]
Y2 = df_data['Scale2'].values[:]

# One shared permutation keeps features, every target variant and the image
# ids aligned row-for-row after shuffling.
shuffle = np.random.default_rng(SHUFFLE_SEED).permutation(X.shape[0])
X, Y, id_image = X[shuffle], Y[shuffle], id_image[shuffle]
Y10, Y5, Y4, Y3, Y2 = Y10[shuffle], Y5[shuffle], Y4[shuffle], Y3[shuffle], Y2[shuffle]

In [8]:
# Fractions of the rows assigned to train and development; the split cell
# gives whatever is left (0.3) to test.
per_train = 0.5
per_dev = 0.2

num_images = len(Y)
train_size = int(round(num_images * per_train))
dev_size = int(round(num_images * per_dev))

In [9]:
# Split data based on defined sizes: the first train_size rows are train, the
# next dev_size rows are dev, and everything after that is test.
_dev_end = train_size + dev_size
_train_rows = slice(None, train_size)
_dev_rows = slice(train_size, _dev_end)
_test_rows = slice(_dev_end, None)

test_data, test_labels, id_test = X[_test_rows], Y[_test_rows], id_image[_test_rows]
test_y10, test_y5, test_y4, test_y3, test_y2 = (
    y[_test_rows] for y in (Y10, Y5, Y4, Y3, Y2)
)

dev_data, dev_labels, id_dev = X[_dev_rows], Y[_dev_rows], id_image[_dev_rows]
dev_y10, dev_y5, dev_y4, dev_y3, dev_y2 = (
    y[_dev_rows] for y in (Y10, Y5, Y4, Y3, Y2)
)

train_data, train_labels, id_train = X[_train_rows], Y[_train_rows], id_image[_train_rows]
train_y10, train_y5, train_y4, train_y3, train_y2 = (
    y[_train_rows] for y in (Y10, Y5, Y4, Y3, Y2)
)

# Report the shapes of every partition as a sanity check.
print(num_images)
print(train_data.shape, train_labels.shape, id_train.shape)
print(dev_data.shape, dev_labels.shape, id_dev.shape)
print(test_data.shape, test_labels.shape, id_test.shape)
for _test_y, _dev_y, _train_y in (
    (test_y10, dev_y10, train_y10),
    (test_y5, dev_y5, train_y5),
    (test_y4, dev_y4, train_y4),
    (test_y3, dev_y3, train_y3),
    (test_y2, dev_y2, train_y2),
):
    print(_test_y.shape, _dev_y.shape, _train_y.shape)

9912
(4956, 472) (4956,) (4956,)
(1982, 472) (1982,) (1982,)
(2974, 472) (2974,) (2974,)
(2974,) (1982,) (4956,)
(2974,) (1982,) (4956,)
(2974,) (1982,) (4956,)
(2974,) (1982,) (4956,)
(2974,) (1982,) (4956,)


In [10]:
def wrt_excel(file, sheet_name, df):
    """Write ``df`` to the sheet ``sheet_name`` of the Excel workbook ``file``.

    If ``file`` already exists it is opened in append mode; otherwise the
    writer is opened in write mode, which creates the workbook.
    """
    mode = 'a' if os.path.exists(file) else 'w'
    with pd.ExcelWriter(file, engine="openpyxl", mode=mode) as writer:
        df.to_excel(writer, sheet_name=sheet_name)