In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive/


In [2]:
!ls "/content/drive/My Drive/Colab Notebooks/Adopt a Buddy/"

 clr_callback.py		 'Pet Adoption (CB) - PC.ipynb'
 Dataset			 'Pet Adoption (EDA).ipynb'
 learningratefinder.py		 'Pet Adoption (LGB) - BC.ipynb'
'NPZ Files'			 'Pet Adoption (LGB) - PC.ipynb'
'Pet Adoption (ANN) - BC.ipynb'   Predictions
'Pet Adoption (ANN) - PC.ipynb'   __pycache__
'Pet Adoption (CB) - BC.ipynb'


## Download Packages

In [3]:
! pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/06/b0/9a6313c78bca92abfacc08a2ad8b27bfe845256f615786ee2b6452ae1978/optuna-2.0.0.tar.gz (226kB)
[K     |████████████████████████████████| 235kB 2.7MB/s 
[?25hCollecting alembic
[?25l  Downloading https://files.pythonhosted.org/packages/60/1e/cabc75a189de0fbb2841d0975243e59bde8b7822bacbb95008ac6fe9ad47/alembic-1.4.2.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.1MB 8.6MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting cliff
[?25l  Downloading https://files.pythonhosted.org/packages/71/06/03b1f92d46546a18eabf33ff7f37ef422c18c93d5a926bf590fee32ebe75/cliff-3.4.0-py3-none-any.whl (76kB)
[K     |████████████████████████████████| 81kB 7.4MB/s 
[?25hCollecting cmaes>=0.5.1
  Downloading https://files.pythonhosted.org/packages/63/88/d5e9b78151dce671d7e78ee4cc8905d8320

## Import Packages

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.decomposition import PCA
import featuretools as ft
import lightgbm as lgb
import optuna



## Set file paths for train and predict datasets

In [10]:
train_dataset = "/content/drive/My Drive/Colab Notebooks/Adopt a Buddy/Dataset/train.csv"
predict_dataset = "/content/drive/My Drive/Colab Notebooks/Adopt a Buddy/Dataset/test.csv"

## Data Preprocessing

### Read train and predict datasets

In [11]:
train_df = pd.read_csv(train_dataset)
predict_df = pd.read_csv(predict_dataset)
print("train_df: {}".format(train_df.shape))
print("predict_df: {}".format(predict_df.shape))

train_df: (18834, 11)
predict_df: (8072, 9)


### Extract target variables into NumPy arrays

In [12]:
train_df['pet_category'] = train_df['pet_category'].apply(lambda x: 3 if x==4 else x)
train_y_pc = np.array([train_df['pet_category'].values]).T
train_df.drop(['breed_category','pet_category'], inplace=True, axis=1)
print("train_y_pc: {}".format(train_y_pc.shape))

train_y_pc: (18834, 1)


In [13]:
temp_df = pd.DataFrame(train_y_pc, columns=['Class'])
temp_df.groupby(['Class']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,Class,count
0,0,88
1,1,7184
2,2,10621
3,3,941


### Combine train and predict dataframes

In [14]:
combined_df = train_df.append(predict_df, sort=False, ignore_index=True)
combined_df.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4


### Feature Engineering

#### Issue Date and Listing Date

In [15]:
combined_df['iss_dt_year'] = combined_df['issue_date'].map(lambda x: pd.to_datetime(x).year)
combined_df['iss_dt_quarter'] = combined_df['issue_date'].map(lambda x: pd.to_datetime(x).quarter)
combined_df['iss_dt_month'] = combined_df['issue_date'].map(lambda x: pd.to_datetime(x).month)
combined_df['iss_dt_week'] = combined_df['issue_date'].map(lambda x: pd.to_datetime(x).week)
combined_df['iss_dt_day_year'] = combined_df['issue_date'].map(lambda x: pd.to_datetime(x).dayofyear)
combined_df['iss_dt_day_month'] = combined_df['issue_date'].map(lambda x: pd.to_datetime(x).day)
combined_df['iss_dt_day_week'] = combined_df['issue_date'].map(lambda x: pd.to_datetime(x).dayofweek)
combined_df['iss_dt_day_weekend'] = np.where(combined_df['iss_dt_day_week'].isin([5,6]),1,0)

In [16]:
combined_df['lst_dt_year'] = combined_df['listing_date'].map(lambda x: pd.to_datetime(x).year)
combined_df['lst_dt_quarter'] = combined_df['listing_date'].map(lambda x: pd.to_datetime(x).quarter)
combined_df['lst_dt_month'] = combined_df['listing_date'].map(lambda x: pd.to_datetime(x).month)
combined_df['lst_dt_week'] = combined_df['listing_date'].map(lambda x: pd.to_datetime(x).week)
combined_df['lst_dt_day_year'] = combined_df['listing_date'].map(lambda x: pd.to_datetime(x).dayofyear)
combined_df['lst_dt_day_month'] = combined_df['listing_date'].map(lambda x: pd.to_datetime(x).day)
combined_df['lst_dt_day_week'] = combined_df['listing_date'].map(lambda x: pd.to_datetime(x).dayofweek)
combined_df['lst_dt_day_weekend'] = np.where(combined_df['lst_dt_day_week'].isin([5,6]),1,0)
combined_df['lst_dt_hour'] = combined_df['listing_date'].map(lambda x: pd.to_datetime(x).hour)
combined_df['lst_dt_minute'] = combined_df['listing_date'].map(lambda x: pd.to_datetime(x).minute)
combined_df['lst_dt_minutes_elapsed'] = combined_df['listing_date'].map(lambda x: (pd.to_datetime(x) - pd.to_datetime(pd.to_datetime(x).date())).seconds / 60.0)

In [17]:
combined_df['dt_diff'] = (pd.to_datetime(combined_df['listing_date']) - pd.to_datetime(combined_df['issue_date']))
combined_df['dt_diff_days'] = combined_df['dt_diff']/np.timedelta64(1,'D')
combined_df['dt_diff_weeks'] = combined_df['dt_diff']/np.timedelta64(1,'W')
combined_df['dt_diff_mnths'] = combined_df['dt_diff']/np.timedelta64(1,'M')
combined_df['dt_diff_yrs'] = combined_df['dt_diff']/np.timedelta64(1,'Y')
combined_df['dt_diff_sec'] = combined_df['dt_diff']/np.timedelta64(1,'s')
combined_df['dt_diff_min'] = combined_df['dt_diff']/np.timedelta64(1,'m')
combined_df['dt_diff_hrs'] = combined_df['dt_diff']/np.timedelta64(1,'h')

In [18]:
combined_df['curr_iss_dt_diff'] = (pd.datetime.now() - pd.to_datetime(combined_df['issue_date']))
combined_df['curr_iss_dt_diff_mnths'] = combined_df['curr_iss_dt_diff']/np.timedelta64(1,'M')
combined_df['curr_iss_dt_diff_yrs'] = combined_df['curr_iss_dt_diff']/np.timedelta64(1,'Y')

combined_df['curr_lst_dt_diff'] = (pd.datetime.now() - pd.to_datetime(combined_df['listing_date']))
combined_df['curr_lst_dt_diff_mnths'] = combined_df['curr_lst_dt_diff']/np.timedelta64(1,'M')
combined_df['curr_lst_dt_diff_yrs'] = combined_df['curr_lst_dt_diff']/np.timedelta64(1,'Y')


The pandas.datetime class is deprecated and will be removed from pandas in a future version. Import from datetime instead.


The pandas.datetime class is deprecated and will be removed from pandas in a future version. Import from datetime instead.



In [19]:
combined_df.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,iss_dt_year,iss_dt_quarter,iss_dt_month,iss_dt_week,iss_dt_day_year,iss_dt_day_month,iss_dt_day_week,iss_dt_day_weekend,lst_dt_year,lst_dt_quarter,lst_dt_month,lst_dt_week,lst_dt_day_year,lst_dt_day_month,lst_dt_day_week,lst_dt_day_weekend,lst_dt_hour,lst_dt_minute,lst_dt_minutes_elapsed,dt_diff,dt_diff_days,dt_diff_weeks,dt_diff_mnths,dt_diff_yrs,dt_diff_sec,dt_diff_min,dt_diff_hrs,curr_iss_dt_diff,curr_iss_dt_diff_mnths,curr_iss_dt_diff_yrs,curr_lst_dt_diff,curr_lst_dt_diff_mnths,curr_lst_dt_diff_yrs
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,2016,3,7,27,192,10,6,1,2016,3,9,38,265,21,2,0,16,25,985.0,73 days 16:25:00,73.684028,10.52629,2.42088,0.20174,6366300.0,106105.0,1768.416667,1504 days 11:35:37.468346,49.429617,4.119135,1430 days 19:10:37.486294,47.008737,3.917395
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,2013,4,11,47,325,21,3,0,2018,4,12,52,361,27,3,0,17,47,1067.0,1862 days 17:47:00,1862.740972,266.105853,61.200139,5.100012,160940820.0,2682347.0,44705.783333,2466 days 11:35:37.468346,81.036015,6.753001,603 days 17:48:37.486294,19.835877,1.65299
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2014,3,9,39,271,28,6,1,2016,4,10,42,293,19,2,0,8,24,504.0,752 days 08:24:00,752.35,107.478571,24.718372,2.059864,65003040.0,1083384.0,18056.4,2155 days 11:35:37.468346,70.818146,5.901512,1403 days 03:11:37.486294,46.099774,3.841648
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,2016,4,12,52,366,31,5,1,2019,1,1,4,25,25,4,0,18,30,1110.0,755 days 18:30:00,755.770833,107.967262,24.830763,2.06923,65298600.0,1088310.0,18138.5,1330 days 11:35:37.468346,43.712867,3.642739,574 days 17:05:37.486294,18.882104,1.573509
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,2017,3,9,39,271,28,3,0,2017,4,11,46,323,19,6,1,9,38,578.0,52 days 09:38:00,52.401389,7.485913,1.721642,0.14347,4527480.0,75458.0,1257.633333,1059 days 11:35:37.468346,34.809194,2.900766,1007 days 01:57:37.486294,33.087552,2.757296


#### Condition

In [20]:
combined_df['condition'] = combined_df['condition'].fillna(value=3)

In [21]:
dummy_val = pd.get_dummies(combined_df['condition'], prefix='condition')
combined_df = pd.concat([combined_df, dummy_val], axis=1)

In [22]:
combined_df['condition_odd_even'] = combined_df['condition'].apply(lambda x: 'EVEN' if x%2==0 else 'ODD')
dummy_val = pd.get_dummies(combined_df['condition_odd_even'], prefix='condition_odd_even')
combined_df = pd.concat([combined_df, dummy_val], axis=1)

combined_df['condition*X1'] = np.cbrt(combined_df['condition'] * combined_df['X1'])
combined_df['condition*X2'] = combined_df['condition'] * combined_df['X2']

In [23]:
combined_df['condition%X1'] = combined_df.apply(lambda row: 0 if row['condition']==0 else row['X1']%row['condition'], axis=1)
combined_df['condition%X2'] = combined_df.apply(lambda row: 0 if row['condition']==0 else row['X2']%row['condition'], axis=1)

#### Pet Id

In [24]:
combined_df['pet_id_num'] = combined_df['pet_id'].apply(lambda x: x.split('_')[1]).astype(int)
combined_df.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,iss_dt_year,iss_dt_quarter,iss_dt_month,iss_dt_week,iss_dt_day_year,iss_dt_day_month,iss_dt_day_week,iss_dt_day_weekend,lst_dt_year,lst_dt_quarter,lst_dt_month,lst_dt_week,lst_dt_day_year,lst_dt_day_month,lst_dt_day_week,lst_dt_day_weekend,lst_dt_hour,lst_dt_minute,lst_dt_minutes_elapsed,dt_diff,dt_diff_days,dt_diff_weeks,dt_diff_mnths,dt_diff_yrs,dt_diff_sec,dt_diff_min,dt_diff_hrs,curr_iss_dt_diff,curr_iss_dt_diff_mnths,curr_iss_dt_diff_yrs,curr_lst_dt_diff,curr_lst_dt_diff_mnths,curr_lst_dt_diff_yrs,condition_0.0,condition_1.0,condition_2.0,condition_3.0,condition_odd_even,condition_odd_even_EVEN,condition_odd_even_ODD,condition*X1,condition*X2,condition%X1,condition%X2,pet_id_num
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,2016,3,7,27,192,10,6,1,2016,3,9,38,265,21,2,0,16,25,985.0,73 days 16:25:00,73.684028,10.52629,2.42088,0.20174,6366300.0,106105.0,1768.416667,1504 days 11:35:37.468346,49.429617,4.119135,1430 days 19:10:37.486294,47.008737,3.917395,0,0,1,0,EVEN,1,0,2.962496,18.0,1.0,1.0,69903
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,2013,4,11,47,325,21,3,0,2018,4,12,52,361,27,3,0,17,47,1067.0,1862 days 17:47:00,1862.740972,266.105853,61.200139,5.100012,160940820.0,2682347.0,44705.783333,2466 days 11:35:37.468346,81.036015,6.753001,603 days 17:48:37.486294,19.835877,1.65299,0,1,0,0,ODD,0,1,2.351335,9.0,0.0,0.0,66892
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,3.0,Brown,0.15,40.9,15,4,2014,3,9,39,271,28,6,1,2016,4,10,42,293,19,2,0,8,24,504.0,752 days 08:24:00,752.35,107.478571,24.718372,2.059864,65003040.0,1083384.0,18056.4,2155 days 11:35:37.468346,70.818146,5.901512,1403 days 03:11:37.486294,46.099774,3.841648,0,0,0,1,ODD,0,1,3.556893,12.0,0.0,1.0,69750
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,2016,4,12,52,366,31,5,1,2019,1,1,4,25,25,4,0,18,30,1110.0,755 days 18:30:00,755.770833,107.967262,24.830763,2.06923,65298600.0,1088310.0,18138.5,1330 days 11:35:37.468346,43.712867,3.642739,574 days 17:05:37.486294,18.882104,1.573509,0,1,0,0,ODD,0,1,0.0,1.0,0.0,0.0,71623
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,2017,3,9,39,271,28,3,0,2017,4,11,46,323,19,6,1,9,38,578.0,52 days 09:38:00,52.401389,7.485913,1.721642,0.14347,4527480.0,75458.0,1257.633333,1059 days 11:35:37.468346,34.809194,2.900766,1007 days 01:57:37.486294,33.087552,2.757296,0,0,1,0,EVEN,1,0,3.301927,8.0,0.0,0.0,57969


In [25]:
combined_df['pet_id_odd'] = combined_df['pet_id_num'].apply(lambda x: 0 if x%2==0 else 1)
combined_df['pet_id_even'] = combined_df['pet_id_num'].apply(lambda x: 1 if x%2==0 else 0)
combined_df['petid%X1'] = combined_df.apply(lambda row: 0 if row['X1']==0 else row['pet_id_num']%row['X1'], axis=1)
combined_df['petid%X2'] = combined_df.apply(lambda row: 0 if row['X2']==0 else row['pet_id_num']%row['X2'], axis=1)
combined_df['petid%condition'] = combined_df.apply(lambda row: 0 if row['condition']==0 else row['pet_id_num']%row['condition'], axis=1)

In [26]:
combined_df['pet_id_0'] = combined_df['pet_id_num'].apply(lambda x: int(str(x)[0]))
combined_df['pet_id_1'] = combined_df['pet_id_num'].apply(lambda x: int(str(x)[1]))
combined_df['pet_id_2'] = combined_df['pet_id_num'].apply(lambda x: int(str(x)[2]))
combined_df['pet_id_3'] = combined_df['pet_id_num'].apply(lambda x: int(str(x)[3]))
combined_df['pet_id_4'] = combined_df['pet_id_num'].apply(lambda x: int(str(x)[4]))

In [27]:
combined_df.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,iss_dt_year,iss_dt_quarter,iss_dt_month,iss_dt_week,iss_dt_day_year,iss_dt_day_month,iss_dt_day_week,iss_dt_day_weekend,lst_dt_year,lst_dt_quarter,lst_dt_month,lst_dt_week,lst_dt_day_year,lst_dt_day_month,lst_dt_day_week,lst_dt_day_weekend,lst_dt_hour,lst_dt_minute,lst_dt_minutes_elapsed,dt_diff,dt_diff_days,dt_diff_weeks,dt_diff_mnths,dt_diff_yrs,dt_diff_sec,dt_diff_min,dt_diff_hrs,curr_iss_dt_diff,curr_iss_dt_diff_mnths,curr_iss_dt_diff_yrs,curr_lst_dt_diff,curr_lst_dt_diff_mnths,curr_lst_dt_diff_yrs,condition_0.0,condition_1.0,condition_2.0,condition_3.0,condition_odd_even,condition_odd_even_EVEN,condition_odd_even_ODD,condition*X1,condition*X2,condition%X1,condition%X2,pet_id_num,pet_id_odd,pet_id_even,petid%X1,petid%X2,petid%condition,pet_id_0,pet_id_1,pet_id_2,pet_id_3,pet_id_4
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,2016,3,7,27,192,10,6,1,2016,3,9,38,265,21,2,0,16,25,985.0,73 days 16:25:00,73.684028,10.52629,2.42088,0.20174,6366300.0,106105.0,1768.416667,1504 days 11:35:37.468346,49.429617,4.119135,1430 days 19:10:37.486294,47.008737,3.917395,0,0,1,0,EVEN,1,0,2.962496,18.0,1.0,1.0,69903,1,0,2,0,1.0,6,9,9,0,3
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,2013,4,11,47,325,21,3,0,2018,4,12,52,361,27,3,0,17,47,1067.0,1862 days 17:47:00,1862.740972,266.105853,61.200139,5.100012,160940820.0,2682347.0,44705.783333,2466 days 11:35:37.468346,81.036015,6.753001,603 days 17:48:37.486294,19.835877,1.65299,0,1,0,0,ODD,0,1,2.351335,9.0,0.0,0.0,66892,0,1,7,4,0.0,6,6,8,9,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,3.0,Brown,0.15,40.9,15,4,2014,3,9,39,271,28,6,1,2016,4,10,42,293,19,2,0,8,24,504.0,752 days 08:24:00,752.35,107.478571,24.718372,2.059864,65003040.0,1083384.0,18056.4,2155 days 11:35:37.468346,70.818146,5.901512,1403 days 03:11:37.486294,46.099774,3.841648,0,0,0,1,ODD,0,1,3.556893,12.0,0.0,1.0,69750,0,1,0,2,0.0,6,9,7,5,0
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,2016,4,12,52,366,31,5,1,2019,1,1,4,25,25,4,0,18,30,1110.0,755 days 18:30:00,755.770833,107.967262,24.830763,2.06923,65298600.0,1088310.0,18138.5,1330 days 11:35:37.468346,43.712867,3.642739,574 days 17:05:37.486294,18.882104,1.573509,0,1,0,0,ODD,0,1,0.0,1.0,0.0,0.0,71623,1,0,0,0,0.0,7,1,6,2,3
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,2017,3,9,39,271,28,3,0,2017,4,11,46,323,19,6,1,9,38,578.0,52 days 09:38:00,52.401389,7.485913,1.721642,0.14347,4527480.0,75458.0,1257.633333,1059 days 11:35:37.468346,34.809194,2.900766,1007 days 01:57:37.486294,33.087552,2.757296,0,0,1,0,EVEN,1,0,3.301927,8.0,0.0,0.0,57969,1,0,9,1,1.0,5,7,9,6,9


#### Length and Height

In [28]:
combined_df['height(m)'] = combined_df['height(cm)']/100

In [29]:
combined_df['length_scale_10'] = np.array(np.round((combined_df['length(m)'] * 10)), dtype='int')
combined_df['length_scale_100'] = np.array(np.round((combined_df['length(m)'] * 100)), dtype='int')

In [30]:
combined_df['height_scale_10'] = np.array(np.round((combined_df['height(m)'] * 10)), dtype='int')
combined_df['height_scale_100'] = np.array(np.round((combined_df['height(m)'] * 100)), dtype='int')
combined_df['height_scale_1000'] = np.array(np.round((combined_df['height(m)'] * 1000)), dtype='int')

In [31]:
combined_df['diag(cm)'] = np.sqrt(combined_df['length(m)']**2 + combined_df['height(m)']**2)*100
combined_df['cbrt(m)'] = np.cbrt(combined_df['length(m)']**2 + combined_df['height(m)']**2)
combined_df['area(m)'] = np.sqrt(combined_df['length(m)'] * combined_df['height(m)'])

In [32]:
combined_df['l=h'] = combined_df.apply(lambda row: 1 if np.round(row['length(m)'],1)==np.round(row['height(m)'],1) else 0, axis=1)
combined_df['l>h'] = combined_df.apply(lambda row: 1 if np.round(row['length(m)'],1)>np.round(row['height(m)'],1) else 0, axis=1)
combined_df['l<h'] = combined_df.apply(lambda row: 1 if np.round(row['length(m)'],1)<np.round(row['height(m)'],1) else 0, axis=1)

In [33]:
combined_df['pet_size'] = combined_df['diag(cm)'].apply(lambda x: 'SMALL' if x<=40 else 'MEDIUM' if x>40 and x<=80 else 'BIG')
dummy_val = pd.get_dummies(combined_df['pet_size'], prefix='pet_size')
combined_df = pd.concat([combined_df, dummy_val], axis=1)

#### Color Type

In [34]:
combined_df['subcolor_type1'] = combined_df['color_type'].apply(lambda x: x.split()[0])
dummy_val = pd.get_dummies(combined_df['subcolor_type1'], prefix='subcolor_type1')
combined_df = pd.concat([combined_df, dummy_val], axis=1)

In [35]:
combined_df['subcolor_type2'] = combined_df['color_type'].apply(lambda x: 'NONE' if len(x.split())==1 else x.split()[1])
dummy_val = pd.get_dummies(combined_df['subcolor_type2'], prefix='subcolor_type2')
combined_df = pd.concat([combined_df, dummy_val], axis=1)

In [36]:
combined_df["color_type_num_words"] = combined_df["color_type"].apply(lambda x: len(str(x).split()))
combined_df["color_type_num_chars"] = combined_df["color_type"].apply(lambda x: len(str(x)))

In [37]:
tfidf = TfidfVectorizer(analyzer='char')
features = tfidf.fit_transform(combined_df.color_type).toarray()
features_df = pd.DataFrame(features, columns=tfidf.get_feature_names())
print(features_df.shape)
combined_df = pd.merge(combined_df, features_df, left_index=True, right_index=True)

(26906, 24)


In [38]:
countvec = CountVectorizer(analyzer='char')
features = countvec.fit_transform(combined_df.color_type).toarray()
features_df = pd.DataFrame(features, columns=countvec.get_feature_names())
print(features_df.shape)
combined_df = pd.merge(combined_df, features_df, left_index=True, right_index=True)

(26906, 24)


#### X1 and X2

In [39]:
dummy_val = pd.get_dummies(combined_df['X1'], prefix='X1')
combined_df = pd.concat([combined_df, dummy_val], axis=1)

dummy_val = pd.get_dummies(combined_df['X2'], prefix='X2')
combined_df = pd.concat([combined_df, dummy_val], axis=1)

In [40]:
combined_df['X1_odd_even'] = combined_df['X1'].apply(lambda x: 'EVEN' if x%2==0 else 'ODD')
dummy_val = pd.get_dummies(combined_df['X1_odd_even'], prefix='X1_odd_even')
combined_df = pd.concat([combined_df, dummy_val], axis=1)

combined_df['X2_odd_even'] = combined_df['X2'].apply(lambda x: 'EVEN' if x%2==0 else 'ODD')
dummy_val = pd.get_dummies(combined_df['X2_odd_even'], prefix='X2_odd_even')
combined_df = pd.concat([combined_df, dummy_val], axis=1)

In [41]:
combined_df['Xavg'] = (combined_df['X1']+combined_df['X2'])/2
combined_df['Xdiff'] = np.abs(combined_df['X1']-combined_df['X2'])
combined_df['Xmul'] = combined_df['X1']*combined_df['X2']
combined_df['Xdiv'] = combined_df.apply(lambda row: 0 if row['X1']==0 or row['X2']==0 else row['X2']/row['X1'], axis=1)
combined_df['Xdiv'] = np.cbrt(combined_df['Xdiv'])
combined_df['Xmod'] = combined_df.apply(lambda row: 0 if row['X1']==0 or row['X2']==0 else row['X1']/row['X2'], axis=1)
combined_df['Xdiag'] = np.sqrt(combined_df['X1']**2 + combined_df['X2']**2)
combined_df['Xcbrt'] = np.cbrt(combined_df['X1']**2 + combined_df['X2']**2)

In [42]:
combined_df['X1_cat'] = combined_df['X1'].apply(lambda x: '<=6' if x<=6 else '6to13' if x>6 and x<=13 else '>13')
dummy_val = pd.get_dummies(combined_df['X1_cat'], prefix='X1_cat')
combined_df = pd.concat([combined_df, dummy_val], axis=1)

combined_df['X2_cat'] = combined_df['X2'].apply(lambda x: '<=3' if x<=3 else '4to7' if x>3 and x<=7 else '>7')
dummy_val = pd.get_dummies(combined_df['X2_cat'], prefix='X2_cat')
combined_df = pd.concat([combined_df, dummy_val], axis=1)

In [43]:
combined_df['X1=X2'] = combined_df.apply(lambda row: 1 if row['X1']==row['X2'] else 0, axis=1)
combined_df['X1>X2'] = combined_df.apply(lambda row: 1 if row['X1']>row['X2'] else 0, axis=1)
combined_df['X1<X2'] = combined_df.apply(lambda row: 1 if row['X1']<row['X2'] else 0, axis=1)

In [44]:
#combined_df['length/X1'] = combined_df.apply(lambda row: 0 if row['length(m)']==0 or row['X1']==0 else row['length(m)']/row['X1'], axis=1)
combined_df['length/X2'] = combined_df.apply(lambda row: 0 if row['length(m)']==0 or row['X2']==0 else row['length(m)']/row['X2'], axis=1)
combined_df['length/Xavg'] = combined_df.apply(lambda row: 0 if row['length(m)']==0 or row['Xavg']==0 else np.log1p(row['length(m)']/row['Xavg']), axis=1)
combined_df['length/Xdiag'] = combined_df.apply(lambda row: 0 if row['length(m)']==0 or row['Xdiag']==0 else np.log1p(row['length(m)']/row['Xdiag']), axis=1)

In [45]:
#combined_df['height/X1'] = combined_df.apply(lambda row: 0 if row['height(m)']==0 or row['X1']==0 else row['height(m)']/row['X1'], axis=1)
combined_df['height/X2'] = combined_df.apply(lambda row: 0 if row['height(m)']==0 or row['X2']==0 else row['height(m)']/row['X2'], axis=1)
combined_df['height/Xavg'] = combined_df.apply(lambda row: 0 if row['height(m)']==0 or row['Xavg']==0 else np.log1p(row['height(m)']/row['Xavg']), axis=1)
combined_df['height/Xdiag'] = combined_df.apply(lambda row: 0 if row['height(m)']==0 or row['Xdiag']==0 else np.log1p(row['height(m)']/row['Xdiag']), axis=1)

In [46]:
#combined_df['diag/X1'] = combined_df.apply(lambda row: 0 if row['diag(cm)']==0 or row['X1']==0 else row['diag(cm)']/row['X1'], axis=1)
combined_df['diag/X2'] = combined_df.apply(lambda row: 0 if row['diag(cm)']==0 or row['X2']==0 else row['diag(cm)']/row['X2'], axis=1)
combined_df['diag/Xavg'] = combined_df.apply(lambda row: 0 if row['diag(cm)']==0 or row['Xavg']==0 else np.log1p(row['diag(cm)']/row['Xavg']), axis=1)
combined_df['diag/Xdiag'] = combined_df.apply(lambda row: 0 if row['diag(cm)']==0 or row['Xdiag']==0 else np.log1p(row['diag(cm)']/row['Xdiag']), axis=1)

In [47]:
#combined_df['area/X1'] = combined_df.apply(lambda row: 0 if row['area(m)']==0 or row['X1']==0 else row['area(m)']/row['X1'], axis=1)
combined_df['area/X2'] = combined_df.apply(lambda row: 0 if row['area(m)']==0 or row['X2']==0 else row['area(m)']/row['X2'], axis=1)
combined_df['area/Xavg'] = combined_df.apply(lambda row: 0 if row['area(m)']==0 or row['Xavg']==0 else np.log1p(row['area(m)']/row['Xavg']), axis=1)
combined_df['area/Xdiag'] = combined_df.apply(lambda row: 0 if row['area(m)']==0 or row['Xdiag']==0 else np.log1p(row['area(m)']/row['Xdiag']), axis=1)

#### Drop redundant fields

In [48]:
combined_df.drop(['condition_odd_even','color_type', 'height(cm)', 'issue_date', 'listing_date', 'dt_diff', 'X1_cat', 'X2_cat', 'X1_odd_even', 
                  'X2_odd_even', 'pet_size', 'subcolor_type1', 'subcolor_type2', 'curr_iss_dt_diff', 'curr_lst_dt_diff'], axis=1, inplace=True)
combined_df.head()

Unnamed: 0,pet_id,condition,length(m),X1,X2,iss_dt_year,iss_dt_quarter,iss_dt_month,iss_dt_week,iss_dt_day_year,iss_dt_day_month,iss_dt_day_week,iss_dt_day_weekend,lst_dt_year,lst_dt_quarter,lst_dt_month,lst_dt_week,lst_dt_day_year,lst_dt_day_month,lst_dt_day_week,lst_dt_day_weekend,lst_dt_hour,lst_dt_minute,lst_dt_minutes_elapsed,dt_diff_days,dt_diff_weeks,dt_diff_mnths,dt_diff_yrs,dt_diff_sec,dt_diff_min,dt_diff_hrs,curr_iss_dt_diff_mnths,curr_iss_dt_diff_yrs,curr_lst_dt_diff_mnths,curr_lst_dt_diff_yrs,condition_0.0,condition_1.0,condition_2.0,condition_3.0,condition_odd_even_EVEN,...,X2_2,X2_3,X2_4,X2_5,X2_6,X2_7,X2_8,X2_9,X1_odd_even_EVEN,X1_odd_even_ODD,X2_odd_even_EVEN,X2_odd_even_ODD,Xavg,Xdiff,Xmul,Xdiv,Xmod,Xdiag,Xcbrt,X1_cat_6to13,X1_cat_<=6,X1_cat_>13,X2_cat_4to7,X2_cat_<=3,X2_cat_>7,X1=X2,X1>X2,X1<X2,length/X2,length/Xavg,length/Xdiag,height/X2,height/Xavg,height/Xdiag,diag/X2,diag/Xavg,diag/Xdiag,area/X2,area/Xavg,area/Xdiag
0,ANSL_69903,2.0,0.8,13,9,2016,3,7,27,192,10,6,1,2016,3,9,38,265,21,2,0,16,25,985.0,73.684028,10.52629,2.42088,0.20174,6366300.0,106105.0,1768.416667,49.429617,4.119135,47.008737,3.917395,0,0,1,0,1,...,0,0,0,0,0,0,0,1,0,1,0,1,11.0,4,117,0.88464,1.444444,15.811388,6.299605,1,0,0,0,0,1,0,1,0,0.088889,0.070204,0.049358,0.008644,0.007048,0.004908,8.930824,2.117103,1.805582,0.02772,0.022427,0.015655
1,ANSL_66892,1.0,0.72,13,9,2013,4,11,47,325,21,3,0,2018,4,12,52,361,27,3,0,17,47,1067.0,1862.740972,266.105853,61.200139,5.100012,160940820.0,2682347.0,44705.783333,81.036015,6.753001,19.835877,1.65299,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1,0,1,11.0,4,117,0.88464,1.444444,15.811388,6.299605,1,0,0,0,0,1,0,1,0,0.08,0.063402,0.04453,0.015767,0.012818,0.008935,8.153887,2.037494,1.73011,0.035515,0.028644,0.020014
2,ANSL_69750,3.0,0.15,15,4,2014,3,9,39,271,28,6,1,2016,4,10,42,293,19,2,0,8,24,504.0,752.35,107.478571,24.718372,2.059864,65003040.0,1083384.0,18056.4,70.818146,5.901512,46.099774,3.841648,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1,1,0,9.5,11,60,0.64366,3.75,15.524175,6.223084,0,0,1,1,0,0,0,1,0,0.0375,0.015666,0.009616,0.10225,0.042152,0.026005,10.890965,1.720204,1.33663,0.061922,0.025738,0.015829
3,ANSL_71623,1.0,0.62,0,1,2016,4,12,52,366,31,5,1,2019,1,1,4,25,25,4,0,18,30,1110.0,755.770833,107.967262,24.830763,2.06923,65298600.0,1088310.0,18138.5,43.712867,3.642739,18.882104,1.573509,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,1,0.5,1,0,0.0,0.0,1.0,1.0,0,1,0,0,1,0,0,0,1,0.62,0.806476,0.482426,0.1782,0.304834,0.163988,64.510095,4.86769,4.182204,0.332391,0.509695,0.286975
4,ANSL_57969,2.0,0.5,18,4,2017,3,9,39,271,28,3,0,2017,4,11,46,323,19,6,1,9,38,578.0,52.401389,7.485913,1.721642,0.14347,4527480.0,75458.0,1257.633333,34.809194,2.900766,33.087552,2.757296,0,0,1,0,1,...,0,0,1,0,0,0,0,0,1,0,1,0,11.0,14,72,0.605707,4.5,18.439089,6.979532,0,0,1,1,0,0,0,1,0,0.125,0.044452,0.026755,0.02765,0.010004,0.00598,12.802157,1.732598,1.328977,0.05879,0.021153,0.012673


#### Feature Tools

In [49]:
feature_df = pd.DataFrame()
feature_df = combined_df[['pet_id','height(m)','length(m)','diag(cm)','area(m)','pet_id_num','petid%X1','petid%X2',
                          'length_scale_10','length_scale_100','height_scale_10','height_scale_100','height_scale_1000',
                          'pet_id_0','pet_id_1','pet_id_2','pet_id_3','pet_id_4',
                          'Xavg','Xdiff','Xmul','Xdiv','Xmod','Xdiag','Xcbrt','condition','X1','X2']]
feature_df.head()

Unnamed: 0,pet_id,height(m),length(m),diag(cm),area(m),pet_id_num,petid%X1,petid%X2,length_scale_10,length_scale_100,height_scale_10,height_scale_100,height_scale_1000,pet_id_0,pet_id_1,pet_id_2,pet_id_3,pet_id_4,Xavg,Xdiff,Xmul,Xdiv,Xmod,Xdiag,Xcbrt,condition,X1,X2
0,ANSL_69903,0.0778,0.8,80.377412,0.249479,69903,2,0,8,80,1,8,78,6,9,9,0,3,11.0,4,117,0.88464,1.444444,15.811388,6.299605,2.0,13,9
1,ANSL_66892,0.1419,0.72,73.384986,0.319637,66892,7,4,7,72,1,14,142,6,6,8,9,2,11.0,4,117,0.88464,1.444444,15.811388,6.299605,1.0,13,9
2,ANSL_69750,0.409,0.15,43.563861,0.247689,69750,0,2,2,15,4,41,409,6,9,7,5,0,9.5,11,60,0.64366,3.75,15.524175,6.223084,3.0,15,4
3,ANSL_71623,0.1782,0.62,64.510095,0.332391,71623,0,0,6,62,2,18,178,7,1,6,2,3,0.5,1,0,0.0,0.0,1.0,1.0,1.0,0,1
4,ANSL_57969,0.1106,0.5,51.208628,0.23516,57969,9,1,5,50,1,11,111,5,7,9,6,9,11.0,14,72,0.605707,4.5,18.439089,6.979532,2.0,18,4


In [50]:
es = ft.EntitySet(id='Buddy')
es.entity_from_dataframe(entity_id='Breed_Category', dataframe=feature_df, index='pet_id')
es = es.normalize_entity(base_entity_id='Breed_Category', new_entity_id='condition', index='condition')
es = es.normalize_entity(base_entity_id='Breed_Category', new_entity_id='X1', index='X1')
es = es.normalize_entity(base_entity_id='Breed_Category', new_entity_id='X2', index='X2')
feature_matrix, feature_names = ft.dfs(entityset=es, target_entity='Breed_Category', max_depth=2, verbose=3, n_jobs=1)
print(feature_matrix.shape)
feature_matrix.head()

Built 474 features
Elapsed: 00:18 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks
(26906, 474)


Unnamed: 0_level_0,height(m),length(m),diag(cm),area(m),pet_id_num,petid%X1,petid%X2,length_scale_10,length_scale_100,height_scale_10,height_scale_100,height_scale_1000,pet_id_0,pet_id_1,pet_id_2,pet_id_3,pet_id_4,Xavg,Xdiff,Xmul,Xdiv,Xmod,Xdiag,Xcbrt,condition,X1,X2,condition.SUM(Breed_Category.height(m)),condition.SUM(Breed_Category.length(m)),condition.SUM(Breed_Category.diag(cm)),condition.SUM(Breed_Category.area(m)),condition.SUM(Breed_Category.pet_id_num),condition.SUM(Breed_Category.petid%X1),condition.SUM(Breed_Category.petid%X2),condition.SUM(Breed_Category.length_scale_10),condition.SUM(Breed_Category.length_scale_100),condition.SUM(Breed_Category.height_scale_10),condition.SUM(Breed_Category.height_scale_100),condition.SUM(Breed_Category.height_scale_1000),condition.SUM(Breed_Category.pet_id_0),...,X2.MIN(Breed_Category.pet_id_1),X2.MIN(Breed_Category.pet_id_2),X2.MIN(Breed_Category.pet_id_3),X2.MIN(Breed_Category.pet_id_4),X2.MIN(Breed_Category.Xavg),X2.MIN(Breed_Category.Xdiff),X2.MIN(Breed_Category.Xmul),X2.MIN(Breed_Category.Xdiv),X2.MIN(Breed_Category.Xmod),X2.MIN(Breed_Category.Xdiag),X2.MIN(Breed_Category.Xcbrt),X2.MEAN(Breed_Category.height(m)),X2.MEAN(Breed_Category.length(m)),X2.MEAN(Breed_Category.diag(cm)),X2.MEAN(Breed_Category.area(m)),X2.MEAN(Breed_Category.pet_id_num),X2.MEAN(Breed_Category.petid%X1),X2.MEAN(Breed_Category.petid%X2),X2.MEAN(Breed_Category.length_scale_10),X2.MEAN(Breed_Category.length_scale_100),X2.MEAN(Breed_Category.height_scale_10),X2.MEAN(Breed_Category.height_scale_100),X2.MEAN(Breed_Category.height_scale_1000),X2.MEAN(Breed_Category.pet_id_0),X2.MEAN(Breed_Category.pet_id_1),X2.MEAN(Breed_Category.pet_id_2),X2.MEAN(Breed_Category.pet_id_3),X2.MEAN(Breed_Category.pet_id_4),X2.MEAN(Breed_Category.Xavg),X2.MEAN(Breed_Category.Xdiff),X2.MEAN(Breed_Category.Xmul),X2.MEAN(Breed_Category.Xdiv),X2.MEAN(Breed_Category.Xmod),X2.MEAN(Breed_Category.Xdiag),X2.MEAN(Breed_Category.Xcbrt),X2.COUNT(Breed_Category),X2.NUM_UNIQUE(Breed_Category.condition),X2.NUM_UNIQUE(Breed_Category.X1),X2.MODE(Breed_Category.condition),X2.MODE(Breed_Category.X1)
pet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
ANSL_49970,0.4595,0.78,90.528462,0.598674,49970,0,4,8,78,5,46,460,4,9,9,7,0,3.5,7,0,0.0,0.0,7.0,3.659306,2.0,0,7,1666.0789,3047.46,367055.451273,2055.391497,387134965,26087,19155,30508,304746,16822,166580,1666074,35929,...,0,0,0,0,3.5,7,0,0.0,0.0,7.0,3.659306,0.274825,0.500182,60.373899,0.337672,63595.910742,0.0,3.002539,5.000781,50.018164,2.781641,27.482813,274.821875,5.9,4.097461,4.489453,4.499023,4.514258,3.5,7.0,0.0,0.0,0.0,7.0,3.659306,5120,4,1,1.0,0
ANSL_49971,0.0938,0.45,45.96721,0.205451,49971,12,3,4,45,1,9,94,4,9,9,7,1,11.0,4,117,0.88464,1.444444,15.811388,6.299605,2.0,13,9,1666.0789,3047.46,367055.451273,2055.391497,387134965,26087,19155,30508,304746,16822,166580,1666074,35929,...,0,0,0,0,6.0,4,27,0.808968,0.333333,9.486833,4.481405,0.273321,0.504561,60.566045,0.33934,63288.935056,6.343724,3.989199,5.049221,50.456112,2.761553,27.32554,273.321438,5.873393,4.052775,4.520372,4.570823,4.479218,11.275636,4.552912,121.961444,0.873979,1.505697,16.281021,6.420832,7314,4,4,2.0,13
ANSL_49972,0.2681,0.17,31.745489,0.213488,49972,0,0,2,17,3,27,268,4,9,9,7,2,0.5,1,0,0.0,0.0,1.0,1.0,1.0,0,1,2665.1375,4910.57,590283.003307,3299.567293,618074698,14307,12966,49086,491057,26955,266478,2665114,57329,...,0,0,0,0,0.5,1,0,0.0,0.0,1.0,1.0,0.274817,0.504802,60.717067,0.339898,63459.301341,0.573659,0.0,5.048071,50.480216,2.774526,27.479807,274.817364,5.881295,4.151161,4.461494,4.453728,4.504088,1.154554,1.947106,1.309107,0.093896,1.309107,2.140722,1.499028,12232,4,3,1.0,0
ANSL_49973,0.3674,0.89,96.285137,0.571827,49973,1,5,9,89,4,37,367,4,9,9,7,3,11.0,4,117,0.88464,1.444444,15.811388,6.299605,2.0,13,9,1666.0789,3047.46,367055.451273,2055.391497,387134965,26087,19155,30508,304746,16822,166580,1666074,35929,...,0,0,0,0,6.0,4,27,0.808968,0.333333,9.486833,4.481405,0.273321,0.504561,60.566045,0.33934,63288.935056,6.343724,3.989199,5.049221,50.456112,2.761553,27.32554,273.321438,5.873393,4.052775,4.520372,4.570823,4.479218,11.275636,4.552912,121.961444,0.873979,1.505697,16.281021,6.420832,7314,4,4,2.0,13
ANSL_49974,0.2317,0.72,75.636294,0.408441,49974,2,6,7,72,2,23,232,4,9,9,7,4,11.0,4,117,0.88464,1.444444,15.811388,6.299605,2.0,13,9,1666.0789,3047.46,367055.451273,2055.391497,387134965,26087,19155,30508,304746,16822,166580,1666074,35929,...,0,0,0,0,6.0,4,27,0.808968,0.333333,9.486833,4.481405,0.273321,0.504561,60.566045,0.33934,63288.935056,6.343724,3.989199,5.049221,50.456112,2.761553,27.32554,273.321438,5.873393,4.052775,4.520372,4.570823,4.479218,11.275636,4.552912,121.961444,0.873979,1.505697,16.281021,6.420832,7314,4,4,2.0,13


In [51]:
feature_matrix.drop(['height(m)','length(m)','diag(cm)','area(m)','pet_id_num','petid%X1','petid%X2',
                     'length_scale_10','length_scale_100','height_scale_10','height_scale_100','height_scale_1000',
                     'pet_id_0','pet_id_1','pet_id_2','pet_id_3','pet_id_4',
                     'Xavg','Xdiff','Xmul','Xdiv','Xmod','Xdiag','Xcbrt','condition','X1','X2'], axis=1, inplace=True)
feature_df = feature_matrix.copy()

In [52]:
combined_df.set_index('pet_id', inplace = True)
combined_df.drop(['condition','pet_id_num'], axis=1, inplace=True)
print(combined_df.shape)
combined_df.head()

(26906, 219)


Unnamed: 0_level_0,length(m),X1,X2,iss_dt_year,iss_dt_quarter,iss_dt_month,iss_dt_week,iss_dt_day_year,iss_dt_day_month,iss_dt_day_week,iss_dt_day_weekend,lst_dt_year,lst_dt_quarter,lst_dt_month,lst_dt_week,lst_dt_day_year,lst_dt_day_month,lst_dt_day_week,lst_dt_day_weekend,lst_dt_hour,lst_dt_minute,lst_dt_minutes_elapsed,dt_diff_days,dt_diff_weeks,dt_diff_mnths,dt_diff_yrs,dt_diff_sec,dt_diff_min,dt_diff_hrs,curr_iss_dt_diff_mnths,curr_iss_dt_diff_yrs,curr_lst_dt_diff_mnths,curr_lst_dt_diff_yrs,condition_0.0,condition_1.0,condition_2.0,condition_3.0,condition_odd_even_EVEN,condition_odd_even_ODD,condition*X1,...,X2_2,X2_3,X2_4,X2_5,X2_6,X2_7,X2_8,X2_9,X1_odd_even_EVEN,X1_odd_even_ODD,X2_odd_even_EVEN,X2_odd_even_ODD,Xavg,Xdiff,Xmul,Xdiv,Xmod,Xdiag,Xcbrt,X1_cat_6to13,X1_cat_<=6,X1_cat_>13,X2_cat_4to7,X2_cat_<=3,X2_cat_>7,X1=X2,X1>X2,X1<X2,length/X2,length/Xavg,length/Xdiag,height/X2,height/Xavg,height/Xdiag,diag/X2,diag/Xavg,diag/Xdiag,area/X2,area/Xavg,area/Xdiag
pet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
ANSL_69903,0.8,13,9,2016,3,7,27,192,10,6,1,2016,3,9,38,265,21,2,0,16,25,985.0,73.684028,10.52629,2.42088,0.20174,6366300.0,106105.0,1768.416667,49.429617,4.119135,47.008737,3.917395,0,0,1,0,1,0,2.962496,...,0,0,0,0,0,0,0,1,0,1,0,1,11.0,4,117,0.88464,1.444444,15.811388,6.299605,1,0,0,0,0,1,0,1,0,0.088889,0.070204,0.049358,0.008644,0.007048,0.004908,8.930824,2.117103,1.805582,0.02772,0.022427,0.015655
ANSL_66892,0.72,13,9,2013,4,11,47,325,21,3,0,2018,4,12,52,361,27,3,0,17,47,1067.0,1862.740972,266.105853,61.200139,5.100012,160940820.0,2682347.0,44705.783333,81.036015,6.753001,19.835877,1.65299,0,1,0,0,0,1,2.351335,...,0,0,0,0,0,0,0,1,0,1,0,1,11.0,4,117,0.88464,1.444444,15.811388,6.299605,1,0,0,0,0,1,0,1,0,0.08,0.063402,0.04453,0.015767,0.012818,0.008935,8.153887,2.037494,1.73011,0.035515,0.028644,0.020014
ANSL_69750,0.15,15,4,2014,3,9,39,271,28,6,1,2016,4,10,42,293,19,2,0,8,24,504.0,752.35,107.478571,24.718372,2.059864,65003040.0,1083384.0,18056.4,70.818146,5.901512,46.099774,3.841648,0,0,0,1,0,1,3.556893,...,0,0,1,0,0,0,0,0,0,1,1,0,9.5,11,60,0.64366,3.75,15.524175,6.223084,0,0,1,1,0,0,0,1,0,0.0375,0.015666,0.009616,0.10225,0.042152,0.026005,10.890965,1.720204,1.33663,0.061922,0.025738,0.015829
ANSL_71623,0.62,0,1,2016,4,12,52,366,31,5,1,2019,1,1,4,25,25,4,0,18,30,1110.0,755.770833,107.967262,24.830763,2.06923,65298600.0,1088310.0,18138.5,43.712867,3.642739,18.882104,1.573509,0,1,0,0,0,1,0.0,...,0,0,0,0,0,0,0,0,1,0,0,1,0.5,1,0,0.0,0.0,1.0,1.0,0,1,0,0,1,0,0,0,1,0.62,0.806476,0.482426,0.1782,0.304834,0.163988,64.510095,4.86769,4.182204,0.332391,0.509695,0.286975
ANSL_57969,0.5,18,4,2017,3,9,39,271,28,3,0,2017,4,11,46,323,19,6,1,9,38,578.0,52.401389,7.485913,1.721642,0.14347,4527480.0,75458.0,1257.633333,34.809194,2.900766,33.087552,2.757296,0,0,1,0,1,0,3.301927,...,0,0,1,0,0,0,0,0,1,0,1,0,11.0,14,72,0.605707,4.5,18.439089,6.979532,0,0,1,1,0,0,0,1,0,0.125,0.044452,0.026755,0.02765,0.010004,0.00598,12.802157,1.732598,1.328977,0.05879,0.021153,0.012673


In [53]:
temp_df = pd.merge(combined_df, feature_df, how='inner', on='pet_id', sort=False, suffixes=('_x', '_y'))
print(temp_df.shape)
temp_df.head()

(26906, 666)


Unnamed: 0_level_0,length(m),X1,X2,iss_dt_year,iss_dt_quarter,iss_dt_month,iss_dt_week,iss_dt_day_year,iss_dt_day_month,iss_dt_day_week,iss_dt_day_weekend,lst_dt_year,lst_dt_quarter,lst_dt_month,lst_dt_week,lst_dt_day_year,lst_dt_day_month,lst_dt_day_week,lst_dt_day_weekend,lst_dt_hour,lst_dt_minute,lst_dt_minutes_elapsed,dt_diff_days,dt_diff_weeks,dt_diff_mnths,dt_diff_yrs,dt_diff_sec,dt_diff_min,dt_diff_hrs,curr_iss_dt_diff_mnths,curr_iss_dt_diff_yrs,curr_lst_dt_diff_mnths,curr_lst_dt_diff_yrs,condition_0.0,condition_1.0,condition_2.0,condition_3.0,condition_odd_even_EVEN,condition_odd_even_ODD,condition*X1,...,X2.MIN(Breed_Category.pet_id_1),X2.MIN(Breed_Category.pet_id_2),X2.MIN(Breed_Category.pet_id_3),X2.MIN(Breed_Category.pet_id_4),X2.MIN(Breed_Category.Xavg),X2.MIN(Breed_Category.Xdiff),X2.MIN(Breed_Category.Xmul),X2.MIN(Breed_Category.Xdiv),X2.MIN(Breed_Category.Xmod),X2.MIN(Breed_Category.Xdiag),X2.MIN(Breed_Category.Xcbrt),X2.MEAN(Breed_Category.height(m)),X2.MEAN(Breed_Category.length(m)),X2.MEAN(Breed_Category.diag(cm)),X2.MEAN(Breed_Category.area(m)),X2.MEAN(Breed_Category.pet_id_num),X2.MEAN(Breed_Category.petid%X1),X2.MEAN(Breed_Category.petid%X2),X2.MEAN(Breed_Category.length_scale_10),X2.MEAN(Breed_Category.length_scale_100),X2.MEAN(Breed_Category.height_scale_10),X2.MEAN(Breed_Category.height_scale_100),X2.MEAN(Breed_Category.height_scale_1000),X2.MEAN(Breed_Category.pet_id_0),X2.MEAN(Breed_Category.pet_id_1),X2.MEAN(Breed_Category.pet_id_2),X2.MEAN(Breed_Category.pet_id_3),X2.MEAN(Breed_Category.pet_id_4),X2.MEAN(Breed_Category.Xavg),X2.MEAN(Breed_Category.Xdiff),X2.MEAN(Breed_Category.Xmul),X2.MEAN(Breed_Category.Xdiv),X2.MEAN(Breed_Category.Xmod),X2.MEAN(Breed_Category.Xdiag),X2.MEAN(Breed_Category.Xcbrt),X2.COUNT(Breed_Category),X2.NUM_UNIQUE(Breed_Category.condition),X2.NUM_UNIQUE(Breed_Category.X1),X2.MODE(Breed_Category.condition),X2.MODE(Breed_Category.X1)
pet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
ANSL_69903,0.8,13,9,2016,3,7,27,192,10,6,1,2016,3,9,38,265,21,2,0,16,25,985.0,73.684028,10.52629,2.42088,0.20174,6366300.0,106105.0,1768.416667,49.429617,4.119135,47.008737,3.917395,0,0,1,0,1,0,2.962496,...,0,0,0,0,6.0,4,27,0.808968,0.333333,9.486833,4.481405,0.273321,0.504561,60.566045,0.33934,63288.935056,6.343724,3.989199,5.049221,50.456112,2.761553,27.32554,273.321438,5.873393,4.052775,4.520372,4.570823,4.479218,11.275636,4.552912,121.961444,0.873979,1.505697,16.281021,6.420832,7314,4,4,2.0,13
ANSL_66892,0.72,13,9,2013,4,11,47,325,21,3,0,2018,4,12,52,361,27,3,0,17,47,1067.0,1862.740972,266.105853,61.200139,5.100012,160940820.0,2682347.0,44705.783333,81.036015,6.753001,19.835877,1.65299,0,1,0,0,0,1,2.351335,...,0,0,0,0,6.0,4,27,0.808968,0.333333,9.486833,4.481405,0.273321,0.504561,60.566045,0.33934,63288.935056,6.343724,3.989199,5.049221,50.456112,2.761553,27.32554,273.321438,5.873393,4.052775,4.520372,4.570823,4.479218,11.275636,4.552912,121.961444,0.873979,1.505697,16.281021,6.420832,7314,4,4,2.0,13
ANSL_69750,0.15,15,4,2014,3,9,39,271,28,6,1,2016,4,10,42,293,19,2,0,8,24,504.0,752.35,107.478571,24.718372,2.059864,65003040.0,1083384.0,18056.4,70.818146,5.901512,46.099774,3.841648,0,0,0,1,0,1,3.556893,...,0,0,0,0,2.0,0,0,0.0,0.0,4.0,2.519842,0.275164,0.511067,61.295001,0.34244,63203.620553,6.961604,1.569735,5.109543,51.106719,2.786561,27.516093,275.166573,5.868436,4.017504,4.515528,4.56917,4.512705,9.436194,11.388481,59.489554,0.665207,3.718097,15.602776,6.180631,1771,4,9,3.0,15
ANSL_71623,0.62,0,1,2016,4,12,52,366,31,5,1,2019,1,1,4,25,25,4,0,18,30,1110.0,755.770833,107.967262,24.830763,2.06923,65298600.0,1088310.0,18138.5,43.712867,3.642739,18.882104,1.573509,0,1,0,0,0,1,0.0,...,0,0,0,0,0.5,1,0,0.0,0.0,1.0,1.0,0.274817,0.504802,60.717067,0.339898,63459.301341,0.573659,0.0,5.048071,50.480216,2.774526,27.479807,274.817364,5.881295,4.151161,4.461494,4.453728,4.504088,1.154554,1.947106,1.309107,0.093896,1.309107,2.140722,1.499028,12232,4,3,1.0,0
ANSL_57969,0.5,18,4,2017,3,9,39,271,28,3,0,2017,4,11,46,323,19,6,1,9,38,578.0,52.401389,7.485913,1.721642,0.14347,4527480.0,75458.0,1257.633333,34.809194,2.900766,33.087552,2.757296,0,0,1,0,1,0,3.301927,...,0,0,0,0,2.0,0,0,0.0,0.0,4.0,2.519842,0.275164,0.511067,61.295001,0.34244,63203.620553,6.961604,1.569735,5.109543,51.106719,2.786561,27.516093,275.166573,5.868436,4.017504,4.515528,4.56917,4.512705,9.436194,11.388481,59.489554,0.665207,3.718097,15.602776,6.180631,1771,4,9,3.0,15


In [54]:
temp_df.fillna(0, inplace=True)
temp_df[temp_df.isin([np.nan, np.inf, -np.inf]).any(1)].head()

Unnamed: 0_level_0,length(m),X1,X2,iss_dt_year,iss_dt_quarter,iss_dt_month,iss_dt_week,iss_dt_day_year,iss_dt_day_month,iss_dt_day_week,iss_dt_day_weekend,lst_dt_year,lst_dt_quarter,lst_dt_month,lst_dt_week,lst_dt_day_year,lst_dt_day_month,lst_dt_day_week,lst_dt_day_weekend,lst_dt_hour,lst_dt_minute,lst_dt_minutes_elapsed,dt_diff_days,dt_diff_weeks,dt_diff_mnths,dt_diff_yrs,dt_diff_sec,dt_diff_min,dt_diff_hrs,curr_iss_dt_diff_mnths,curr_iss_dt_diff_yrs,curr_lst_dt_diff_mnths,curr_lst_dt_diff_yrs,condition_0.0,condition_1.0,condition_2.0,condition_3.0,condition_odd_even_EVEN,condition_odd_even_ODD,condition*X1,...,X2.MIN(Breed_Category.pet_id_1),X2.MIN(Breed_Category.pet_id_2),X2.MIN(Breed_Category.pet_id_3),X2.MIN(Breed_Category.pet_id_4),X2.MIN(Breed_Category.Xavg),X2.MIN(Breed_Category.Xdiff),X2.MIN(Breed_Category.Xmul),X2.MIN(Breed_Category.Xdiv),X2.MIN(Breed_Category.Xmod),X2.MIN(Breed_Category.Xdiag),X2.MIN(Breed_Category.Xcbrt),X2.MEAN(Breed_Category.height(m)),X2.MEAN(Breed_Category.length(m)),X2.MEAN(Breed_Category.diag(cm)),X2.MEAN(Breed_Category.area(m)),X2.MEAN(Breed_Category.pet_id_num),X2.MEAN(Breed_Category.petid%X1),X2.MEAN(Breed_Category.petid%X2),X2.MEAN(Breed_Category.length_scale_10),X2.MEAN(Breed_Category.length_scale_100),X2.MEAN(Breed_Category.height_scale_10),X2.MEAN(Breed_Category.height_scale_100),X2.MEAN(Breed_Category.height_scale_1000),X2.MEAN(Breed_Category.pet_id_0),X2.MEAN(Breed_Category.pet_id_1),X2.MEAN(Breed_Category.pet_id_2),X2.MEAN(Breed_Category.pet_id_3),X2.MEAN(Breed_Category.pet_id_4),X2.MEAN(Breed_Category.Xavg),X2.MEAN(Breed_Category.Xdiff),X2.MEAN(Breed_Category.Xmul),X2.MEAN(Breed_Category.Xdiv),X2.MEAN(Breed_Category.Xmod),X2.MEAN(Breed_Category.Xdiag),X2.MEAN(Breed_Category.Xcbrt),X2.COUNT(Breed_Category),X2.NUM_UNIQUE(Breed_Category.condition),X2.NUM_UNIQUE(Breed_Category.X1),X2.MODE(Breed_Category.condition),X2.MODE(Breed_Category.X1)
pet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1


In [55]:
combined_df = temp_df.copy()

### Create train and predict Numpy arrays

In [65]:
# Segregate combined_df into train/predict datasets
train_x = combined_df[:18834].values
predict_x = combined_df[18834:].values

print("train_x: {}".format(train_x.shape))
print("predict_x: {}".format(predict_x.shape))

train_x: (18834, 666)
predict_x: (8072, 666)


In [66]:
# Scale the train_x/predict_x arrays
#scaler = MinMaxScaler().fit(train_x)
scaler = StandardScaler().fit(train_x)
train_x_pc = scaler.transform(train_x)
Xpredict = scaler.transform(predict_x)

### Feature Selection

In [67]:
pca = PCA(.999)
pca.fit(train_x_pc)

PCA(copy=True, iterated_power='auto', n_components=0.999, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [68]:
train_x_pc = pca.transform(train_x_pc)
Xpredict = pca.transform(Xpredict)

print("train_x_pc: {} \nXpredict: {}".format(train_x_pc.shape, Xpredict.shape))

train_x_pc: (18834, 100) 
Xpredict: (8072, 100)


### Split training data into train/ validation/ test datasets

In [69]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=10)
for train_index, test_index in sss.split(train_x_pc, train_y_pc):
    Xtrain_pc, Xvalidation_pc = train_x_pc[train_index], train_x_pc[test_index]
    Ytrain_pc, Yvalidation_pc = train_y_pc[train_index], train_y_pc[test_index]

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=20)
for train_index, test_index in sss.split(Xvalidation_pc, Yvalidation_pc):
    Xvalidation_pc, Xtest_pc = Xvalidation_pc[train_index], Xvalidation_pc[test_index]
    Yvalidation_pc, Ytest_pc = Yvalidation_pc[train_index], Yvalidation_pc[test_index]


print("------------------------- Training Dataset -------------------------")
print("Xtrain_pc shape: {}".format(Xtrain_pc.shape))
print("Ytrain_pc shape: {}".format(Ytrain_pc.shape))

print("\n------------------------- Validation Dataset -------------------------")
print("Xvalidation_pc shape: {}".format(Xvalidation_pc.shape))
print("Yvalidation_pc shape: {}".format(Yvalidation_pc.shape))

print("\n------------------------- Test Dataset -------------------------")
print("Xtest_pc shape: {}".format(Xtest_pc.shape))
print("Ytest_pc shape: {}".format(Ytest_pc.shape))

print("\n------------------------- Prediction Dataset -------------------------")
print("Xpredict shape: {}".format(Xpredict.shape))

------------------------- Training Dataset -------------------------
Xtrain_pc shape: (15067, 100)
Ytrain_pc shape: (15067, 1)

------------------------- Validation Dataset -------------------------
Xvalidation_pc shape: (2260, 100)
Yvalidation_pc shape: (2260, 1)

------------------------- Test Dataset -------------------------
Xtest_pc shape: (1507, 100)
Ytest_pc shape: (1507, 1)

------------------------- Prediction Dataset -------------------------
Xpredict shape: (8072, 100)


### Handling class imbalance

In [70]:
#sm = SMOTE()
sm = ADASYN()
sm_x, sm_y = sm.fit_sample(Xtrain_pc, Ytrain_pc.ravel())
Xtrain_pc = sm_x
Ytrain_pc = np.array([sm_y]).T
print("Class balancing done.")
print("Xtrain_pc shape: {}".format(Xtrain_pc.shape))
print("Ytrain_pc shape: {}".format(Ytrain_pc.shape))


Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.


Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.


Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.



Class balancing done.
Xtrain_pc shape: (34542, 100)
Ytrain_pc shape: (34542, 1)


In [71]:
temp_df = pd.DataFrame(Ytrain_pc, columns=['Class'])
temp_df.groupby(['Class']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,Class,count
0,0,8506
1,1,8944
2,2,8497
3,3,8595


In [72]:
temp_df = pd.DataFrame(Yvalidation_pc, columns=['Class'])
temp_df.groupby(['Class']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,Class,count
0,0,11
1,1,862
2,2,1274
3,3,113


In [73]:
temp_df = pd.DataFrame(Ytest_pc, columns=['Class'])
temp_df.groupby(['Class']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,Class,count
0,0,7
1,1,575
2,2,850
3,3,75


## Save the datasets in NPZ file (for reusability)

In [74]:
np.savez_compressed('/content/drive/My Drive/Colab Notebooks/Adopt a Buddy/NPZ Files/Pet_Category_Dataset.npz',
                    Xtrain_pc=Xtrain_pc, Ytrain_pc=Ytrain_pc,
                    Xvalidation_pc=Xvalidation_pc, Yvalidation_pc=Yvalidation_pc,
                    Xtest_pc=Xtest_pc, Ytest_pc=Ytest_pc,
                    Xpredict=Xpredict)

## Load datasets from the NPZ file

In [75]:
processed_dataset = np.load('/content/drive/My Drive/Colab Notebooks/Adopt a Buddy/NPZ Files/Pet_Category_Dataset.npz', allow_pickle=True)
Xtrain_pc, Ytrain_pc = processed_dataset['Xtrain_pc'], processed_dataset['Ytrain_pc']
Xvalidation_pc, Yvalidation_pc = processed_dataset['Xvalidation_pc'], processed_dataset['Yvalidation_pc']
Xtest_pc, Ytest_pc = processed_dataset['Xtest_pc'], processed_dataset['Ytest_pc']
Xpredict = processed_dataset['Xpredict']

print("------------------------- Training Dataset -------------------------")
print("Xtrain_pc shape: {}".format(Xtrain_pc.shape))
print("Ytrain_pc shape: {}".format(Ytrain_pc.shape))

print("\n------------------------- Validation Dataset -------------------------")
print("Xvalidation_pc shape: {}".format(Xvalidation_pc.shape))
print("Yvalidation_pc shape: {}".format(Yvalidation_pc.shape))

print("\n------------------------- Test Dataset -------------------------")
print("Xtest_pc shape: {}".format(Xtest_pc.shape))
print("Ytest_pc shape: {}".format(Ytest_pc.shape))

print("\n------------------------- Prediction Dataset -------------------------")
print("Xpredict shape: {}".format(Xpredict.shape))

------------------------- Training Dataset -------------------------
Xtrain_pc shape: (34542, 100)
Ytrain_pc shape: (34542, 1)

------------------------- Validation Dataset -------------------------
Xvalidation_pc shape: (2260, 100)
Yvalidation_pc shape: (2260, 1)

------------------------- Test Dataset -------------------------
Xtest_pc shape: (1507, 100)
Ytest_pc shape: (1507, 1)

------------------------- Prediction Dataset -------------------------
Xpredict shape: (8072, 100)


## Pet Category Prediction

### Hyperparameter search using Optuna

In [76]:
def objective(trial):
    
    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_class": 4,
        "is_unbalance": True,
        "max_bin": 512,
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 1e-1),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-4, 1.0),
        "num_leaves": trial.suggest_int("num_leaves", 50, 3000),
        "max_depth": trial.suggest_int("max_depth", 7, 12),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.5, 0.7),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.5, 0.7),
        "bagging_freq": trial.suggest_int("bagging_freq", 5, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 15),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-3, 1e-1)
    }
    
    lgtrain = lgb.Dataset(Xtrain_pc, label=Ytrain_pc.ravel())
    lgvalidation = lgb.Dataset(Xvalidation_pc, label=Yvalidation_pc.ravel())
        
    model = lgb.train(params, lgtrain, valid_sets=[lgvalidation], 
                      num_boost_round=5000, early_stopping_rounds=100, 
                      verbose_eval=False)
    
    y_pred = model.predict(Xtest_pc, num_iteration=model.best_iteration)
    y_pred_binary = np.array([np.argmax(y_pred, axis=1)]).T
    f1 = f1_score(Ytest_pc, y_pred_binary, average='weighted')
    return f1

In [77]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

[I 2020-08-22 11:43:03,832] Trial 0 finished with value: 0.8759541525860076 and parameters: {'learning_rate': 0.029152492317020103, 'lambda_l2': 0.0007235030305763655, 'num_leaves': 588, 'max_depth': 10, 'feature_fraction': 0.6312077278218008, 'bagging_fraction': 0.6690764164365671, 'bagging_freq': 5, 'min_child_samples': 15, 'min_child_weight': 0.00575969932111108}. Best is trial 0 with value: 0.8759541525860076.
[I 2020-08-22 11:45:16,579] Trial 1 finished with value: 0.8687382093894845 and parameters: {'learning_rate': 0.033359217563393585, 'lambda_l2': 0.05306053706030071, 'num_leaves': 615, 'max_depth': 7, 'feature_fraction': 0.6711065526616959, 'bagging_fraction': 0.5346309846082608, 'bagging_freq': 11, 'min_child_samples': 7, 'min_child_weight': 0.004568900290824592}. Best is trial 0 with value: 0.8759541525860076.
[I 2020-08-22 11:47:30,177] Trial 2 finished with value: 0.8772003482146279 and parameters: {'learning_rate': 0.06956970762744775, 'lambda_l2': 0.7811146611783196, 'n

KeyboardInterrupt: ignored

In [78]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))

print("Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))

Number of finished trials: 23
Best trial:
Value: 0.8772003482146279
Params: 
 learning_rate: 0.06956970762744775
 lambda_l2: 0.7811146611783196
 num_leaves: 2633
 max_depth: 9
 feature_fraction: 0.6483677482716714
 bagging_fraction: 0.5562956851898881
 bagging_freq: 14
 min_child_samples: 2
 min_child_weight: 0.0031567280177617935


### Build the model for pet-category prediction

In [79]:
# Define model hyperparameters
params = {}
params["objective"] = 'multiclass'
params['metric'] = 'multi_logloss'
params['boosting'] = 'gbdt'
params['num_class'] = 4
params['is_unbalance'] = True
params['max_bin'] = 512
params["learning_rate"] = 0.0696
params["lambda_l2"] = 0.7811
params["num_leaves"] = 2633
params["max_depth"] = 9
params["feature_fraction"] = 0.6484
params["bagging_fraction"] = 0.5563
params["bagging_freq"] = 14
params["bagging_seed"] = 10
params["min_data_in_leaf"] = 2
params["min_child_weight"] = 0.003157
params["verbosity"] = -1
num_rounds = 5000

In [81]:
# Define K-fold cross validation test harness
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=25)
y_pred = 0

In [82]:
Xtrain_tmp = np.concatenate((Xtrain_pc, Xvalidation_pc), axis=0)
Ytrain_tmp = np.concatenate((Ytrain_pc, Yvalidation_pc), axis=0)

In [83]:
# Train the model using K-fold
counter = 0

for train, val in kfold.split(Xtrain_tmp, Ytrain_tmp):
    counter += 1

    train_x, train_y = Xtrain_tmp[train], Ytrain_tmp[train]
    val_x, val_y = Xtrain_tmp[val], Ytrain_tmp[val]
    
    lgtrain = lgb.Dataset(train_x, label=train_y.ravel())
    lgvalidation = lgb.Dataset(val_x, label=val_y.ravel())

    model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtrain, lgvalidation], 
                      early_stopping_rounds=200, verbose_eval=100)
    pred = model.predict(Xtest_pc, num_iteration=model.best_iteration)

    y_pred += pred

y_pred /= float(counter)

Training until validation scores don't improve for 200 rounds.
[100]	training's multi_logloss: 0.0927452	valid_1's multi_logloss: 0.162467
[200]	training's multi_logloss: 0.02222	valid_1's multi_logloss: 0.111995
[300]	training's multi_logloss: 0.00893555	valid_1's multi_logloss: 0.101926
[400]	training's multi_logloss: 0.0039146	valid_1's multi_logloss: 0.0998253
[500]	training's multi_logloss: 0.00245752	valid_1's multi_logloss: 0.101079
[600]	training's multi_logloss: 0.00183161	valid_1's multi_logloss: 0.102116
Early stopping, best iteration is:
[401]	training's multi_logloss: 0.00389169	valid_1's multi_logloss: 0.0998101
Training until validation scores don't improve for 200 rounds.
[100]	training's multi_logloss: 0.0872585	valid_1's multi_logloss: 0.166498
[200]	training's multi_logloss: 0.0215118	valid_1's multi_logloss: 0.11565
[300]	training's multi_logloss: 0.00768692	valid_1's multi_logloss: 0.103796
[400]	training's multi_logloss: 0.0034013	valid_1's multi_logloss: 0.101074

### Validate the model

In [84]:
# Print log_loss, accuracy and F1-score
y_pred_binary = np.array([np.argmax(y_pred, axis=1)]).T
f1 = f1_score(Ytest_pc, y_pred_binary, average='weighted')
print('Overall F1-Score:', f1)

Overall F1-Score: 0.8743913066242341


In [85]:
print(classification_report(Ytest_pc, y_pred_binary))

              precision    recall  f1-score   support

           0       0.50      0.29      0.36         7
           1       0.86      0.85      0.85       575
           2       0.89      0.91      0.90       850
           3       0.85      0.77      0.81        75

    accuracy                           0.88      1507
   macro avg       0.78      0.70      0.73      1507
weighted avg       0.87      0.88      0.87      1507



### Train model on entire data

In [86]:
# Define K-fold cross validation test harness
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=60)
y_pred_pc = 0

In [87]:
Xtrain_tmp = np.concatenate((Xtrain_pc, Xvalidation_pc, Xtest_pc), axis=0)
Ytrain_tmp = np.concatenate((Ytrain_pc, Yvalidation_pc, Ytest_pc), axis=0)

In [88]:
# Train the model using K-fold
counter = 0

for train, val in kfold.split(Xtrain_tmp, Ytrain_tmp):
    counter += 1

    train_x, train_y = Xtrain_tmp[train], Ytrain_tmp[train]
    val_x, val_y = Xtrain_tmp[val], Ytrain_tmp[val]
    
    lgtrain = lgb.Dataset(train_x, label=train_y.ravel())
    lgvalidation = lgb.Dataset(val_x, label=val_y.ravel())

    model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtrain, lgvalidation], 
                      early_stopping_rounds=200, verbose_eval=100)
    pred = model.predict(Xpredict, num_iteration=model.best_iteration)

    y_pred_pc += pred

y_pred_pc /= float(counter)

Training until validation scores don't improve for 200 rounds.
[100]	training's multi_logloss: 0.0943728	valid_1's multi_logloss: 0.166107
[200]	training's multi_logloss: 0.0245194	valid_1's multi_logloss: 0.11898
[300]	training's multi_logloss: 0.00858854	valid_1's multi_logloss: 0.108979
[400]	training's multi_logloss: 0.00384363	valid_1's multi_logloss: 0.106193
[500]	training's multi_logloss: 0.00215154	valid_1's multi_logloss: 0.106435
[600]	training's multi_logloss: 0.00134757	valid_1's multi_logloss: 0.107875
Early stopping, best iteration is:
[426]	training's multi_logloss: 0.0032504	valid_1's multi_logloss: 0.105833
Training until validation scores don't improve for 200 rounds.
[100]	training's multi_logloss: 0.094525	valid_1's multi_logloss: 0.166162
[200]	training's multi_logloss: 0.0234522	valid_1's multi_logloss: 0.11532
[300]	training's multi_logloss: 0.00823136	valid_1's multi_logloss: 0.104186
[400]	training's multi_logloss: 0.00387173	valid_1's multi_logloss: 0.101776


## Create submission file

In [89]:
y_pred_pc_binary = np.array([np.argmax(y_pred_pc, axis=1)]).T
predict_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Adopt a Buddy/Dataset/test.csv")
submit_df = pd.DataFrame()
submit_df['pet_id'] = predict_df['pet_id']
submit_df['pet_category'] = y_pred_pc_binary
submit_df['pet_category'] = submit_df['pet_category'].apply(lambda x: 4 if x==3 else x)
submit_df.head()

Unnamed: 0,pet_id,pet_category
0,ANSL_75005,2
1,ANSL_76663,1
2,ANSL_58259,2
3,ANSL_67171,2
4,ANSL_72871,2


In [90]:
submit_df.to_csv("/content/drive/My Drive/Colab Notebooks/Adopt a Buddy/Predictions/predictions_v57_pc_LGB.csv", index=False)