# Machine Learning Final Project
# Pet Adoption Prediction

### Author: Siran Fang, Jiaying Du, Yanan Wu

The goal of this project is predicting the speed at which a pet is adopted, based on the pet’s listing on PetFinder. Sometimes a profile represents a group of pets. In this case, the speed of adoption is determined by the speed at which all of the pets are adopted. The data included text, tabular, and image data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

## Import Data

In [2]:
df = pd.read_csv('train.csv')
breed = pd.read_csv('breed_labels.csv')
image = pd.read_csv('img_features.csv')

In [3]:
image

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,246,247,248,249,250,251,252,253,254,255
0,86e1089a3,0.002000,0.167800,0.019715,0.015896,0.068162,0.002216,0.005042,0.004828,0.050760,...,0.787699,0.176626,0.575706,1.088628,0.439556,0.520460,1.547071,0.832573,0.599093,0.763348
1,6296e909a,0.002858,0.107450,0.019916,0.023482,0.174765,0.002297,0.005031,0.006338,0.083378,...,0.628259,0.686865,0.564000,0.968190,1.070276,1.545742,0.894409,0.838595,0.468238,0.916672
2,3422e4906,0.002734,0.072015,0.024455,0.018021,0.154207,0.001946,0.004211,0.001576,0.100046,...,0.579116,0.557625,1.131405,0.720513,1.496671,0.870955,1.289683,1.184462,0.465114,0.892826
3,5842f1ff5,0.002106,0.274519,0.054815,0.013727,0.089969,0.001650,0.005506,0.004295,0.118727,...,1.295853,0.326143,0.291669,1.608086,1.119176,1.470889,0.591444,0.832755,0.483021,1.134126
4,850a43f90,0.002185,0.174022,0.044818,0.016244,0.169775,0.002075,0.004421,0.004157,0.099671,...,1.092663,0.669894,0.395784,0.886075,1.219730,1.033966,1.065686,0.304054,0.438069,0.676817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14988,dc0935a84,0.002556,0.094772,0.016550,0.020388,0.129347,0.002535,0.004151,0.005106,0.156753,...,1.236135,0.195013,0.346327,0.849923,1.388868,0.933430,0.977991,0.762702,0.239418,0.880098
14989,a01ab5b30,0.002194,0.120590,0.026668,0.018456,0.078149,0.002028,0.009793,0.003089,0.198529,...,0.905649,0.281715,1.530005,1.329902,0.533165,1.057960,1.615979,0.942733,1.817291,0.461272
14990,d981b6395,0.002495,0.091517,0.045028,0.021851,0.081367,0.002516,0.004439,0.004840,0.108002,...,0.793473,0.326840,0.928414,0.805518,0.943638,0.980733,1.558713,0.406569,0.963960,0.767038
14991,e4da1c9e4,0.001927,0.089233,0.022045,0.014881,0.110134,0.003246,0.004836,0.005267,0.057644,...,0.384883,0.159538,1.090614,1.247143,0.290542,0.896352,1.413012,0.383327,1.020868,0.384756


### Name

In [551]:
df['Unnamed/YN'] = 0
df.loc[df['Name'] == 'No Name Yet', 'Unnamed/YN'] = 1
df.loc[df['Name'] == 'No Name', 'Unnamed/YN'] = 1
df.loc[df['Name'] == 'Unknown', 'Unnamed/YN'] = 1
df.loc[df['Name'].isnull(), 'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("Name",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("name",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("Unknown",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.len() == 3, 'Unnamed/YN'] = 1
df.loc[df['Name'].str.len() == 2, 'Unnamed/YN'] = 1
df.loc[df['Name'].str.len() == 1, 'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("Puppies",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("Doggie",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("Kitty",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("Kittens",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("!",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("Zone",na = False),'Unnamed/YN'] = 1

In [552]:
df['Unnamed/YN'].value_counts()

0    11966
1     3027
Name: Unnamed/YN, dtype: int64

### Pure_Breeds

In [553]:
# Pure_Breed
df['Pure_Breed/YN'] = 0
df.loc[df['Breed2'] == 0, 'Pure_Breed/YN'] = 1
df.loc[df['Breed1'] == 307, 'Pure_Breed/YN'] = 0

print(f"Rate of pure breed pets in train data: {df['Pure_Breed/YN'].sum() * 100 / df['Pure_Breed/YN'].shape[0]:.4f}%.")

Rate of pure breed pets in train data: 41.5327%.


In [554]:
# Pure_BreedScore
df['Pure_BreedScore'] = 0
df.loc[df['Pure_Breed/YN'] == 1, 'Pure_BreedScore'] = df['Breed1']

In [555]:
breeds_dict = {k: v for k, v in zip(breed['BreedID'], breed['BreedName'])}

In [556]:
df['Pure_BreedName'] = df['Pure_BreedScore'].apply(lambda x: '_'.join(breeds_dict[x].split()) 
                                                   if x in breeds_dict else 'Mixed_Breed')

In [557]:
a = df['Pure_BreedName'].value_counts()
a.head(10)

Mixed_Breed             8766
Domestic_Short_Hair     3058
Domestic_Medium_Hair     984
Tabby                    247
Domestic_Long_Hair       178
Shih_Tzu                 149
Poodle                   121
Siamese                  120
Golden_Retriever          94
Persian                   76
Name: Pure_BreedName, dtype: int64

In [558]:
df['Breed1_name'] = df['Breed1'].apply(lambda x: '_'.join(breeds_dict[x].split()) 
                                                   if x in breeds_dict else 'Other')
df['Breed2_name'] = df['Breed2'].apply(lambda x: '_'.join(breeds_dict[x].split()) 
                                                   if x in breeds_dict else 'Pure')

In [559]:
df_1 = pd.get_dummies(df['Breed1_name'])
df_2 = pd.get_dummies(df['Breed2_name'])

In [560]:
a = df_1.columns.difference(df_2.columns)
column2 = ['Airedale_Terrier', 'American_Bulldog', 'American_Water_Spaniel',
       'American_Wirehair', 'Australian_Kelpie', 'Bedlington_Terrier',
       'Black_Labrador_Retriever', 'Black_Mouth_Cur', 'Boston_Terrier',
       'Burmilla', 'Cattle_Dog', 'Chinese_Crested_Dog',
       'Chocolate_Labrador_Retriever', 'Coonhound', 'Dutch_Shepherd',
       'English_Bulldog', 'English_Pointer', 'English_Springer_Spaniel',
       'Extra-Toes_Cat_(Hemingway_Polydactyl)', 'Field_Spaniel',
       'Flat-coated_Retriever', 'Foxhound', 'French_Bulldog', 'German_Spitz',
       'Glen_of_Imaal_Terrier', 'Greyhound', 'Irish_Terrier',
       'Irish_Wolfhound', 'Javanese', 'Kai_Dog', 'Kuvasz', 'Lancashire_Heeler',
       'Lhasa_Apso', 'Manchester_Terrier', 'Mountain_Dog',
       'Norwegian_Forest_Cat', 'Ocicat', 'Old_English_Sheepdog', 'Pixie-Bob',
       'Ragamuffin', 'Rat_Terrier', 'Scottish_Terrier_Scottie', 'Setter',
       'Somali', 'Sphynx_(hairless_cat)', 'Staffordshire_Bull_Terrier',
       'Standard_Poodle', 'Swedish_Vallhund', 'Toy_Fox_Terrier',
       'West_Highland_White_Terrier_Westie', 'Wheaten_Terrier', 'Whippet',
       'White_German_Shepherd']

d = dict.fromkeys(column2, 0)
df_2 = df_2.assign(**d)
df_2 = df_2.drop(['Pure'], axis=1)

Unnamed: 0,Abyssinian,Affenpinscher,Afghan_Hound,Akbash,Akita,American_Curl,American_Shorthair,American_Staffordshire_Terrier,Applehead_Siamese,Australian_Cattle_Dog/Blue_Heeler,...,Somali,Sphynx_(hairless_cat),Staffordshire_Bull_Terrier,Standard_Poodle,Swedish_Vallhund,Toy_Fox_Terrier,West_Highland_White_Terrier_Westie,Wheaten_Terrier,Whippet,White_German_Shepherd
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14989,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14990,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14991,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [561]:
column1 = ['Afghan_Hound', 'Akbash', 'Australian_Cattle_Dog/Blue_Heeler',
       'Bluetick_Coonhound', 'Border_Terrier', 'Finnish_Spitz',
       'German_Shorthaired_Pointer', 'Harrier', 'Munchkin', 'Norfolk_Terrier',
       'Selkirk_Rex', 'Smooth_Fox_Terrier']
df_1 = df_1.assign(**dict.fromkeys(column1, 0))
df_1 = df_1.drop(['Other'], axis=1)

Unnamed: 0,Abyssinian,Affenpinscher,Airedale_Terrier,Akita,American_Bulldog,American_Curl,American_Shorthair,American_Staffordshire_Terrier,American_Water_Spaniel,American_Wirehair,...,Australian_Cattle_Dog/Blue_Heeler,Bluetick_Coonhound,Border_Terrier,Finnish_Spitz,German_Shorthaired_Pointer,Harrier,Munchkin,Norfolk_Terrier,Selkirk_Rex,Smooth_Fox_Terrier
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14989,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14990,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14991,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [562]:
df_cob = df_1+df_2
df_cob

Unnamed: 0,Abyssinian,Affenpinscher,Afghan_Hound,Airedale_Terrier,Akbash,Akita,American_Bulldog,American_Curl,American_Shorthair,American_Staffordshire_Terrier,...,Tuxedo,Weimaraner,Welsh_Corgi,West_Highland_White_Terrier_Westie,Wheaten_Terrier,Whippet,White_German_Shepherd,Wirehaired_Terrier,Yellow_Labrador_Retriever,Yorkshire_Terrier_Yorkie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14989,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14990,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14991,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [563]:
sum_col = pd.DataFrame(df_cob.sum(axis=0))
sum_col
a = sum_col.sort_values([0],ascending=False)
a.head(15)

Unnamed: 0,0
Mixed_Breed,7654
Domestic_Short_Hair,4233
Domestic_Medium_Hair,1579
Tabby,480
Domestic_Long_Hair,421
Siamese,369
Persian,299
Labrador_Retriever,291
Terrier,253
Shih_Tzu,222


In [564]:
df_breed = df_cob[["Tabby","Domestic_Short_Hair", "Domestic_Medium_Hair",
                   "Labrador_Retriever","Terrier","Mixed_Breed"]]

df_breed['sum'] = df_breed.sum(axis=1)
df_breed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Tabby,Domestic_Short_Hair,Domestic_Medium_Hair,Labrador_Retriever,Terrier,Mixed_Breed,sum
0,1,0,0,0,0,0,1
1,0,0,1,0,0,0,1
2,0,0,0,0,0,1,1
3,0,0,0,0,0,1,1
4,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...
14988,0,1,0,0,0,0,1
14989,0,0,1,0,0,0,1
14990,0,1,1,0,0,0,2
14991,0,1,0,0,0,0,1


In [565]:
df_breed['Other'] = df_breed['sum'].apply(lambda x: 1 if x == 0  else 0)
df_breed = df_breed.drop(['sum'],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [566]:
df_final = pd.concat([df, df_breed], axis=1)

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Pure_BreedName,Breed1_name,Breed2_name,Tabby,Domestic_Short_Hair,Domestic_Medium_Hair,Labrador_Retriever,Terrier,Mixed_Breed,Other
0,2,Nibble,3,299,0,1,1,7,0,1,...,Tabby,Tabby,Pure,1,0,0,0,0,0,0
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,Domestic_Medium_Hair,Domestic_Medium_Hair,Pure,0,0,1,0,0,0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,Mixed_Breed,Mixed_Breed,Pure,0,0,0,0,0,1,0
3,1,Miko,4,307,0,2,1,2,0,2,...,Mixed_Breed,Mixed_Breed,Pure,0,0,0,0,0,1,0
4,1,Hunter,1,307,0,1,1,0,0,2,...,Mixed_Breed,Mixed_Breed,Pure,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14988,2,,2,266,0,3,1,0,0,2,...,Domestic_Short_Hair,Domestic_Short_Hair,Pure,0,1,0,0,0,0,0
14989,2,Serato & Eddie,60,265,264,3,1,4,7,2,...,Mixed_Breed,Domestic_Medium_Hair,Domestic_Long_Hair,0,0,1,0,0,0,0
14990,2,Monkies,2,265,266,3,5,6,7,3,...,Mixed_Breed,Domestic_Medium_Hair,Domestic_Short_Hair,0,1,1,0,0,0,0
14991,2,Ms Daym,9,266,0,2,4,7,0,1,...,Domestic_Short_Hair,Domestic_Short_Hair,Pure,0,1,0,0,0,0,0


In [569]:
df_final.loc[df_final['Tabby'] == 2, 'Tabby'] = 1
df_final.loc[df_final['Domestic_Short_Hair'] == 2, 'Domestic_Short_Hair'] = 1
df_final.loc[df_final['Domestic_Medium_Hair'] == 2, 'Domestic_Medium_Hair'] = 1
df_final.loc[df_final['Labrador_Retriever'] == 2, 'Labrador_Retriever'] = 1
df_final.loc[df_final['Terrier'] == 2, 'Terrier'] = 1
df_final.loc[df_final['Mixed_Breed'] == 2, 'Mixed_Breed'] = 1

In [571]:
df_final.to_csv('df_final.csv', index=False)

## Modeling

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


In [3]:
clean_features = pd.read_csv('clean_features.csv')
image = pd.read_csv('img_features.csv')

In [4]:
image.rename(columns={'Unnamed: 0': 'PetID'}, inplace=True)

In [5]:
clean_df = pd.merge(clean_features, image, on="PetID")

In [6]:
clean_df

Unnamed: 0,PetID,Type,Age,Gender,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,...,246,247,248,249,250,251,252,253,254,255
0,86e1089a3,2,3,1,1,1,2,2,2,1,...,0.787699,0.176626,0.575706,1.088628,0.439556,0.520460,1.547071,0.832573,0.599093,0.763348
1,6296e909a,2,1,1,2,2,3,3,3,1,...,0.628259,0.686865,0.564000,0.968190,1.070276,1.545742,0.894409,0.838595,0.468238,0.916672
2,3422e4906,1,1,1,2,2,1,1,2,1,...,0.579116,0.557625,1.131405,0.720513,1.496671,0.870955,1.289683,1.184462,0.465114,0.892826
3,5842f1ff5,1,4,2,2,1,1,1,2,1,...,1.295853,0.326143,0.291669,1.608086,1.119176,1.470889,0.591444,0.832755,0.483021,1.134126
4,850a43f90,1,1,1,2,1,2,2,2,1,...,1.092663,0.669894,0.395784,0.886075,1.219730,1.033966,1.065686,0.304054,0.438069,0.676817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14988,dc0935a84,2,2,3,2,2,2,2,2,1,...,1.236135,0.195013,0.346327,0.849923,1.388868,0.933430,0.977991,0.762702,0.239418,0.880098
14989,a01ab5b30,2,60,3,2,2,1,1,1,1,...,0.905649,0.281715,1.530005,1.329902,0.533165,1.057960,1.615979,0.942733,1.817291,0.461272
14990,d981b6395,2,2,3,3,2,2,1,3,1,...,0.793473,0.326840,0.928414,0.805518,0.943638,0.980733,1.558713,0.406569,0.963960,0.767038
14991,e4da1c9e4,2,9,2,1,1,1,1,1,1,...,0.384883,0.159538,1.090614,1.247143,0.290542,0.896352,1.413012,0.383327,1.020868,0.384756


In [7]:
clean_df = clean_df.set_index('PetID')

In [8]:
clean_df['AdoptionSpeed'].value_counts()

4    4197
2    4037
3    3259
1    3090
0     410
Name: AdoptionSpeed, dtype: int64

In [17]:
clean_new = clean_df
clean_new['AdoptionSpeed'].value_counts()

4    4197
2    4037
3    3259
1    3090
0     410
Name: AdoptionSpeed, dtype: int64

In [18]:
predictors = clean_new.drop(["AdoptionSpeed"],axis=1)
response = clean_new['AdoptionSpeed']

X_train, X_test, y_train, y_test = train_test_split(predictors, response, 
                                                    test_size=0.2, random_state=42)

In [19]:
print("Training set:", X_train.shape[0], "rows and", X_train.shape[1], "columns")
print("Test set:", X_test.shape[0], "rows and", X_test.shape[1], "column")

Training set: 11994 rows and 322 columns
Test set: 2999 rows and 322 column


In [22]:
# SMOTE
sm = SMOTE(random_state=42)
X_train,y_train = sm.fit_sample(X_train, y_train.ravel())

#standardScaler
sc = StandardScaler()

X_train = pd.DataFrame(sc.fit_transform(X_train))
X_test = pd.DataFrame(sc.transform(X_test))

In [38]:
# function to generate classification report
def class_report(trueclass, predclass,probclass):
    y_true = trueclass
    y_pred = predclass
    y_proba = probclass
    
    print("Accuracy:",accuracy_score(y_true, y_pred))
    print("Precision:",metrics.precision_score(y_true, y_pred, average = 'weighted'))
    print("Recall:",metrics.recall_score(y_true, y_pred, average = 'weighted'))
    print('F-1 Score:',metrics.f1_score(y_true, y_pred, average='weighted'))
    print('roc-auc Score', metrics.roc_auc_score(y_true, y_proba, multi_class='ovr'))
    
    cm = pd.DataFrame(confusion_matrix(y_true, y_pred))
    report = classification_report(y_true, y_pred)

    return print (cm, "\n", report)

## Random Forest Classifier + SMOTE + Random Search:

In [42]:
# tune parameter
param_grid = {'n_estimators': [250,500,1000],
              'max_features':np.arange(start=4, stop=10, step=2),
              'max_depth':np.arange(start=10, stop=20, step=2),
              'max_leaf_nodes':np.arange(start=4, stop=20, step=2),
              'min_samples_leaf':np.arange(start=2, stop=10, step=2),
              'min_samples_split':np.arange(start=2, stop=10, step=2),
              'random_state':[42]}

# create Random Forest model 
rf_obj = RandomForestClassifier()

# Create gridsearch object with various combinations of parameters
rf = RandomizedSearchCV(rf_obj, param_grid, cv = 5,refit = True, scoring='f1_weighted',
                       n_jobs=-1, verbose = 5)
rf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  6.7min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [43]:
print("Best model parameters from Grid Search:")
print(rf.best_params_)

Best model parameters from Grid Search:
{'random_state': 42, 'n_estimators': 250, 'min_samples_split': 4, 'min_samples_leaf': 6, 'max_leaf_nodes': 18, 'max_features': 8, 'max_depth': 16}


In [44]:
y_pred_proba = rf.best_estimator_.predict_proba(X_test)
y_pred = rf.best_estimator_.predict(X_test)
y_pred_proba = rf.best_estimator_.predict_proba(X_test)

print ('Report : ')
class_report(y_test, y_pred, y_pred_proba)

Report : 
Accuracy: 0.33344448149383127
Precision: 0.34305126450099505
Recall: 0.33344448149383127
F-1 Score: 0.3300147591157682
roc-auc Score 0.6315529877708788
     0    1    2    3    4
0   19   34    9   12   18
1  104  185  112  103  123
2   78  187  157  175  209
3   42  139   92  160  208
4   62  112  105   75  479 
               precision    recall  f1-score   support

           0       0.06      0.21      0.10        92
           1       0.28      0.30      0.29       627
           2       0.33      0.19      0.25       806
           3       0.30      0.25      0.27       641
           4       0.46      0.58      0.51       833

    accuracy                           0.33      2999
   macro avg       0.29      0.30      0.28      2999
weighted avg       0.34      0.33      0.33      2999



In [54]:
cv_f1 = np.mean(cross_val_score(rf, X_train, y_train.values.ravel(), 
                                cv=StratifiedKFold(n_splits=5, shuffle=True, 
                                                   random_state=42), scoring='f1_weighted', 
                                                   verbose=True, n_jobs=-1))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 23.6min finished


In [55]:
cv_f1

0.3797716683949

### overfitting

In [45]:
y_train = pd.DataFrame(y_train)
y_pred_train = rf.best_estimator_.predict(X_train)
y_pred_train_proba = rf.best_estimator_.predict_proba(X_train)

print ('Report : ')
class_report(y_train,y_pred_train,y_pred_train_proba)

Report : 
Accuracy: 0.44797859690844233
Precision: 0.41861300935205475
Recall: 0.44797859690844233
F-1 Score: 0.41789852052334886
roc-auc Score 0.751147515096546
      0    1    2     3     4
0  2937   64   34   185   144
1   970  900  389   531   574
2   456  711  636   724   837
3   557  542  366  1006   893
4   219  500  337   252  2056 
               precision    recall  f1-score   support

           0       0.57      0.87      0.69      3364
           1       0.33      0.27      0.30      3364
           2       0.36      0.19      0.25      3364
           3       0.37      0.30      0.33      3364
           4       0.46      0.61      0.52      3364

    accuracy                           0.45     16820
   macro avg       0.42      0.45      0.42     16820
weighted avg       0.42      0.45      0.42     16820



In [46]:
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb

seed = 42

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [60]:
class EnsembleModel:
    
    def __init__(self,balancing=False):
        self.balance_ratio = 5 if balancing else 1
        self.rf_model = RandomForestClassifier()
        self.lgb_model = lgb.LGBMClassifier()
        self.rand_forest_params= {
            #'bootstrap': [True, False],
            'n_estimators': [250, 500],
            'max_features':np.arange(start=4, stop=10, step=2),
            'max_depth':np.arange(start=10, stop=16, step=2),
            'max_leaf_nodes':np.arange(start=4, stop=10, step=2),
            'min_samples_leaf':[10,20],
            'random_state' : [seed]
        }
        
        self.lgb_params = {'boosting_type': ['gbdt'], 
                           'lambda_l1':[0.1],
                           'learning_rate': [0.1],
                           'subsample': [.8],
                           'colsample_bytree': [0.8],
                           'min_split_gain': [0.0],
                           'min_child_samples': [20],
                           'min_child_weight': [0.001],
                           'max_depth': [7],
                           'n_estimators': [100],
                           'num_leaves': [5],
                           'silent': [-1],
                           'verbose': [-1],
                           'max_depth': [11],
                           'random_state': [42]
        }
        
        self.rf_best_param = None
        self.lgb_best_param = None
        self.columns = None
    
    def set_param(self,rf_param,lgb_param):
        self.rf_best_param = rf_param
        self.lgb_best_param = lgb_param
    
    def tune_best_param(self,x_train,y_train):
        weights_train = [self.balance_ratio if i==0 else 1 for i in y_train.values.tolist()]
        rf_gridsearch = GridSearchCV(estimator = self.rf_model, 
                                      param_grid = self.rand_forest_params, 
                                      cv = 5, 
                                      n_jobs = -1, 
                                      verbose = 1, 
                                      scoring='f1_weighted')
        rf_gridsearch.fit(x_train, y_train, sample_weight = weights_train)
        print('tuning for rf finished')
        self.rf_model = rf_gridsearch.best_estimator_
        self.rf_best_param = rf_gridsearch.best_params_
        
        lgb_gridsearch = GridSearchCV(self.lgb_model, self.lgb_params, n_jobs=-1, cv=5,
                                      scoring='f1_weighted', verbose=1, refit=True)
        lgb_gridsearch.fit(x_train, y_train, sample_weight = weights_train)
        print('tuning for lgb finished')
        self.lgb_model = lgb_gridsearch.best_estimator_
        self.lgb_best_param = lgb_gridsearch.best_params_
        print('best param for rf is:')
        print(self.rf_best_param)
        print('best param for lgb is:')
        print(self.lgb_best_param)
    
    # let's try combining the 2 models together by averging
    def _avg(self,y_1,y_2):
        return np.rint((y_1 + y_2)/2.0).astype(int)

    def re_fit_with_best_param(self,X,y):
        if self.rf_best_param == None or self.lgb_best_param == None:
            print('use tune_best_param() method to get best param first')
            return
        weights_train = [self.balance_ratio if i==0 else 1 for i in y.values.tolist()]
        self.rf_model = RandomForestClassifier()
        self.lgb_model =  lgb.LGBMClassifier()
        self.rf_model.set_params(**self.rf_best_param)
        self.lgb_model.set_params(**self.lgb_best_param)
        self.rf_model.fit(X,y,sample_weight=weights_train)
        self.lgb_model.fit(X,y,sample_weight=weights_train)
        print('refit finished')

    def predict(self,test_X):
        rf_result = self.rf_model.predict(test_X)
        lgb_result = self.lgb_model.predict(test_X)
        final_result = self._avg(rf_result,lgb_result)
        return final_result
    
    def get_feature_importance(self):
        rf_feature_importances = pd.DataFrame({'Feature':self.columns.tolist(),'importance':self.rf_model.feature_importances_.tolist()})
        lgb_feature_importances = pd.DataFrame({'Feature':self.columns.tolist(),'importance':self.lgb_model.feature_importances_.tolist()})
        overall_feature_importance = pd.merge(rf_feature_importances, lgb_feature_importances, on='Feature', how='outer')
        overall_feature_importance['avg_importance'] = (overall_feature_importance['importance_x'] + overall_feature_importance['importance_y'])/2
        overall_feature_importance = overall_feature_importance.sort_values(by=['avg_importance'], ascending=False)
        return overall_feature_importance

In [61]:
first_model = EnsembleModel(balancing=True)
first_model.tune_best_param(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 27.9min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 35.5min finished
  self.best_estimator_.fit(X, y, **fit_params)


tuning for rf finished
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   28.6s finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tuning for lgb finished
best param for rf is:
{'max_depth': 10, 'max_features': 8, 'max_leaf_nodes': 8, 'min_samples_leaf': 20, 'n_estimators': 250, 'random_state': 42}
best param for lgb is:
{'boosting_type': 'gbdt', 'colsample_bytree': 0.8, 'lambda_l1': 0.1, 'learning_rate': 0.1, 'max_depth': 11, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'num_leaves': 5, 'random_state': 42, 'silent': -1, 'subsample': 0.8, 'verbose': -1}


In [62]:
first_model.re_fit_with_best_param(X_test,y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


refit finished


In [64]:
def report2(trueclass, predclass):
    y_true = trueclass
    y_pred = predclass
    
    print("Accuracy:",accuracy_score(y_true, y_pred))
    print("Precision:",metrics.precision_score(y_true, y_pred, average = 'weighted'))
    print("Recall:",metrics.recall_score(y_true, y_pred, average = 'weighted'))
    print('F-1 Score:',metrics.f1_score(y_true, y_pred, average='weighted'))
    
    cm = pd.DataFrame(confusion_matrix(y_true, y_pred))
    report = classification_report(y_true, y_pred)

    return print (cm, "\n", report)

In [65]:
print ('Report : ')
report2(y_test,first_model.predict(X_test))

Report : 
Accuracy: 0.4008002667555852
Precision: 0.30018379611045054
Recall: 0.4008002667555852
F-1 Score: 0.3222812885804634
   0   1    2    3    4
0  0  45   30    4   13
1  0   6  504   35   82
2  0   0  541  129  136
3  0   0  342   38  261
4  0   0   99  117  617 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        92
           1       0.12      0.01      0.02       627
           2       0.36      0.67      0.47       806
           3       0.12      0.06      0.08       641
           4       0.56      0.74      0.64       833

    accuracy                           0.40      2999
   macro avg       0.23      0.30      0.24      2999
weighted avg       0.30      0.40      0.32      2999



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [81]:
rf_best = RandomForestClassifier(max_depth=10, 
                                  max_features= 8, 
                                  max_leaf_nodes= 8, 
                                  min_samples_leaf= 20, 
                                  n_estimators= 250, 
                                  random_state=42)
rf_best.fit(X_train,y_train)

lgb_best = lgb.LGBMClassifier(boosting_type= 'gbdt', 
                               colsample_bytree= 0.8, 
                               lambda_l1= 0.1, 
                               learning_rate= 0.1, 
                               max_depth=11, 
                               min_child_samples= 20, 
                               min_child_weight= 0.001, 
                               min_split_gain= 0.0, 
                               n_estimators= 100, 
                               num_leaves= 5, 
                               random_state= 42, 
                               silent= -1, 
                               subsample= 0.8, 
                               verbose= -1)
lgb_best.fit(X_train,y_train)

  import sys
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
               importance_type='split', lambda_l1=0.1, learning_rate=0.1,
               max_depth=11, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=5,
               objective=None, random_state=42, reg_alpha=0.0, reg_lambda=0.0,
               silent=-1, subsample=0.8, subsample_for_bin=200000,
               subsample_freq=0, verbose=-1)

In [86]:
rf_auc = metrics.roc_auc_score(y_test, rf_best.predict_proba(X_test), 
                                             multi_class='ovr')
lgb_auc = metrics.roc_auc_score(y_test, lgb_best.predict_proba(X_test), 
                                             multi_class='ovr')
print('roc-auc Score', rf_auc)
print('roc-auc Score', lgb_auc)

print('average roc-auc Score', (rf_auc+lgb_auc)/2)

roc-auc Score 0.6210371229146908
roc-auc Score 0.6771879383844523
average roc-auc Score 0.6491125306495715


In [90]:
cv_f1_rf = np.mean(cross_val_score(rf_best, X_train, y_train.values.ravel(), 
                                cv=StratifiedKFold(n_splits=5, shuffle=True, 
                                                   random_state=42), scoring='f1_weighted', 
                                           
                                   verbose=True, n_jobs=-1))
cv_f1_rf

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   29.0s finished


0.3580600380512493

In [91]:
cv_f1_lgb = np.mean(cross_val_score(lgb_best, X_train, y_train.values.ravel(), 
                                cv=StratifiedKFold(n_splits=5, shuffle=True, 
                                                   random_state=42), scoring='f1_weighted', 
                                                   verbose=True, n_jobs=-1))
cv_f1_lgb

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   30.2s finished


0.45350374837366186

In [95]:
cv_f1_ensemble = (cv_f1_rf + cv_f1_lgb)/2
print(cv_f1_ensemble)

0.4057818932124556


#### overfitting

In [66]:
print ('Report : ')
report2(y_train,first_model.predict(X_train))

Report : 
Accuracy: 0.26866825208085615
Precision: 0.1895504655056187
Recall: 0.26866825208085615
F-1 Score: 0.18716971111797567
   0  1     2    3     4
0  0  1  2026  436   901
1  0  1  2240  384   739
2  0  2  1990  440   932
3  0  1  1775  412  1176
4  0  1   807  440  2116 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00      3364
           1       0.17      0.00      0.00      3364
           2       0.23      0.59      0.33      3364
           3       0.20      0.12      0.15      3364
           4       0.36      0.63      0.46      3364

    accuracy                           0.27     16820
   macro avg       0.19      0.27      0.19     16820
weighted avg       0.19      0.27      0.19     16820



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
