# Machine Learning Final Project
# Pet Adoption Prediction

### Author: Siran Fang, Jiaying Du, Yanan Wu

The goal of this project is predicting the speed at which a pet is adopted, based on the pet’s listing on PetFinder. Sometimes a profile represents a group of pets. In this case, the speed of adoption is determined by the speed at which all of the pets are adopted. The data included text, tabular, and image data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

## Import Data

In [550]:
df = pd.read_csv('train.csv')
breed = pd.read_csv('breed_labels.csv')

### Name

In [551]:
df['Unnamed/YN'] = 0
df.loc[df['Name'] == 'No Name Yet', 'Unnamed/YN'] = 1
df.loc[df['Name'] == 'No Name', 'Unnamed/YN'] = 1
df.loc[df['Name'] == 'Unknown', 'Unnamed/YN'] = 1
df.loc[df['Name'].isnull(), 'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("Name",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("name",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("Unknown",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.len() == 3, 'Unnamed/YN'] = 1
df.loc[df['Name'].str.len() == 2, 'Unnamed/YN'] = 1
df.loc[df['Name'].str.len() == 1, 'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("Puppies",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("Doggie",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("Kitty",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("Kittens",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("!",na = False),'Unnamed/YN'] = 1
df.loc[df['Name'].str.contains("Zone",na = False),'Unnamed/YN'] = 1

In [552]:
df['Unnamed/YN'].value_counts()

0    11966
1     3027
Name: Unnamed/YN, dtype: int64

### Pure_Breeds

In [553]:
# Pure_Breed
df['Pure_Breed/YN'] = 0
df.loc[df['Breed2'] == 0, 'Pure_Breed/YN'] = 1
df.loc[df['Breed1'] == 307, 'Pure_Breed/YN'] = 0

print(f"Rate of pure breed pets in train data: {df['Pure_Breed/YN'].sum() * 100 / df['Pure_Breed/YN'].shape[0]:.4f}%.")

Rate of pure breed pets in train data: 41.5327%.


In [554]:
# Pure_BreedScore
df['Pure_BreedScore'] = 0
df.loc[df['Pure_Breed/YN'] == 1, 'Pure_BreedScore'] = df['Breed1']

In [555]:
breeds_dict = {k: v for k, v in zip(breed['BreedID'], breed['BreedName'])}

In [556]:
df['Pure_BreedName'] = df['Pure_BreedScore'].apply(lambda x: '_'.join(breeds_dict[x].split()) 
                                                   if x in breeds_dict else 'Mixed_Breed')

In [557]:
a = df['Pure_BreedName'].value_counts()
a.head(10)

Mixed_Breed             8766
Domestic_Short_Hair     3058
Domestic_Medium_Hair     984
Tabby                    247
Domestic_Long_Hair       178
Shih_Tzu                 149
Poodle                   121
Siamese                  120
Golden_Retriever          94
Persian                   76
Name: Pure_BreedName, dtype: int64

In [558]:
df['Breed1_name'] = df['Breed1'].apply(lambda x: '_'.join(breeds_dict[x].split()) 
                                                   if x in breeds_dict else 'Other')
df['Breed2_name'] = df['Breed2'].apply(lambda x: '_'.join(breeds_dict[x].split()) 
                                                   if x in breeds_dict else 'Pure')

In [559]:
df_1 = pd.get_dummies(df['Breed1_name'])
df_2 = pd.get_dummies(df['Breed2_name'])

In [560]:
a = df_1.columns.difference(df_2.columns)
column2 = ['Airedale_Terrier', 'American_Bulldog', 'American_Water_Spaniel',
       'American_Wirehair', 'Australian_Kelpie', 'Bedlington_Terrier',
       'Black_Labrador_Retriever', 'Black_Mouth_Cur', 'Boston_Terrier',
       'Burmilla', 'Cattle_Dog', 'Chinese_Crested_Dog',
       'Chocolate_Labrador_Retriever', 'Coonhound', 'Dutch_Shepherd',
       'English_Bulldog', 'English_Pointer', 'English_Springer_Spaniel',
       'Extra-Toes_Cat_(Hemingway_Polydactyl)', 'Field_Spaniel',
       'Flat-coated_Retriever', 'Foxhound', 'French_Bulldog', 'German_Spitz',
       'Glen_of_Imaal_Terrier', 'Greyhound', 'Irish_Terrier',
       'Irish_Wolfhound', 'Javanese', 'Kai_Dog', 'Kuvasz', 'Lancashire_Heeler',
       'Lhasa_Apso', 'Manchester_Terrier', 'Mountain_Dog',
       'Norwegian_Forest_Cat', 'Ocicat', 'Old_English_Sheepdog', 'Pixie-Bob',
       'Ragamuffin', 'Rat_Terrier', 'Scottish_Terrier_Scottie', 'Setter',
       'Somali', 'Sphynx_(hairless_cat)', 'Staffordshire_Bull_Terrier',
       'Standard_Poodle', 'Swedish_Vallhund', 'Toy_Fox_Terrier',
       'West_Highland_White_Terrier_Westie', 'Wheaten_Terrier', 'Whippet',
       'White_German_Shepherd']

d = dict.fromkeys(column2, 0)
df_2 = df_2.assign(**d)
df_2 = df_2.drop(['Pure'], axis=1)

Unnamed: 0,Abyssinian,Affenpinscher,Afghan_Hound,Akbash,Akita,American_Curl,American_Shorthair,American_Staffordshire_Terrier,Applehead_Siamese,Australian_Cattle_Dog/Blue_Heeler,...,Somali,Sphynx_(hairless_cat),Staffordshire_Bull_Terrier,Standard_Poodle,Swedish_Vallhund,Toy_Fox_Terrier,West_Highland_White_Terrier_Westie,Wheaten_Terrier,Whippet,White_German_Shepherd
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14989,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14990,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14991,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [561]:
column1 = ['Afghan_Hound', 'Akbash', 'Australian_Cattle_Dog/Blue_Heeler',
       'Bluetick_Coonhound', 'Border_Terrier', 'Finnish_Spitz',
       'German_Shorthaired_Pointer', 'Harrier', 'Munchkin', 'Norfolk_Terrier',
       'Selkirk_Rex', 'Smooth_Fox_Terrier']
df_1 = df_1.assign(**dict.fromkeys(column1, 0))
df_1 = df_1.drop(['Other'], axis=1)

Unnamed: 0,Abyssinian,Affenpinscher,Airedale_Terrier,Akita,American_Bulldog,American_Curl,American_Shorthair,American_Staffordshire_Terrier,American_Water_Spaniel,American_Wirehair,...,Australian_Cattle_Dog/Blue_Heeler,Bluetick_Coonhound,Border_Terrier,Finnish_Spitz,German_Shorthaired_Pointer,Harrier,Munchkin,Norfolk_Terrier,Selkirk_Rex,Smooth_Fox_Terrier
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14989,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14990,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14991,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [562]:
df_cob = df_1+df_2
df_cob

Unnamed: 0,Abyssinian,Affenpinscher,Afghan_Hound,Airedale_Terrier,Akbash,Akita,American_Bulldog,American_Curl,American_Shorthair,American_Staffordshire_Terrier,...,Tuxedo,Weimaraner,Welsh_Corgi,West_Highland_White_Terrier_Westie,Wheaten_Terrier,Whippet,White_German_Shepherd,Wirehaired_Terrier,Yellow_Labrador_Retriever,Yorkshire_Terrier_Yorkie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14989,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14990,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14991,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [563]:
sum_col = pd.DataFrame(df_cob.sum(axis=0))
sum_col
a = sum_col.sort_values([0],ascending=False)
a.head(15)

Unnamed: 0,0
Mixed_Breed,7654
Domestic_Short_Hair,4233
Domestic_Medium_Hair,1579
Tabby,480
Domestic_Long_Hair,421
Siamese,369
Persian,299
Labrador_Retriever,291
Terrier,253
Shih_Tzu,222


In [564]:
df_breed = df_cob[["Tabby","Domestic_Short_Hair", "Domestic_Medium_Hair",
                   "Labrador_Retriever","Terrier","Mixed_Breed"]]

df_breed['sum'] = df_breed.sum(axis=1)
df_breed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Tabby,Domestic_Short_Hair,Domestic_Medium_Hair,Labrador_Retriever,Terrier,Mixed_Breed,sum
0,1,0,0,0,0,0,1
1,0,0,1,0,0,0,1
2,0,0,0,0,0,1,1
3,0,0,0,0,0,1,1
4,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...
14988,0,1,0,0,0,0,1
14989,0,0,1,0,0,0,1
14990,0,1,1,0,0,0,2
14991,0,1,0,0,0,0,1


In [565]:
df_breed['Other'] = df_breed['sum'].apply(lambda x: 1 if x == 0  else 0)
df_breed = df_breed.drop(['sum'],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [566]:
df_final = pd.concat([df, df_breed], axis=1)

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Pure_BreedName,Breed1_name,Breed2_name,Tabby,Domestic_Short_Hair,Domestic_Medium_Hair,Labrador_Retriever,Terrier,Mixed_Breed,Other
0,2,Nibble,3,299,0,1,1,7,0,1,...,Tabby,Tabby,Pure,1,0,0,0,0,0,0
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,Domestic_Medium_Hair,Domestic_Medium_Hair,Pure,0,0,1,0,0,0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,Mixed_Breed,Mixed_Breed,Pure,0,0,0,0,0,1,0
3,1,Miko,4,307,0,2,1,2,0,2,...,Mixed_Breed,Mixed_Breed,Pure,0,0,0,0,0,1,0
4,1,Hunter,1,307,0,1,1,0,0,2,...,Mixed_Breed,Mixed_Breed,Pure,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14988,2,,2,266,0,3,1,0,0,2,...,Domestic_Short_Hair,Domestic_Short_Hair,Pure,0,1,0,0,0,0,0
14989,2,Serato & Eddie,60,265,264,3,1,4,7,2,...,Mixed_Breed,Domestic_Medium_Hair,Domestic_Long_Hair,0,0,1,0,0,0,0
14990,2,Monkies,2,265,266,3,5,6,7,3,...,Mixed_Breed,Domestic_Medium_Hair,Domestic_Short_Hair,0,1,1,0,0,0,0
14991,2,Ms Daym,9,266,0,2,4,7,0,1,...,Domestic_Short_Hair,Domestic_Short_Hair,Pure,0,1,0,0,0,0,0


In [569]:
df_final.loc[df_final['Tabby'] == 2, 'Tabby'] = 1
df_final.loc[df_final['Domestic_Short_Hair'] == 2, 'Domestic_Short_Hair'] = 1
df_final.loc[df_final['Domestic_Medium_Hair'] == 2, 'Domestic_Medium_Hair'] = 1
df_final.loc[df_final['Labrador_Retriever'] == 2, 'Labrador_Retriever'] = 1
df_final.loc[df_final['Terrier'] == 2, 'Terrier'] = 1
df_final.loc[df_final['Mixed_Breed'] == 2, 'Mixed_Breed'] = 1

In [571]:
df_final.to_csv('df_final.csv', index=False)

## Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.preprocessing import StandardScaler

Using TensorFlow backend.


In [2]:
clean_df = pd.read_csv('clean_features.csv')

In [3]:
clean_df.head()

Unnamed: 0,PetID,Type,Age,Gender,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,...,10,11,12,13,14,15,16,17,18,19
0,86e1089a3,2,3,1,1,1,2,2,2,1,...,0.009754,-0.002826,-0.018925,-0.012456,0.000584,-0.014139,-0.00755,0.018987,0.009588,0.009349
1,6296e909a,2,1,1,2,2,3,3,3,1,...,-0.010259,0.008178,0.010272,0.003087,-0.001423,-0.005038,-0.003081,0.009825,0.003104,0.006718
2,3422e4906,1,1,1,2,2,1,1,2,1,...,0.044125,-0.050633,0.041106,0.008243,0.024158,-0.030801,0.012797,-0.050066,-0.023363,-0.057766
3,5842f1ff5,1,4,2,2,1,1,1,2,1,...,-0.000498,0.044688,-0.03498,0.002213,-0.094938,0.060292,0.026551,-0.040345,-0.087958,2.2e-05
4,850a43f90,1,1,1,2,1,2,2,2,1,...,-0.009017,0.015145,0.099046,0.026728,0.011302,0.002314,0.000365,0.017789,0.06642,0.038391


In [4]:
clean_df = clean_df.set_index('PetID')

In [5]:
# function to generate classification report
def class_report(trueclass, predclass):
    y_true = trueclass["AdoptionSpeed"]
    y_pred = predclass
    
    print("Accuracy:",accuracy_score(y_true, y_pred))
    print("Precision:",metrics.precision_score(y_true, y_pred, average = 'weighted'))
    print("Recall:",metrics.recall_score(y_true, y_pred, average = 'weighted'))
    
    cm = pd.DataFrame(confusion_matrix(y_true, y_pred))
    report = classification_report(y_true, y_pred)

    return print (cm, "\n", report)

In [6]:
clean_df['AdoptionSpeed'].value_counts()

4    4197
2    4037
3    3259
1    3090
0     410
Name: AdoptionSpeed, dtype: int64

In [7]:
clean_new = clean_df

In [8]:
from sklearn.utils import shuffle
clean_new = shuffle(clean_new)
clean_new['AdoptionSpeed'].value_counts()

4    4197
2    4037
3    3259
1    3090
0     410
Name: AdoptionSpeed, dtype: int64

In [9]:
predictors = clean_new.drop(["AdoptionSpeed"],axis=1)
response = clean_new['AdoptionSpeed']

X_train, X_test, y_train, y_test = train_test_split(predictors, response, 
                                                    test_size=0.2, random_state=42)

In [19]:
print("Training set:", X_train.shape[0], "rows and", X_train.shape[1], "columns")
print("Test set:", X_test.shape[0], "rows and", X_test.shape[1], "column")

Training set: 11994 rows and 66 columns
Test set: 2999 rows and 66 column


### Random Forest base + SMOTE

In [12]:
# apply SMOTE
smt = SMOTE(sampling_strategy="all",random_state=42)
X_train_s, y_train_s = smt.fit_sample(X_train, y_train)

# fit model
rf_s = RandomForestClassifier(random_state = 42, n_jobs=-1)
rf_s.fit(X_train_s, y_train_s)
y_pred_rf_s = rf_s.predict(X_test)

In [14]:
y_train_s.value_counts()

4    3381
3    3381
2    3381
1    3381
0    3381
Name: AdoptionSpeed, dtype: int64

In [23]:
y_test = pd.DataFrame(y_test)

In [25]:
print ('Report : ')
class_report(y_test,y_pred_rf_s)

Report : 
Accuracy: 0.41380460153384463
Precision: 0.4042877548826818
Recall: 0.41380460153384463
    0    1    2    3    4
0   8   30   18   12   21
1  12  253  191   65  117
2   7  195  310  123  182
3   6  111  175  172  175
4   7  112  122   77  498 
               precision    recall  f1-score   support

           0       0.20      0.09      0.12        89
           1       0.36      0.40      0.38       638
           2       0.38      0.38      0.38       817
           3       0.38      0.27      0.32       639
           4       0.50      0.61      0.55       816

    accuracy                           0.41      2999
   macro avg       0.37      0.35      0.35      2999
weighted avg       0.40      0.41      0.40      2999



#### OVERFITTNG

In [36]:
y_pred_rf_train_s = rf_s.predict(X_train_s)
y_train_s = pd.DataFrame(y_train_s)
print ('Report : ')
class_report(y_train_s,y_pred_rf_train_s)

Report : 
Accuracy: 0.9994084590357882
Precision: 0.9994084590254385
Recall: 0.9994084590357882
      0     1     2     3     4
0  3381     0     0     0     0
1     0  3379     1     0     1
2     0     1  3377     3     0
3     1     0     2  3378     0
4     0     0     1     0  3380 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3381
           1       1.00      1.00      1.00      3381
           2       1.00      1.00      1.00      3381
           3       1.00      1.00      1.00      3381
           4       1.00      1.00      1.00      3381

    accuracy                           1.00     16905
   macro avg       1.00      1.00      1.00     16905
weighted avg       1.00      1.00      1.00     16905



### Random Forest base no SMOTE

In [37]:
rf = RandomForestClassifier(random_state = 42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

  


In [38]:
print ('Report : ')
class_report(y_test,y_pred_rf)

Report : 
Accuracy: 0.4208069356452151
Precision: 0.4219393516390862
Recall: 0.4208069356452151
   0    1    2    3    4
0  5   21   25    4   34
1  1  214  227   59  137
2  0  152  348  101  216
3  1   82  215  144  197
4  0   78  138   49  551 
               precision    recall  f1-score   support

           0       0.71      0.06      0.10        89
           1       0.39      0.34      0.36       638
           2       0.37      0.43      0.39       817
           3       0.40      0.23      0.29       639
           4       0.49      0.68      0.56       816

    accuracy                           0.42      2999
   macro avg       0.47      0.34      0.34      2999
weighted avg       0.42      0.42      0.40      2999



#### OVERFITTING

In [39]:
y_pred_rf_train = rf.predict(X_train)
print ('Report : ')
class_report(y_train,y_pred_rf_train)

Report : 
Accuracy: 0.9991662497915624
Precision: 0.9991664132882834
Recall: 0.9991662497915624
     0     1     2     3     4
0  320     0     0     1     0
1    0  2450     1     0     1
2    0     1  3215     3     1
3    0     0     2  2618     0
4    0     0     0     0  3381 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       321
           1       1.00      1.00      1.00      2452
           2       1.00      1.00      1.00      3220
           3       1.00      1.00      1.00      2620
           4       1.00      1.00      1.00      3381

    accuracy                           1.00     11994
   macro avg       1.00      1.00      1.00     11994
weighted avg       1.00      1.00      1.00     11994



### combining SMOTE and Balanced Random Forest

In [40]:
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import SMOTE
import imblearn

In [41]:
clf1 = imblearn.ensemble.BalancedRandomForestClassifier(n_estimators = 200, 
                                                        max_depth=20, random_state=42)
clf1.fit(X_train_s, y_train_s)
pred = clf1.predict(X_test)

  This is separate from the ipykernel package so we can avoid doing imports until


In [43]:
print ('Report : ')
class_report(y_test,pred)

Report : 
Accuracy: 0.41580526842280763
Precision: 0.40315187854645956
Recall: 0.41580526842280763
    0    1    2    3    4
0   9   25   21    6   28
1  10  257  194   56  121
2   5  200  292  116  204
3   7  113  165  147  207
4   9   95  114   56  542 
               precision    recall  f1-score   support

           0       0.23      0.10      0.14        89
           1       0.37      0.40      0.39       638
           2       0.37      0.36      0.36       817
           3       0.39      0.23      0.29       639
           4       0.49      0.66      0.57       816

    accuracy                           0.42      2999
   macro avg       0.37      0.35      0.35      2999
weighted avg       0.40      0.42      0.40      2999



#### OVERFITTING

In [44]:
y_pred = clf1.predict(X_train_s)

In [45]:
print ('Report : ')
class_report(y_train_s,y_pred)

Report : 
Accuracy: 0.990002957704821
Precision: 0.990140238531784
Recall: 0.990002957704821
      0     1     2     3     4
0  3376     1     0     2     2
1    40  3322     8     9     2
2    19     6  3335    20     1
3    42     1     9  3329     0
4     0     1     2     4  3374 
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      3381
           1       1.00      0.98      0.99      3381
           2       0.99      0.99      0.99      3381
           3       0.99      0.98      0.99      3381
           4       1.00      1.00      1.00      3381

    accuracy                           0.99     16905
   macro avg       0.99      0.99      0.99     16905
weighted avg       0.99      0.99      0.99     16905



### combining SMOTETomek and Random Forest

In [73]:
sme = SMOTETomek(random_state=42)
X_train_st, y_train_st = sme.fit_resample(X_train, y_train)

clf2 = RandomForestClassifier(n_estimators=500, max_depth=30,
                             random_state=42)
clf2.fit(X_train_st, y_train_st)
pred = clf2.predict(X_test)
smotetomek = metrics.cohen_kappa_score(y_test, pred)

  


In [74]:
print ('Report : ')
class_report(y_test,pred)

Report : 
Accuracy: 0.42014004668222743
Precision: 0.40849530624770813
Recall: 0.42014004668222743
    0    1    2    3    4
0   9   30   12   11   27
1  10  273  174   62  119
2  10  213  281  112  201
3   4  123  158  158  196
4   7  109   95   66  539 
               precision    recall  f1-score   support

           0       0.23      0.10      0.14        89
           1       0.36      0.43      0.39       638
           2       0.39      0.34      0.37       817
           3       0.39      0.25      0.30       639
           4       0.50      0.66      0.57       816

    accuracy                           0.42      2999
   macro avg       0.37      0.36      0.35      2999
weighted avg       0.41      0.42      0.41      2999



### overfitting

In [75]:
y_pred_st = clf2.predict(X_train_s)
print ('Report : ')
class_report(y_train_s,y_pred_st)

Report : 
Accuracy: 0.9138124815143449
Precision: 0.914361658567132
Recall: 0.9138124815143449
      0     1     2     3     4
0  3374     2     0     1     4
1    20  3099   116    52    94
2    14   170  2883   121   193
3    21   115    86  2978   181
4     7   105    98    57  3114 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      3381
           1       0.89      0.92      0.90      3381
           2       0.91      0.85      0.88      3381
           3       0.93      0.88      0.90      3381
           4       0.87      0.92      0.89      3381

    accuracy                           0.91     16905
   macro avg       0.91      0.91      0.91     16905
weighted avg       0.91      0.91      0.91     16905



## Random Forest Classifier - Grid Search:

In [69]:
param_grid = {'n_estimators': [500,1000],
              'max_features':np.arange(start=4, stop=10, step=2),
              'max_depth':np.arange(start=10, stop=20, step=2),
              'random_state':[42]}

# create Random Forest model 
rf_obj = RandomForestClassifier()

# Create gridsearch object with various combinations of parameters
rf_grid = GridSearchCV(rf_obj, param_grid, cv = 5,refit = True, 
                       n_jobs=-1, verbose = 5)
rf_grid.fit(X_train, y_train.values.ravel())

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 19.8min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [76]:
print("Best model parameters from Grid Search:")
print(rf_grid.best_params_)

Best model parameters from Grid Search:
{'max_depth': 16, 'max_features': 8, 'n_estimators': 1000, 'random_state': 42}


In [77]:
grid_pred = rf_grid.best_estimator_.predict(X_test)
print ('Report : ')
class_report(y_test,grid_pred)

Report : 
Accuracy: 0.43147715905301764
Precision: 0.43890736492883475
Recall: 0.43147715905301764
   0    1    2    3    4
0  2   21   28    1   37
1  1  190  259   31  157
2  0  112  401   71  233
3  0   69  241  109  220
4  0   51  143   30  592 
               precision    recall  f1-score   support

           0       0.67      0.02      0.04        89
           1       0.43      0.30      0.35       638
           2       0.37      0.49      0.42       817
           3       0.45      0.17      0.25       639
           4       0.48      0.73      0.58       816

    accuracy                           0.43      2999
   macro avg       0.48      0.34      0.33      2999
weighted avg       0.44      0.43      0.40      2999



### overfitting

In [78]:
grid_pred_train = rf_grid.best_estimator_.predict(X_train)
print ('Report : ')
class_report(y_train,grid_pred_train)

Report : 
Accuracy: 0.976154744038686
Precision: 0.9766888001073081
Recall: 0.976154744038686
     0     1     2     3     4
0  242    19    35     3    22
1    0  2388    19     0    45
2    0     2  3176     3    39
3    0     5    34  2543    38
4    0     3    18     1  3359 
               precision    recall  f1-score   support

           0       1.00      0.75      0.86       321
           1       0.99      0.97      0.98      2452
           2       0.97      0.99      0.98      3220
           3       1.00      0.97      0.98      2620
           4       0.96      0.99      0.98      3381

    accuracy                           0.98     11994
   macro avg       0.98      0.94      0.96     11994
weighted avg       0.98      0.98      0.98     11994



### Random Forest Classifier + SMOTE - Grid Search:

In [79]:
param_grid2 = {'n_estimators': [500,800,1000],
              'max_features':[4,6,8],
              'max_depth':[6,8,10,12],
              'random_state':[42]}

# create Random Forest model 
rf_obj2 = RandomForestClassifier()

# Create gridsearch object with various combinations of parameters
rf_grid2 = GridSearchCV(rf_obj2, param_grid2, cv = 5,refit = True, 
                       n_jobs=-1, verbose = 5)
rf_grid2.fit(X_train_s, y_train_s.values.ravel())

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  5.4min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [80]:
print("Best model parameters from Grid Search:")
print(rf_grid2.best_params_)

Best model parameters from Grid Search:
{'max_depth': 12, 'max_features': 6, 'n_estimators': 500, 'random_state': 300}


In [81]:
grid_pred2 = rf_grid2.best_estimator_.predict(X_test)
print ('Report : ')
class_report(y_test,grid_pred2)

Report : 
Accuracy: 0.41147049016338777
Precision: 0.4048983159617692
Recall: 0.41147049016338777
    0    1    2    3    4
0  10   28   12    5   34
1  20  260  194   32  132
2  28  194  294   84  217
3  12  116  186  112  213
4  20  102  102   34  558 
               precision    recall  f1-score   support

           0       0.11      0.11      0.11        89
           1       0.37      0.41      0.39       638
           2       0.37      0.36      0.37       817
           3       0.42      0.18      0.25       639
           4       0.48      0.68      0.57       816

    accuracy                           0.41      2999
   macro avg       0.35      0.35      0.34      2999
weighted avg       0.40      0.41      0.39      2999



### overfitting

In [82]:
grid_pred_train2 = rf_grid2.best_estimator_.predict(X_train)
print ('Report : ')
class_report(y_train,grid_pred_train2)

Report : 
Accuracy: 0.9187093546773387
Precision: 0.9217543673259585
Recall: 0.9187093546773387
     0     1     2     3     4
0  262     9    22     5    23
1    8  2201    99    23   121
2   29    10  2977    34   170
3   17    30    60  2339   174
4   21    39    66    15  3240 
               precision    recall  f1-score   support

           0       0.78      0.82      0.80       321
           1       0.96      0.90      0.93      2452
           2       0.92      0.92      0.92      3220
           3       0.97      0.89      0.93      2620
           4       0.87      0.96      0.91      3381

    accuracy                           0.92     11994
   macro avg       0.90      0.90      0.90     11994
weighted avg       0.92      0.92      0.92     11994



In [89]:
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb

seed = 42

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [122]:
class EnsembleModel:
    
    def __init__(self,balancing=False):
        self.balance_ratio = 5 if balancing else 1
        self.rf_model = RandomForestClassifier()
        self.lgb_model = lgb.LGBMClassifier()
        self.rand_forest_params= {
            'bootstrap': [True, False],
            'max_depth': [30,50],
            'min_samples_leaf': [20, 30],
            'min_samples_split': [10,15],
            'n_estimators': [250,300],
            'random_state' : [seed]
        }
        self.lgb_params = {'objective' : ['multi:softprob'],
              'eta' : [0.01],
              'max_depth' : [7,8],
              'num_class' : [5],
              'num_leaves':[50],
              'lambda' : [0.75],
              'reg_alpha':[1e-5, 1e-2],
              'silent': [1]
        }
        self.rf_best_param = None
        self.lgb_best_param = None
        self.columns = None
    
    #def set_scorer(self,kappa):
        #self.kappa = kappa
        #self.scorer = make_scorer(kappa)
        
    def set_param(self,rf_param,lgb_param):
        self.rf_best_param = rf_param
        self.lgb_best_param = lgb_param
    
    def tune_best_param(self,x_train,y_train):
        weights_train = [self.balance_ratio if i==0 else 1 for i in y_train.values.tolist()]
        rf_gridsearch = GridSearchCV(estimator = self.rf_model, 
                                      param_grid = self.rand_forest_params, 
                                      cv = 10, 
                                      n_jobs = -1, 
                                      verbose = 1) 
                                      #scoring=self.scorer)
        rf_gridsearch.fit(x_train, y_train, sample_weight = weights_train)
        print('tuning for rf finished')
        self.rf_model = rf_gridsearch.best_estimator_
        self.rf_best_param = rf_gridsearch.best_params_
        
        lgb_gridsearch = GridSearchCV(self.lgb_model, self.lgb_params, n_jobs=-1, 
                   cv=10, 
                   #scoring=self.scorer,
                   verbose=1, refit=True)
        lgb_gridsearch.fit(x_train, y_train, sample_weight = weights_train)
        print('tuning for lgb finished')
        self.lgb_model = lgb_gridsearch.best_estimator_
        self.lgb_best_param = lgb_gridsearch.best_params_
        print('best param for rf is:')
        print(self.rf_best_param)
        print('best param for lgb is:')
        print(self.lgb_best_param)
    
    # let's try combining the 2 models together by averging
    def _avg(self,y_1,y_2):
        return np.rint((y_1 + y_2)/2.0).astype(int)

    def re_fit_with_best_param(self,X,y):
        if self.rf_best_param == None or self.lgb_best_param == None:
            print('use tune_best_param() method to get best param first')
            return
        weights_train = [self.balance_ratio if i==0 else 1 for i in y.values.tolist()]
        self.rf_model = RandomForestClassifier()
        self.lgb_model =  lgb.LGBMClassifier()
        self.rf_model.set_params(**self.rf_best_param)
        self.lgb_model.set_params(**self.lgb_best_param)
        self.rf_model.fit(X,y,sample_weight=weights_train)
        self.lgb_model.fit(X,y,sample_weight=weights_train)
        print('refit finished')
    
    def validate(self,x_valid, y_valid):
        rf_score = self.kappa(self.rf_model.predict(x_valid), y_valid)
        print('{} score: {}'.format('rf', round(rf_score, 4)))
        lgb_score = self.kappa(self.lgb_model.predict(x_valid), y_valid)
        print('{} score: {}'.format('lgb', round(lgb_score, 4)))
        score = kappa(self._avg(self.lgb_model.predict(x_valid), self.rf_model.predict(x_valid)) , y_valid)
        print('{} score on validation set: {}'.format('combiner', round(score, 4)))
        self.columns = x_valid.columns

    def predict(self,test_X):
        rf_result = self.rf_model.predict(test_X)
        lgb_result = self.lgb_model.predict(test_X)
        final_result = self._avg(rf_result,lgb_result)
        return final_result
    
    def get_feature_importance(self):
        rf_feature_importances = pd.DataFrame({'Feature':self.columns.tolist(),'importance':self.rf_model.feature_importances_.tolist()})
        lgb_feature_importances = pd.DataFrame({'Feature':self.columns.tolist(),'importance':self.lgb_model.feature_importances_.tolist()})
        overall_feature_importance = pd.merge(rf_feature_importances, lgb_feature_importances, on='Feature', how='outer')
        overall_feature_importance['avg_importance'] = (overall_feature_importance['importance_x'] + overall_feature_importance['importance_y'])/2
        overall_feature_importance = overall_feature_importance.sort_values(by=['avg_importance'], ascending=False)
        return overall_feature_importance

In [123]:
first_model = EnsembleModel(balancing=True)
first_model.tune_best_param(X_train, y_train)

Fitting 10 folds for each of 32 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed: 23.6min finished
  self.best_estimator_.fit(X, y, **fit_params)


tuning for rf finished
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.6min finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tuning for lgb finished
best param for rf is:
{'bootstrap': False, 'max_depth': 30, 'min_samples_leaf': 20, 'min_samples_split': 10, 'n_estimators': 250, 'random_state': 42}
best param for lgb is:
{'eta': 0.01, 'lambda': 0.75, 'max_depth': 7, 'num_class': 5, 'num_leaves': 50, 'objective': 'multi:softprob', 'reg_alpha': 1e-05, 'silent': 1}


In [124]:
first_model.re_fit_with_best_param(X_test,y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


refit finished


In [127]:
print ('Report : ')
class_report(y_test,first_model.predict(X_test))

Report : 
Accuracy: 0.7375791930643548
Precision: 0.7587532536360831
Recall: 0.7375791930643548
    0    1    2    3    4
0  14   30   45    0    0
1   0  452  183    1    2
2   0    5  698  109    5
3   0    2  157  317  163
4   0    0   35   50  731 
               precision    recall  f1-score   support

           0       1.00      0.16      0.27        89
           1       0.92      0.71      0.80       638
           2       0.62      0.85      0.72       817
           3       0.66      0.50      0.57       639
           4       0.81      0.90      0.85       816

    accuracy                           0.74      2999
   macro avg       0.80      0.62      0.64      2999
weighted avg       0.76      0.74      0.73      2999



#### overfitting

In [128]:
print ('Report : ')
class_report(y_train,first_model.predict(X_train))

Report : 
Accuracy: 0.37385359346339836
Precision: 0.3705715322025633
Recall: 0.37385359346339836
   0    1     2    3     4
0  5   41   148   35    92
1  4  400  1285  255   508
2  0  343  1631  447   799
3  0  181  1182  430   827
4  0  125   864  374  2018 
               precision    recall  f1-score   support

           0       0.56      0.02      0.03       321
           1       0.37      0.16      0.23      2452
           2       0.32      0.51      0.39      3220
           3       0.28      0.16      0.21      2620
           4       0.48      0.60      0.53      3381

    accuracy                           0.37     11994
   macro avg       0.40      0.29      0.28     11994
weighted avg       0.37      0.37      0.35     11994



In [129]:
def add_meta_feature(path,df):
    vertex_xs = []
    vertex_ys = []
    bounding_confidences = []
    bounding_importance_fracs = []
    dominant_blues = []
    dominant_greens = []
    dominant_reds = []
    dominant_pixel_fracs = []
    dominant_scores = []
    label_descriptions = []
    label_scores = []
    nf_count = 0
    nl_count = 0
    pet_id = df['PetID']
    for pet in pet_id:
        try:
            with open(path + pet + '-1.json', 'r') as f:
                data = json.load(f)
            vertex_x = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['x']
            vertex_xs.append(vertex_x)
            vertex_y = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['y']
            vertex_ys.append(vertex_y)
            bounding_confidence = data['cropHintsAnnotation']['cropHints'][0]['confidence']
            bounding_confidences.append(bounding_confidence)
            bounding_importance_frac = data['cropHintsAnnotation']['cropHints'][0].get('importanceFraction', -1)
            bounding_importance_fracs.append(bounding_importance_frac)
            dominant_blue = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['blue']
            dominant_blues.append(dominant_blue)
            dominant_green = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['green']
            dominant_greens.append(dominant_green)
            dominant_red = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['red']
            dominant_reds.append(dominant_red)
            dominant_pixel_frac = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['pixelFraction']
            dominant_pixel_fracs.append(dominant_pixel_frac)
            dominant_score = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['score']
            dominant_scores.append(dominant_score)
            if data.get('labelAnnotations'):
                label_description = data['labelAnnotations'][0]['description']
                label_descriptions.append(label_description)
                label_score = data['labelAnnotations'][0]['score']
                label_scores.append(label_score)
            else:
                nl_count += 1
                label_descriptions.append('nothing')
                label_scores.append(-1)
        except FileNotFoundError:
            nf_count += 1
            vertex_xs.append(-1)
            vertex_ys.append(-1)
            bounding_confidences.append(-1)
            bounding_importance_fracs.append(-1)
            dominant_blues.append(-1)
            dominant_greens.append(-1)
            dominant_reds.append(-1)
            dominant_pixel_fracs.append(-1)
            dominant_scores.append(-1)
            label_descriptions.append('nothing')
            label_scores.append(-1)
    print(nf_count)
    print(nl_count)
    df.loc[:, 'vertex_x'] = vertex_xs
    df.loc[:, 'vertex_y'] = vertex_ys
    df.loc[:, 'bounding_confidence'] = bounding_confidences
    df.loc[:, 'bounding_importance'] = bounding_importance_fracs
    df.loc[:, 'dominant_blue'] = dominant_blues
    df.loc[:, 'dominant_green'] = dominant_greens
    df.loc[:, 'dominant_red'] = dominant_reds
    df.loc[:, 'dominant_pixel_frac'] = dominant_pixel_fracs
    df.loc[:, 'dominant_score'] = dominant_scores
    df.loc[:, 'label_description'] = label_descriptions
    df.loc[:, 'label_score'] = label_scores
#     df = df.drop(['label_description'])
    return df

### Gradient Descent Mini-Batch

In [304]:
t0, t1 = 10, 1000
def learning_schedule(t):
    return t0 / (t + t1)

In [353]:
def gradient_des(batch_size):
    theta_path_mgd = []
    
    # learning rate assigned
    #eta_assigned = 0.1
    m = len(X_train)

    n_iterations = 50
    minibatch_size = batch_size

    np.random.seed(42)
    theta = np.random.randn(66,1)  # random initialization
    t = 0

    for epoch in range(n_iterations):
        shuffled_indices = np.random.permutation(m)
        X_b_shuffled = np.array(X_train)[shuffled_indices]
        y_shuffled = np.array(y_train)[shuffled_indices]
        for i in range(0, m, minibatch_size):
            t += 1
            xi = X_b_shuffled[i:i+minibatch_size]
            yi = y_shuffled[i:i+minibatch_size]
            gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
            eta = learning_schedule(t)
            theta = theta - eta * gradients
            theta_path_mgd.append(theta)
    
    # fitted X_train and predict X_test
    y_fitted = X_train.dot(theta)
    y_pred = X_test.dot(theta)
    
    return y_fitted, y_pred

In [354]:
# test mean squared error
    mse_test = mean_squared_error(y_test, y_pred).round(4)
    # test r-squared
    r2_test = r2_score(y_test, y_pred).round(4)
    # test explained variance
    explained_var_test = explained_variance_score(y_test, y_pred).round(4) * 100
    # combine test metrics
    test_metrics = (mse_test, r2_test, explained_var_test)
    
    # train mean squared error
    mse_train = mean_squared_error(y_train, y_fitted).round(4)
    # train r-squared
    r2_train = r2_score(y_train, y_fitted).round(4)
    # train explained variance
    explained_var_train = explained_variance_score(y_train, y_fitted).round(4) * 100
    # combine train metrics
    train_metrics = (mse_train, r2_train, explained_var_train)
    
    model_metrics = pd.DataFrame({"Metric": pd.Series(["MSE", "R2", "Exp_variance"]),
                                  "Train": list(train_metrics),"Test": list(test_metrics)})

IndentationError: unexpected indent (<ipython-input-354-26e1098193f3>, line 2)

In [356]:
# when mini batch size is 50
print('Metrics and scatterplot are shown below when mini-batch size is')
print(gradient_des(200))

Metrics and scatterplot are shown below when mini-batch size is
(array([[nan],
       [nan],
       [nan],
       ...,
       [nan],
       [nan],
       [nan]]), array([[nan],
       [nan],
       [nan],
       ...,
       [nan],
       [nan],
       [nan]]))


In [321]:
def hypothesis(X, theta): 
    return np.dot(X, theta) 
  
# function to compute gradient of error function w.r.t. theta 
def gradient(X, y, theta): 
    h = hypothesis(X, theta) 
    grad = np.dot(X.transpose(), (h - y)) 
    return grad 
  
# function to compute the error for current values of theta 
def cost(X, y, theta): 
    h = hypothesis(X, theta) 
    J = np.dot((h - y).transpose(), (h - y)) 
    J /= 2
    return J[0] 
  
# function to create a list containing mini-batches 
def create_mini_batches(X, y, batch_size): 
    mini_batches = [] 
    data = np.hstack((X, y)) 
    np.random.shuffle(data) 
    n_minibatches = data.shape[0] // batch_size 
    i = 0
  
    for i in range(n_minibatches + 1): 
        mini_batch = data[i * batch_size:(i + 1)*batch_size, :] 
        X_mini = mini_batch[:, :-1] 
        Y_mini = mini_batch[:, -1].reshape((-1, 1)) 
        mini_batches.append((X_mini, Y_mini)) 
    if data.shape[0] % batch_size != 0: 
        mini_batch = data[i * batch_size:data.shape[0]] 
        X_mini = mini_batch[:, :-1] 
        Y_mini = mini_batch[:, -1].reshape((-1, 1)) 
        mini_batches.append((X_mini, Y_mini)) 
    return mini_batches 
  
# function to perform mini-batch gradient descent 
def gradientDescent(X, y, learning_rate = 0.001, batch_size = 32): 
    theta = np.zeros((X.shape[1], 1)) 
    error_list = [] 
    max_iters = 3
    for itr in range(max_iters): 
        mini_batches = create_mini_batches(X, y, batch_size) 
        for mini_batch in mini_batches: 
            X_mini, y_mini = mini_batch 
            theta = theta - learning_rate * gradient(X_mini, y_mini, theta) 
            error_list.append(cost(X_mini, y_mini, theta)) 
  
    return theta, error_list

In [366]:
X = X_train
y = y_train

In [367]:
def sigmoid(z):
    return 1/ (1 + np.exp(-z))
# testing the sigmoid function
sigmoid(0)


0.5

In [368]:
def costFunction(theta, X, y):
    """
    Takes in numpy array theta, x and y and return the logistic regression cost function and gradient
    """
    
    m=len(y)
    
    predictions = sigmoid(np.dot(X,theta))
    error = (-y * np.log(predictions)) - ((1-y)*np.log(1-predictions))
    cost = 1/m * sum(error)
    
    grad = 1/m * np.dot(X.transpose(),(predictions - y))
    
    return cost[0] , grad

In [369]:
def featureNormalization(X_train):
    """
    Take in numpy array of X values and return normalize X values,
    the mean and standard deviation of each feature
    """
    mean=np.mean(X,axis=0)
    std=np.std(X,axis=0)
    
    X_norm = (X - mean)/std
    
    return X_norm , mean , std

In [373]:
m , n = X.shape[0], X.shape[1]
#X, X_mean, X_std = featureNormalization(X)
X= np.append(np.ones((m,1)),X,axis=1)
y = y.reshape(m,1)
initial_theta = np.zeros((n+1,1))
cost, grad= costFunction(initial_theta,X,y)
print("Cost of initial theta is",cost)
print("Gradient at initial theta (zeros):",grad)


Cost of initial theta is nan
Gradient at initial theta (zeros): [[nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]]


In [365]:
def gradientDescent(X_train,y_train,theta,alpha,num_iters):
    """
    Take in numpy array X, y and theta and update theta by taking num_iters gradient steps
    with learning rate of alpha
    
    return theta and the list of the cost of theta during each iteration
    """
    
    m=len(y)
    J_history =[]
    
    for i in range(num_iters):
        cost, grad = costFunction(theta,X,y)
        theta = theta - (alpha * grad)
        J_history.append(cost)
    
    return theta , J_history