# Lazy Predict Playing

Playing with `lazypredict` ([GitHub](https://github.com/shankarpandala/lazypredict)) to see how well all the classic models do.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from pathlib import Path

In [None]:
# Apply seaborn's default plot styling to the matplotlib figures drawn later in the notebook.
sns.set()

In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be read
# from this Colab runtime.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install lazypredict



In [None]:
# Project folder on the mounted Google Drive. Both directories are derived from this one
# absolute root, so they resolve regardless of the notebook's current working directory
# and cannot drift apart.
PROJECT_DIR = Path("/content/drive/MyDrive/Work/Delivery/Current/Earthquake_damage")

# Input CSVs (train_values.csv, train_labels.csv).
DATA_DIR = PROJECT_DIR / "data"
# Output folder for submission files.
SUBMISSIONS_DIR = PROJECT_DIR / "submissions"

In [None]:
# Load the feature table and the label table from DATA_DIR, each indexed by building_id.
train_values, train_labels = (
    pd.read_csv(DATA_DIR / csv_name, index_col="building_id")
    for csv_name in ("train_values.csv", "train_labels.csv")
)

In [None]:
# How many rows a 30% share of the full training table amounts to.
0.3 * len(train_values)

78180.3

In [None]:
# Preview ten feature rows; the fixed random_state makes the same rows come back on every run.
train_values.sample(n=10, random_state=1)

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
360320,13,777,5562,2,5,5,4,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
479622,26,203,8953,2,10,9,6,t,r,n,f,q,s,d,0,1,0,0,0,0,1,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
541371,11,131,1488,2,25,7,6,t,r,n,f,x,o,d,0,0,0,0,1,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
429841,16,732,504,2,5,4,5,t,r,n,f,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
373726,3,1229,11320,3,40,3,9,t,r,n,f,q,j,d,1,0,0,0,1,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
830922,6,673,2685,1,0,8,3,t,i,x,v,j,s,d,0,0,0,0,0,0,0,0,0,1,0,v,0,0,0,0,0,0,0,0,0,0,0,0
800181,26,227,3596,2,20,6,5,t,w,n,f,x,s,d,0,1,0,0,0,0,1,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
413022,6,537,6733,2,40,11,6,t,r,q,f,q,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
441802,27,181,12406,4,40,12,9,t,r,n,x,q,t,d,0,0,0,0,1,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
8197,27,422,10326,2,25,5,7,t,r,q,f,x,t,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Preview ten label rows with the same n and random_state as the feature preview;
# the output lists the same building_ids, now with their damage_grade.
train_labels.sample(n=10, random_state=1)

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
360320,2
479622,1
541371,2
429841,3
373726,2
830922,1
800181,2
413022,2
441802,2
8197,3


In [None]:
# One-hot encode the features. Called without `columns`, pd.get_dummies expands only the
# object/string/category columns; numeric columns pass through unchanged.
# NOTE(review): this rebinds train_values, so the un-encoded frame is no longer
# available after this cell runs.
train_values = pd.get_dummies(train_values)

In [None]:
# Draw a reproducible 10% sample of the feature rows, then pick the labels for exactly
# those building_ids. Selecting labels by index keeps X and y aligned row-for-row even
# if the two tables are not stored in the same order; sampling each table separately
# with the same seed would only match rows by position.
# Assumes building_id is unique in train_labels — TODO confirm.
X_sample = train_values.sample(frac=0.1, random_state=1)
y_sample = train_labels.loc[X_sample.index]

In [None]:
# Check that features and labels cover the same building_ids in the same order;
# equal lengths alone would not reveal misaligned rows.
X_sample.index.equals(y_sample.index)

True

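The class balance is preserved after sampling (which is the most important thing): the full training labels and the sample show nearly identical class proportions in the two outputs below.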

In [None]:
# Class proportions of damage_grade over the full training labels
# (normalize=True gives fractions; sort=False skips ordering by frequency).
train_labels.value_counts(normalize=True, sort=False)

damage_grade
1              0.10
2              0.57
3              0.33
dtype: float64

In [None]:
# Class proportions of damage_grade within the sampled labels, for comparison with the full set.
y_sample.value_counts(normalize=True, sort=False)

damage_grade
1              0.10
2              0.57
3              0.34
dtype: float64

In [None]:
# These imports stay in this cell: lazypredict is installed by the pip cell, which runs
# after the notebook's top import cell.
# NOTE(review): f1_score is not used in this cell — kept in case later cells rely on it.
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from lazypredict.Supervised import LazyClassifier

# Hold out 30% of the sample for evaluation, stratified on the labels so both parts keep
# the same class proportions; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_sample, y_sample, test_size=0.3, random_state=123, stratify=y_sample
)

# No custom metric: only lazypredict's built-in scores are reported.
clf = LazyClassifier(verbose=0, custom_metric=None)

# fit() trains on the training part and evaluates on the held-out part; `models` is the
# per-model score table used in the following cell.
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Shown as the cell's last expression so the notebook renders every column of the table;
# print() elides the middle columns with "..." and hides most of the metrics.
models


  0%|          | 0/30 [00:00<?, ?it/s][A
  3%|▎         | 1/30 [00:00<00:23,  1.25it/s][A
  7%|▋         | 2/30 [00:01<00:21,  1.27it/s][A
 13%|█▎        | 4/30 [00:33<02:18,  5.32s/it][A
 23%|██▎       | 7/30 [00:33<01:26,  3.75s/it][A
 33%|███▎      | 10/30 [00:35<00:57,  2.86s/it][A
 40%|████      | 12/30 [00:43<00:57,  3.20s/it][A
 43%|████▎     | 13/30 [00:59<01:55,  6.79s/it][A
 47%|████▋     | 14/30 [01:24<03:16, 12.26s/it][A
 50%|█████     | 15/30 [01:24<02:09,  8.63s/it][A
 53%|█████▎    | 16/30 [01:32<01:59,  8.52s/it][A
 57%|█████▋    | 17/30 [01:33<01:19,  6.13s/it][A
 67%|██████▋   | 20/30 [01:33<00:43,  4.31s/it][A
 70%|███████   | 21/30 [01:33<00:27,  3.06s/it][A
 77%|███████▋  | 23/30 [01:35<00:17,  2.53s/it][A
 83%|████████▎ | 25/30 [01:36<00:08,  1.79s/it][A
 87%|████████▋ | 26/30 [01:36<00:05,  1.38s/it][A
 90%|█████████ | 27/30 [02:08<00:31, 10.43s/it][A
 97%|█████████▋| 29/30 [02:11<00:07,  7.83s/it][A
100%|██████████| 30/30 [02:12<00:00,  4.43s

                               Accuracy  ...  Time Taken
Model                                    ...            
BaggingClassifier                  0.67  ...        0.75
LGBMClassifier                     0.69  ...        1.28
RandomForestClassifier             0.68  ...        2.49
ExtraTreesClassifier               0.66  ...        2.21
DecisionTreeClassifier             0.62  ...        0.16
NearestCentroid                    0.46  ...        0.05
AdaBoostClassifier                 0.65  ...        0.80
XGBClassifier                      0.67  ...        3.53
ExtraTreeClassifier                0.57  ...        0.05
LabelSpreading                     0.58  ...       25.02
BernoulliNB                        0.57  ...        0.06
LabelPropagation                   0.57  ...       15.15
KNeighborsClassifier               0.59  ...        7.92
GaussianNB                         0.39  ...        0.07
LinearDiscriminantAnalysis         0.57  ...        0.16
PassiveAggressiveClassifier    




In [None]:
# Rank the fitted models by their "F1 Score" column, best first.
models.sort_values("F1 Score", ascending=False)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.69,0.58,,0.68,1.28
RandomForestClassifier,0.68,0.57,,0.67,2.49
BaggingClassifier,0.67,0.59,,0.66,0.75
ExtraTreesClassifier,0.66,0.57,,0.66,2.21
XGBClassifier,0.67,0.54,,0.64,3.53
AdaBoostClassifier,0.65,0.54,,0.63,0.8
DecisionTreeClassifier,0.62,0.57,,0.62,0.16
KNeighborsClassifier,0.59,0.5,,0.58,7.92
LabelSpreading,0.58,0.51,,0.58,25.02
LabelPropagation,0.57,0.51,,0.57,15.15


In [None]:
import lightgbm as lgb