In [27]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [28]:
df = pd.read_csv("../input/forest-fires-data-set/forestfires.csv")
df.head()

In [29]:
df.info()

In [30]:
df["area"].value_counts()

In [31]:
df["burn"] = 0

def set_burn(x):
    """Return the binary burn label for one record.

    ``x`` is indexed with the key ``"area"``. The record is labelled 1 when
    that value multiplied by 1000 is strictly greater than 2, otherwise 0.
    """
    scaled_area = x["area"] * 1000
    if scaled_area > 2:
        return 1
    return 0

# Label every row by applying set_burn row-wise (axis=1 hands each row to the function).
df["burn"] = df.apply(set_burn, axis=1)
df.head()

In [32]:
df.burn.value_counts()

In [33]:
df["burn"].value_counts()

In [34]:
sns.scatterplot(x=df["RH"], y=df["temp"], hue=df["burn"])

In [35]:
sns.scatterplot(x=df["X"], y=df["Y"], hue=df["burn"])

In [36]:
# df["wind_ms"] = df.wind * 1000/3600 

In [37]:
# distplot is deprecated in seaborn; a density-scaled histogram with a KDE
# overlay is the documented replacement for its default appearance.
sns.histplot(df.wind, kde=True, stat="density")

In [38]:
# Correlate only the numeric columns: calling corr() on a frame that still
# contains non-numeric columns raises in pandas >= 2.0.
df.select_dtypes(include="number").corr()["burn"]

In [39]:
df.columns

In [40]:
used_cols = ['temp', 'rain', 'wind','RH']#,"FFMC","DMC","DC","ISI"]

In [41]:
X = df[used_cols].copy()
y = df.burn.copy()

In [42]:
from sklearn.preprocessing import StandardScaler

# Keep a handle on the fitted scaler so the exact same transformation can be
# applied to new samples at inference time (and saved next to the model).
# NOTE(review): the scaler is fitted on every row before any train/test split
# or cross-validation, so held-out rows influence the scaling statistics;
# consider wrapping scaler + estimator in a sklearn Pipeline instead.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [43]:
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, train_test_split, cross_validate

In [44]:
from sklearn.linear_model import LogisticRegression

# Baseline: default logistic regression, scored with repeated stratified
# 5-fold cross-validation (2 repeats, fixed seed).
scoring = ['precision', 'recall', 'f1', 'accuracy']
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
mlog = LogisticRegression()
score = cross_validate(
    mlog,
    X,
    y,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
    return_train_score=False,
)

In [45]:
# Mean of each test metric across all cross-validation folds.
avg_precision, avg_recall, avg_f1, avg_accuracy = (
    score["test_" + metric].mean()
    for metric in ("precision", "recall", "f1", "accuracy")
)
print("avg_precision", avg_precision)
print("avg_recall", avg_recall)
print("avg_f1", avg_f1)

In [46]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=y, random_state=42)

In [47]:
import time
from sklearn.metrics import RocCurveDisplay, roc_curve, auc

def _positive_class_scores(model, features):
    """Return a continuous positive-class score for each row of ``features``.

    Uses ``predict_proba`` (second column) when the fitted estimator exposes
    it, otherwise ``decision_function``, and only falls back to the hard
    ``predict`` labels when neither is available.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(features)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(features)
    return model.predict(features)


def test(model):
    """Fit ``model`` on the hold-out split, time prediction and plot its ROC curve.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Fitted in place on ``X_train``/``y_train``.

    Returns
    -------
    float
        Seconds taken by ``model.predict(X_test)``.
    """
    model.fit(X_train, y_train)

    # perf_counter is a monotonic, high-resolution clock suited to short intervals.
    start = time.perf_counter()
    model.predict(X_test)
    inf_time = time.perf_counter() - start
    print("Inference Time", inf_time)

    # A ROC curve needs continuous scores; hard 0/1 predictions collapse it to
    # a single operating point. Scoring happens outside the timed region.
    y_score = _positive_class_scores(model, X_test)
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                              estimator_name='validation estimator')
    display.plot()
    plt.show()
    return inf_time
    
def test_fold(model):
    """Cross-validate ``model`` on the module-level ``X``/``y`` and report mean scores.

    Uses repeated stratified 5-fold cross-validation (2 repeats, seed 42) and
    prints the mean precision, recall and F1 across folds.

    Returns
    -------
    tuple
        ``(avg_precision, avg_recall, avg_f1, avg_accuracy)``.
    """
    metric_names = ['precision', 'recall', 'f1', 'accuracy']
    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
    results = cross_validate(
        model, X, y,
        scoring=metric_names, cv=splitter, n_jobs=-1, return_train_score=False,
    )

    means = {metric: results["test_" + metric].mean() for metric in metric_names}

    # Accuracy is returned but, as before, not printed.
    for metric in ('precision', 'recall', 'f1'):
        print("avg_" + metric, means[metric])

    return means['precision'], means['recall'], means['f1'], means['accuracy']
    
def test_many(model_dict):
    data = {}
    data['name'] = []
    data['precision'] = []
    data['f1'] = []
    data['recall'] = []
    data['accuracy'] = []
    data['time'] = []
    for name, model in model_dict.items():
        data['name'].append(name)
        x = test_fold(model)
        data['precision'].append(x[0])
        data['recall'].append(x[1])
        data['f1'].append(x[2])
        data['accuracy'].append(x[3])
        data['time'].append(test(model))
        
    
    df = pd.DataFrame.from_dict(data)
    return df
        
        

In [48]:
from sklearn.metrics import classification_report

mlog = LogisticRegression()
# mlog.fit(X_train, y_train)

test(mlog)
test_fold(mlog)

In [49]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

penalty = ['l2']
# 20 candidate values for C, log-spaced between 1e-4 and 1e4 (base 10).
# Pass only start/stop/num: any further positional arguments would be taken
# as `endpoint` and `base` and silently change the grid.
C = np.logspace(-4, 4, 20)
# Bundle the search space into a dictionary
hyperparameters = dict(penalty=penalty, C=C)
# Create the logistic regression estimator
logreg = LogisticRegression()
# Hand it to the grid search
# (cv = cross-validation; 10-fold is used here)
clf = GridSearchCV(logreg, hyperparameters, cv=10)
# Fit the search; fit() returns the GridSearchCV object itself
best_model = clf.fit(X,y)
# Best hyperparameter values found
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])
# # Predict with the new model
# y_pred = best_model.predict(X_test)
# # Check the model's performance
# print(classification_report(y_test, y_pred))

test(best_model)
test_fold(best_model)

# roc_auc_score(y_test, y_pred)

In [50]:
from sklearn.svm import SVC

svc = SVC()
test(svc)
test_fold(svc)

In [51]:
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

xgb = XGBClassifier()
# test_fold(xgb)
# test(xgb)

In [52]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
test_fold(rf)
test(rf)

In [53]:
models = {
    "LogisticRegression": best_model,
    "RandomForest": rf,
    "XGBoost": xgb,
    "SVC": svc,
}

res_df = test_many(models)
res_df.head()

In [62]:
# Reshape the results into long format (one row per model/metric pair) so
# seaborn can group the bars by metric.
metric_columns = (('Recall', 'recall'), ('F1', 'f1'))
tmp = {'Score': [], 'Metric': [], 'Name': []}
for _, row in res_df.iterrows():
    for label, column in metric_columns:
        tmp['Name'].append(row['name'])
        tmp['Metric'].append(label)
        tmp['Score'].append(row[column])

tmp_df = pd.DataFrame.from_dict(tmp)
plot1 = sns.barplot(data=tmp_df, x='Name', y='Score', hue='Metric')
fig = plot1.get_figure()
fig.savefig("plot1.png", dpi=600)
plot1

In [67]:
# Add display-friendly copies of the timing and name columns for axis labels.
res_df['Time(s)'] = res_df['time']
res_df['Name'] = res_df['name']
plot2 = sns.barplot(data=res_df, x='Name', y='Time(s)')
# for p in plot2.patches:
#     plot2.annotate(format(p.get_height(), '.1f'), 
#                    (p.get_x() + p.get_width() / 2., p.get_height()), 
#                    ha = 'center', va = 'center', 
#                    xytext = (0, 9), 
#                    textcoords = 'offset points')

fig = plot2.get_figure()
fig.savefig("plot2.png", dpi=400)
plot2

In [56]:
import pickle

# Use a context manager so the file is flushed and closed even if dump() fails.
# NOTE: the model was trained on standardized features, so whoever loads this
# file must apply the same scaling before calling predict.
with open("logistic.pkl", 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [57]:
# The models were trained on standardized features, so a raw sample must go
# through the same scaling first. Refitting on df[used_cols] reproduces the
# statistics used for the training matrix.
inference_scaler = StandardScaler().fit(df[used_cols])
raw_sample = pd.DataFrame([[29.6, 0, 3.204, 44.200001]], columns=used_cols)
rf.predict_proba(inference_scaler.transform(raw_sample))