<a href="https://colab.research.google.com/github/devtico/sismic/blob/master/sismica.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install plotly==4.3.0

import pandas as pd
import os
import numpy as np
import plotly.express as px
import plotly.offline as py
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import scale
from sklearn.manifold import TSNE
import sklearn.metrics as metrics

In [None]:
# Read the data rows of the remote .arff file as plain CSV. The first 154
# lines are skipped and no header row is parsed, so the column names are
# supplied explicitly.
raw_data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00266/seismic-bumps.arff",
    skiprows=154,
    header=None,
    names=[
        "seismic", "seismoacoustic", "shift", "genergy",
        "gpuls", "gdenergy", "gdpuls", "ghazard",
        "nbumps", "nbumps2", "nbumps3", "nbumps4", "nbumps5",
        "nbumps6", "nbumps7", "nbumps89", "energy",
        "maxenergy", "class",
    ],
)

# Análise exploratória

In [None]:
# Show the loaded frame as the cell output.
raw_data

Unnamed: 0,seismic,seismoacoustic,shift,genergy,gpuls,gdenergy,gdpuls,ghazard,nbumps,nbumps2,nbumps3,nbumps4,nbumps5,nbumps6,nbumps7,nbumps89,energy,maxenergy,class
0,a,a,N,15180,48,-72,-72,a,0,0,0,0,0,0,0,0,0,0,0
1,a,a,N,14720,33,-70,-79,a,1,0,1,0,0,0,0,0,2000,2000,0
2,a,a,N,8050,30,-81,-78,a,0,0,0,0,0,0,0,0,0,0,0
3,a,a,N,28820,171,-23,40,a,1,0,1,0,0,0,0,0,3000,3000,0
4,a,a,N,12640,57,-63,-52,a,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2579,b,a,W,81410,785,432,151,b,0,0,0,0,0,0,0,0,0,0,0
2580,b,a,W,42110,555,213,118,a,0,0,0,0,0,0,0,0,0,0,0
2581,b,a,W,26960,540,101,112,a,0,0,0,0,0,0,0,0,0,0,0
2582,a,a,W,16130,322,2,2,a,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Summary statistics; include='all' also covers the non-numeric columns.
raw_data.describe(include='all')

Unnamed: 0,seismic,seismoacoustic,shift,genergy,gpuls,gdenergy,gdpuls,ghazard,nbumps,nbumps2,nbumps3,nbumps4,nbumps5,nbumps6,nbumps7,nbumps89,energy,maxenergy,class
count,2584,2584,2584,2584.0,2584.0,2584.0,2584.0,2584,2584.0,2584.0,2584.0,2584.0,2584.0,2584.0,2584.0,2584.0,2584.0,2584.0,2584.0
unique,2,3,2,,,,,3,,,,,,,,,,,
top,a,a,W,,,,,a,,,,,,,,,,,
freq,1682,1580,1663,,,,,2342,,,,,,,,,,,
mean,,,,90242.52,538.579334,12.375774,4.508901,,0.85952,0.393576,0.392802,0.067724,0.004644,0.0,0.0,0.0,4975.270898,4278.850619,0.065789
std,,,,229200.5,562.652536,80.319051,63.166556,,1.364616,0.783772,0.76971,0.279059,0.068001,0.0,0.0,0.0,20450.833222,19357.454882,0.247962
min,,,,100.0,2.0,-96.0,-96.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,11660.0,190.0,-37.0,-36.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,,,25485.0,379.0,-6.0,-6.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,,,52832.5,669.0,38.0,30.25,,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2600.0,2000.0,0.0


In [None]:
# Pairwise scatter-plot matrix of raw_data, drawn 1000 px tall.
px.scatter_matrix(raw_data, height=1000)

## Pré-processamento
Selecionando somente as colunas úteis

In [None]:
# Columns kept for modelling. Compared with raw_data, this leaves out
# nbumps6, nbumps7, nbumps89 and maxenergy.
cols = [
    "seismic", "seismoacoustic", "shift", "genergy",
    "gpuls", "gdenergy", "gdpuls", "ghazard",
    "nbumps", "nbumps2", "nbumps3", "nbumps4", "nbumps5",
    "energy", "class",
]
data_sub = raw_data.loc[:, cols]

In [None]:
# Show the column subset as the cell output.
data_sub

Unnamed: 0,seismic,seismoacoustic,shift,genergy,gpuls,gdenergy,gdpuls,ghazard,nbumps,nbumps2,nbumps3,nbumps4,nbumps5,energy,class
0,a,a,N,15180,48,-72,-72,a,0,0,0,0,0,0,0
1,a,a,N,14720,33,-70,-79,a,1,0,1,0,0,2000,0
2,a,a,N,8050,30,-81,-78,a,0,0,0,0,0,0,0
3,a,a,N,28820,171,-23,40,a,1,0,1,0,0,3000,0
4,a,a,N,12640,57,-63,-52,a,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2579,b,a,W,81410,785,432,151,b,0,0,0,0,0,0,0
2580,b,a,W,42110,555,213,118,a,0,0,0,0,0,0,0
2581,b,a,W,26960,540,101,112,a,0,0,0,0,0,0,0
2582,a,a,W,16130,322,2,2,a,0,0,0,0,0,0,0


In [None]:
# Disabled: parallel-categories plot of data_sub over the selected columns.
#px.parallel_categories(data_sub, cols, height=500)

# Transformando variáveis categóricas em numéricas

In [None]:
# One-hot encode the categorical predictors; the target column "class" is
# removed first. drop_first=True omits the first level of each encoded column.
data_processed = pd.get_dummies(
    data_sub.drop(columns="class"),
    drop_first=True,
)
data_processed

Unnamed: 0,genergy,gpuls,gdenergy,gdpuls,nbumps,nbumps2,nbumps3,nbumps4,nbumps5,energy,seismic_b,seismoacoustic_b,seismoacoustic_c,shift_W,ghazard_b,ghazard_c
0,15180,48,-72,-72,0,0,0,0,0,0,0,0,0,0,0,0
1,14720,33,-70,-79,1,0,1,0,0,2000,0,0,0,0,0,0
2,8050,30,-81,-78,0,0,0,0,0,0,0,0,0,0,0,0
3,28820,171,-23,40,1,0,1,0,0,3000,0,0,0,0,0,0
4,12640,57,-63,-52,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2579,81410,785,432,151,0,0,0,0,0,0,1,0,0,1,1,0
2580,42110,555,213,118,0,0,0,0,0,0,1,0,0,1,0,0
2581,26960,540,101,112,0,0,0,0,0,0,1,0,0,1,0,0
2582,16130,322,2,2,0,0,0,0,0,0,0,0,0,1,0,0


# Dividindo os dados em treino e teste

In [None]:
# 80/20 train/test split, stratified on the target so that both partitions
# keep the class proportions; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    data_processed,
    data_sub["class"],
    test_size=0.2,
    stratify=data_sub["class"],
    random_state=1234,
)

In [None]:
# Baseline random forest: 50 trees, at least 50 samples per leaf, balanced
# class weights and out-of-bag scoring enabled. fit() returns the estimator
# itself, so the fitted model is bound directly to modelo_basico.
modelo_basico = RandomForestClassifier(
    n_estimators=50,
    min_samples_leaf=50,
    min_impurity_decrease=0.01,
    class_weight="balanced",
    oob_score=True,
    random_state=1234,
).fit(x_train, y_train)
modelo_basico

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.01,
                       min_impurity_split=None, min_samples_leaf=50,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=50, n_jobs=None, oob_score=True,
                       random_state=1234, verbose=0, warm_start=False)

Importância relativa das variáveis

In [None]:
# Pair each encoded feature with its importance in the fitted forest and
# list them from most to least important.
imp = pd.DataFrame(
    {
        "feature": data_processed.columns,
        "importance": modelo_basico.feature_importances_,
    }
)
imp.sort_values(by="importance", ascending=False)

Unnamed: 0,feature,importance
0,genergy,0.243705
4,nbumps,0.186941
9,energy,0.124785
1,gpuls,0.115469
13,shift_W,0.101904
6,nbumps3,0.091881
5,nbumps2,0.073332
2,gdenergy,0.032697
3,gdpuls,0.016948
10,seismic_b,0.009195


Desempenho do modelo:

In [None]:
# Predicted labels for the held-out set, then the fraction predicted correctly.
y_pred = modelo_basico.predict(x_test)
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

0.7524177949709865

In [None]:
# Confusion matrix as a labelled table: rows are true classes, columns are
# predicted classes.
pd.DataFrame(
    data=metrics.confusion_matrix(y_test, y_pred),
    index=["True_0", "True_1"],
    columns=["Pred_0", "Pred_1"],
)

Unnamed: 0,Pred_0,Pred_1
True_0,361,122
True_1,6,28


**Precision**: percentual de acertos entre as observações preditas como positivas

In [None]:
# Precision of the test-set predictions.
metrics.precision_score(y_true=y_test, y_pred=y_pred)

0.18666666666666668

**Recall**: percentual dos positivos reais que foram preditos corretamente

In [None]:
# Recall of the test-set predictions.
metrics.recall_score(y_true=y_test, y_pred=y_pred)

0.8235294117647058

# Curva ROC

In [None]:
# Second column of predict_proba for the test rows.
prob_abalo = modelo_basico.predict_proba(x_test)[:, 1]

# ROC points plus the thresholds that produced them; each threshold is turned
# into a "Corte: <value>" label (rounded to 3 decimals) for the hover tooltip.
falso_pos, verd_pos, cortes = metrics.roc_curve(y_test, prob_abalo)
cortes = pd.Series(["Corte: " + str(np.round(corte, 3)) for corte in cortes])

px.line(
    x=falso_pos,
    y=verd_pos,
    hover_name=cortes,
    labels={"x": "Falsos positivos", "y": "Verdadeiros positivos"},
    height=300,
)

# Visualização 2D do modelo


In [None]:
# Columns that visually appear to be the most important.
colunas_viz = [
    "genergy", "energy", "gdenergy", "gpuls", "gdpuls",
    "nbumps", "nbumps2", "nbumps3",
]

# Standardize the selected columns, since they come in different units and
# orders of magnitude.
data_viz = scale(data_processed[colunas_viz])

# Fit t-SNE on the standardized columns and keep the projected coordinates.
model_viz_2d = TSNE(perplexity=100, random_state=1234, verbose=1)
data_proj_2d = model_viz_2d.fit_transform(data_viz)


[t-SNE] Computing 301 nearest neighbors...
[t-SNE] Indexed 2584 samples in 0.008s...
[t-SNE] Computed neighbors for 2584 samples in 0.244s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2584
[t-SNE] Computed conditional probabilities for sample 2000 / 2584
[t-SNE] Computed conditional probabilities for sample 2584 / 2584
[t-SNE] Mean sigma: 0.324904
[t-SNE] KL divergence after 250 iterations with early exaggeration: 58.498825
[t-SNE] KL divergence after 1000 iterations: 0.570423


In [None]:
# Second column of predict_proba for every row of the encoded data set.
y_prob_viz = modelo_basico.predict_proba(data_processed)[:, 1]

# Wide plotting table: t-SNE coordinates, standardized features, the true
# class as a string, and the probability multiplied by 5.
# NOTE(review): the factor 5 presumably puts "prob" on the same 0-5 range that
# "Value" is later capped to for the shared color scale - confirm.
data_plot_2d = pd.concat(
    [
        pd.DataFrame(data_proj_2d, columns=["X", "Y"]),
        pd.DataFrame(data_viz, columns=colunas_viz),
        raw_data[["class"]].astype(str),
        pd.DataFrame({"prob": y_prob_viz * 5}),
    ],
    axis=1,
)

In [None]:
# Show the plotting table as the cell output.
data_plot_2d

Unnamed: 0,X,Y,genergy,energy,gdenergy,gpuls,gdpuls,nbumps,nbumps2,nbumps3,class,prob
0,37.141788,16.654100,-0.327561,-0.243327,-1.050711,-0.872073,-1.211459,-0.629984,-0.502253,-0.510423,0,1.002571
1,-5.267431,-34.980053,-0.329568,-0.145512,-1.025805,-0.898738,-1.322299,0.102965,-0.502253,0.789019,0,1.900295
2,38.729160,17.055405,-0.358675,-0.243327,-1.162786,-0.904071,-1.306465,-0.629984,-0.502253,-0.510423,0,0.978873
3,0.585536,-27.362318,-0.268038,-0.096605,-0.440526,-0.653424,0.561974,0.102965,-0.502253,0.789019,0,2.101058
4,31.977600,14.336788,-0.338645,-0.243327,-0.938636,-0.856075,-0.894775,-0.629984,-0.502253,-0.510423,0,0.958129
...,...,...,...,...,...,...,...,...,...,...,...,...
2579,-15.435766,9.559308,-0.038544,-0.243327,5.225478,0.438047,2.319573,-0.629984,-0.502253,-0.510423,0,2.579686
2580,-12.254629,8.823896,-0.210042,-0.243327,2.498325,0.029190,1.797044,-0.629984,-0.502253,-0.510423,0,2.512377
2581,-8.114150,9.312488,-0.276155,-0.243327,1.103616,0.002525,1.702038,-0.629984,-0.502253,-0.510423,0,2.163791
2582,11.924936,13.560340,-0.323415,-0.243327,-0.129207,-0.385000,-0.039727,-0.629984,-0.502253,-0.510423,0,1.246010


In [None]:
# Reshape to long form: one row per (point, feature) pair, so that "Feature"
# can select what is shown and "Value" holds the number.
# NOTE(review): this rebinds data_plot_2d to its own melted form, so running
# this cell again without first rebuilding the wide table operates on the
# already-melted frame.
data_plot_2d = pd.melt(
    data_plot_2d,
    id_vars=["X", "Y", "class"],
    var_name="Feature",
    value_name="Value",
)
data_plot_2d

Unnamed: 0,X,Y,class,Feature,Value
0,37.141788,16.654100,0,genergy,-0.327561
1,-5.267431,-34.980053,0,genergy,-0.329568
2,38.729160,17.055405,0,genergy,-0.358675
3,0.585536,-27.362318,0,genergy,-0.268038
4,31.977600,14.336788,0,genergy,-0.338645
...,...,...,...,...,...
23251,-15.435766,9.559308,0,prob,2.579686
23252,-12.254629,8.823896,0,prob,2.512377
23253,-8.114150,9.312488,0,prob,2.163791
23254,11.924936,13.560340,0,prob,1.246010


In [None]:
# Cap "Value" at 5; entries at or below 5 are left as they are.
data_plot_2d["Value"] = data_plot_2d["Value"].clip(upper=5)

In [None]:
# Animated scatter of the 2D projection, one frame per "Feature", colored by
# "Value". Class "1" points are drawn as circles with marker size 11, class
# "0" points as thin crosses with size 1.
px.scatter(
    data_plot_2d,
    x="X",
    y="Y",
    color="Value",
    color_continuous_scale=["blue", "yellow", "red"],
    symbol="class",
    symbol_map={"0": "x-thin-open", "1": "circle"},
    size=data_plot_2d["class"].apply(int) * 10 + 1,
    size_max=8,
    opacity=0.5,
    animation_frame="Feature",
    width=750,
    height=750,
)