# Leitura dos dados

In [None]:
import pandas as pd
import io
import requests
# Download the Madelon training feature file and parse it as a headerless,
# space-separated table.
url = "https://raw.githubusercontent.com/fbarth/ml-espm/master/data/madelon_train.data"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors so an error page is never parsed as data.
response.raise_for_status()
s = response.content
madelon_data = pd.read_csv(io.StringIO(s.decode('utf-8')), sep=" ", header=None)

In [None]:
# Keep only the columns labelled 0 through 499 (label-based slices include
# both end points), then show the resulting dimensions and the first rows.
# NOTE(review): presumably this drops a spurious trailing column produced by
# the space-separated parse -- confirm against the raw file.
madelon_data = madelon_data.loc[:, :499]
print(madelon_data.shape)
madelon_data.head(5)

In [None]:
# Download the Madelon training label file and parse it as a headerless,
# space-separated table.
url = "https://raw.githubusercontent.com/fbarth/ml-espm/master/data/madelon_train.labels"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors so an error page is never parsed as data.
response.raise_for_status()
s = response.content
madelon_label = pd.read_csv(io.StringIO(s.decode('utf-8')), sep=" ", header=None)

In [None]:
# Only the last expression of a cell is displayed, so a bare `.shape` here
# would be evaluated and silently discarded; print it explicitly instead.
print(madelon_label.shape)
# Rename column 0 to 'label' so later cells can refer to it by name.
madelon_label = madelon_label.rename(columns={0: 'label'})
# Show how many rows carry each label value.
madelon_label['label'].value_counts()

In [None]:
%matplotlib inline
import seaborn as sns
sns.set(style="ticks")
sns.scatterplot(x=0, y=1,hue=madelon_label['label'],data=madelon_data)

# Divisão dos dados

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    madelon_data,
    madelon_label['label'],
    test_size=0.2,
    random_state=4,
)

# Execução de um modelo baseline

In [None]:
from sklearn import tree

# Baseline: a single decision tree with default hyper-parameters.
# The seed (4, the same one used by the split and the forests in this
# notebook) makes the baseline reproducible; without it tie-breaking between
# equally good splits may differ from run to run.
clf = tree.DecisionTreeClassifier(random_state=4).fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the fitted classifier on the held-out rows: print the confusion
# matrix first, then the overall accuracy.
predicted = clf.predict(X_test)
print(confusion_matrix(y_test, predicted))
accr = accuracy_score(y_test, predicted)
print(accr)

# Execução de uma Random Forest com parâmetros aleatórios

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with hand-picked settings: 500 trees, no depth limit and
# 100 candidate features per split.
# NOTE(review): oob_score=True is requested, but the resulting score is not
# read in any of the cells visible here.
clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    max_features=100,
    oob_score=True,
    random_state=4,
)
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the fitted forest on the held-out rows: print the confusion matrix
# first, then the overall accuracy.
predicted = clf.predict(X_test)
print(confusion_matrix(y_test, predicted))
accr = accuracy_score(y_test, predicted)
print(accr)

# Random Forest com GridSearch

In [None]:
%%time
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_features': ['log2', 'sqrt', 50, 100, 200, 300],
    'max_depth' : [5,10,50,100,150,None]
}
rfc=RandomForestClassifier(random_state=4)
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 3, verbose=1, n_jobs=4)
CV_rfc.fit(X_train, y_train)

In [None]:
# Display the hyper-parameter combination selected by the fitted search.
CV_rfc.best_params_

In [None]:
# Train the final forest with the combination the grid search selected,
# read directly from the search object instead of a hand-copied set of
# values that can silently drift from the actual result.
# Run this right after the grid-search cell: CV_rfc is re-bound by the
# randomized search further down.
rfc1 = RandomForestClassifier(random_state=4, **CV_rfc.best_params_)
rfc1.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the grid-search forest on the held-out rows: print the confusion
# matrix first, then the overall accuracy.
predicted = rfc1.predict(X_test)
print(confusion_matrix(y_test, predicted))
accr = accuracy_score(y_test, predicted)
print(accr)

# Random Forest com Random Search CV

In [None]:
%%time
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import RandomizedSearchCV
param_grid = { 
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_features': ['log2', 'sqrt', 50, 100, 200, 300],
    'max_depth' : [5,10,50,100,150,None]
}
rfc=RandomForestClassifier(random_state=4)
CV_rfc = RandomizedSearchCV(estimator=rfc, param_distributions = param_grid, cv= 3, verbose=1, n_jobs=-1, n_iter=10)
CV_rfc.fit(X_train, y_train)

In [None]:
# Display the hyper-parameter combination selected by the fitted search.
CV_rfc.best_params_

In [None]:
# Train the final forest with the combination the randomized search
# selected. The values are read from the search object because hard-coded
# numbers cannot track a search whose sampled candidates may change between
# runs.
rfc2 = RandomForestClassifier(random_state=4, **CV_rfc.best_params_)
rfc2.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the randomized-search forest on the held-out rows: print the
# confusion matrix first, then the overall accuracy.
predicted = rfc2.predict(X_test)
print(confusion_matrix(y_test, predicted))
accr = accuracy_score(y_test, predicted)
print(accr)