# Multiple Linear Regression

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the beer-consumption dataset: a ';'-separated CSV, limited to its first 365 data rows.
df_beer = pd.read_csv(
    'bases/consumo_cerveja.csv',
    sep=';',
    nrows=365,
)
df_beer.head()

In [None]:
from sklearn.model_selection import train_test_split #para criar conjunto de treino e teste
# Rename the columns to short identifiers that are easier to work with.
column_aliases = {
    'Temperatura Media (C)': 'meanTemp',
    'Temperatura Minima (C)': 'minTemp',
    'Temperatura Maxima (C)': 'maxTemp',
    'Precipitacao (mm)': 'precip',
    'Final de Semana': 'weekend',
    'Consumo de cerveja (litros)': 'consumo',
}
df_beer = df_beer.rename(columns=column_aliases)

# Columns used as features to train the model, and the target column to predict.
feature_col = ['maxTemp', 'precip', 'weekend']
meta_col = ['consumo']

# Feature matrix, and the target flattened into a 1-D vector.
X = df_beer[feature_col].values
y = df_beer[meta_col].values.ravel()



In [None]:

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
split_test_size = 0.30
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=split_test_size,
    random_state=42,
)

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [None]:
# Train the multiple linear regression model on the training split.
# ravel() hands scikit-learn the target as a flat 1-D vector.
mlr_skl = linear_model.LinearRegression().fit(X_train, y_train.ravel())
mlr_skl

In [None]:
mlr_skl.coef_  # final theta values: the fitted coefficients, one per column of feature_col

In [None]:
# Predict on the held-out test set, then report the mean squared error (MSE).
y_pred = mlr_skl.predict(X_test)
print(mean_squared_error(y_true=y_test, y_pred=y_pred))


In [None]:
# Try the model on a new, unseen sample.
# Values follow the order of feature_col: maxTemp, precip, weekend.
pred_value = [[35,0,0]]
mlr_skl.predict(pred_value)

# Regression Trees

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import numpy as np

In [None]:
# Load the auto-mpg dataset from the 'bases' directory (path is relative to the working directory).
df = pd.read_csv('bases/auto-mpg.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# List the distinct 'horsepower' values to spot non-numeric placeholders such as '?'.
df['horsepower'].unique()

In [None]:
# Drop the rows whose 'horsepower' is the placeholder '?'.
# .copy() makes the filtered frame independent of the original one, so the
# column assignments performed later in the notebook do not raise
# SettingWithCopyWarning.
df = df[df['horsepower'] != '?'].copy()

In [None]:
# Predictor columns used by the regression tree.
features = ['horsepower', 'weight']

# Coerce every predictor column to a numeric dtype.
for column in features:
    df[column] = pd.to_numeric(df[column])

# Feature matrix (kept as a DataFrame) and the true labels as a plain list.
X = df[features]
y = df['mpg'].tolist()

In [None]:
# Instantiate the regression tree.
# max_depth=2 keeps the tree shallow; min_samples_split=3 is the minimum
# number of samples a node must hold before it may be split.
reg = DecisionTreeRegressor(max_depth=2, min_samples_split=3)

In [None]:
# Fit the tree on the feature matrix X and the target values y.
reg.fit(X,y)

In [None]:
# Print a plain-text rendering of the fitted tree's decision rules.
print(tree.export_text(reg))

In [None]:
# Predict mpg for a new sample with horsepower=300 and weight=4000.
# The values are passed as numbers (not strings) inside a DataFrame that
# carries the same column names the tree was fitted with, so scikit-learn
# neither has to coerce text nor warns about missing feature names.
reg.predict(pd.DataFrame([[300, 4000]], columns=features))

# Pruning

In [None]:
# Use a different dataset to illustrate the pruning process.
# NOTE: this rebinds X and y, which earlier cells used for other datasets.
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)

In [None]:
# Split the data into training and test sets.
# No test_size is given, so scikit-learn's default split applies; random_state=0 makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Compute the cost-complexity pruning path.
# A DecisionTreeClassifier (a classification model) is used here for teaching
# purposes only; the procedure is the same for regression.
from sklearn.tree import DecisionTreeClassifier
# Seed the tree with the same random_state as the classifiers trained per
# alpha further down, so the computed alphas belong to a reproducible tree.
clf = DecisionTreeClassifier(random_state=0)
path = clf.cost_complexity_pruning_path(X_train, y_train)

In [None]:
# Display the pruning path object computed in the previous cell.
path

In [None]:
# Pull the alpha values and the matching impurities out of the pruning path.
ccp_alphas = path.ccp_alphas
impurities = path.impurities

In [None]:
# Plot the total leaf impurity against the effective alpha
# (the last alpha/impurity pair is left out of the plot).
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
# Render explicitly so the cell does not echo the Text repr returned by set_title.
plt.show()

In [None]:
# Train one tree per alpha value; fit() returns the fitted estimator itself.
clfs = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(X_train, y_train)
    for alpha in ccp_alphas
]

# Report the size of the tree trained with the last alpha.
summary = "Number of nodes in the last tree is: {} with ccp_alpha: {}"
print(summary.format(clfs[-1].tree_.node_count, ccp_alphas[-1]))

In [None]:
# Drop the last element of clfs and ccp_alphas, because it represents the
# tree with only one leaf.
# The alphas are re-derived from `path` instead of being sliced in place, so
# running this cell more than once does not discard additional trees.
ccp_alphas = path.ccp_alphas[:-1]
clfs = clfs[:len(ccp_alphas)]

In [None]:
# Display the list of remaining fitted classifiers.
clfs

In [None]:
# Score every remaining tree (one per alpha) on the training and test sets.
train_scores = [fitted.score(X_train, y_train) for fitted in clfs]
test_scores = [fitted.score(X_test, y_test) for fitted in clfs]

In [None]:
# Plot train and test accuracy as a function of alpha.
fig, ax = plt.subplots()
ax.set(
    xlabel="alpha",
    ylabel="accuracy",
    title="Accuracy vs alpha for training and testing sets",
)
for scores, label in ((train_scores, "train"), (test_scores, "test")):
    ax.plot(ccp_alphas, scores, marker="o", label=label, drawstyle="steps-post")
ax.legend()
plt.show()