# Exercício 2 - Regressão Logística
MO 444 - Aprendizado de Máquina (Prof. Jacques Wainer)<br>
Edgar Kenji Tanaka <br>
RA 023577 <br>


## General code & Preprocessing

In [47]:
import numpy as np
import pandas as pd
from sklearn import linear_model, preprocessing
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# parameters
split_index = 3133  # default number of leading rows that form the training split
threshold = 13      # default cutoff: target values strictly greater than this get label 1


# Pre-processing
def preprocess(df, n_train=None, cutoff=None):
    """Turn the raw table into train/test feature frames and binary labels.

    The column at position 8 is the target: it is binarized (1 where the value
    is strictly greater than the cutoff, 0 otherwise) and removed from the
    features. The remaining non-numeric columns (the gender feature) are
    one-hot encoded with ``pd.get_dummies``. Rows are split by position: the
    first ``n_train`` rows form the training set, the rest the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with the target in the column at position 8.
    n_train : int, optional
        Number of leading rows used for training. Defaults to the
        module-level ``split_index``.
    cutoff : number, optional
        Binarization cutoff for the target. Defaults to the module-level
        ``threshold``.

    Returns
    -------
    train_x, test_x : pandas.DataFrame
        Feature frames. Column labels are converted to strings.
    train_y, test_y : numpy.ndarray
        0/1 labels aligned by position with the feature frames.
    """
    if n_train is None:
        n_train = split_index
    if cutoff is None:
        cutoff = threshold

    target_pos = 8

    # convert target from numerical to categorical (binary)
    y = np.where(df.iloc[:, target_pos] > cutoff, 1, 0)

    # drop target column
    x = df.drop(df.columns[[target_pos]], axis=1)

    # one hot encode gender feature
    x = pd.get_dummies(x)

    # A frame read with header=None keeps integer labels while the dummy
    # columns get string labels; make them uniformly strings so estimators
    # that validate feature names accept the frame.
    x = x.rename(columns=str)

    # split dataset into train and test; positional slicing (.ix no longer
    # exists in pandas) keeps the rows aligned with the slices of y below
    train_x = x.iloc[:n_train]
    test_x = x.iloc[n_train:]
    train_y = y[:n_train]
    test_y = y[n_train:]

    return train_x, test_x, train_y, test_y

def accuracy(predicted, y):
    """Return the share of correct predictions.

    Counts the positions where ``predicted`` equals ``y`` and divides that
    count by the number of samples, taken from ``y.shape[0]``.
    """
    n_correct = np.sum(predicted == y)
    n_samples = y.shape[0]
    return n_correct / n_samples

# Load the abalone data; header=None reads the file without a header row.
df = pd.read_csv('abalone.csv', header=None)
# Build the train/test feature frames and binary labels used by the cells below.
train_x, test_x, train_y, test_y = preprocess(df)

## 2. Sem regularização

In [48]:
# 2. Logistic regression with C=1000000, the notebook's "no regularization"
#    setting; fit on the training split and score on the test split.
logistic = linear_model.LogisticRegression(C=1000000, random_state=1).fit(train_x, train_y)
predicted = logistic.predict(test_x)
test_accuracy = accuracy(predicted, test_y)
print("2. No regularization: accuracy = ", "%.3f" % test_accuracy)

2. No regularization: accuracy =  0.897


## 3. Com regularização

In [49]:
# 3. Same model as in step 2 but with C=1, the notebook's "with
#    regularization" setting.
logistic = linear_model.LogisticRegression(C=1, random_state=1).fit(train_x, train_y)
predicted = logistic.predict(test_x)
test_accuracy = accuracy(predicted, test_y)
print("3. With regularization C=1: accuracy = ", "%.3f" % test_accuracy)

3. With regularization C=1: accuracy =  0.892


## 4. Estandardização + sem regularização
A estandardização é ajustada (fit) apenas no conjunto de treino; a mesma transformação é então aplicada aos conjuntos de treino e de teste, como instruído em aula.

In [50]:
# 4. Standardize the features, then train with C=1000000 (no regularization).
# The scaler is fit on the training split only and then applied to both
# splits, so the test data does not influence the scaling parameters.
scaler = preprocessing.StandardScaler().fit(train_x)
train_x_scaled = scaler.transform(train_x)
test_x_scaled = scaler.transform(test_x)

logistic = linear_model.LogisticRegression(C=1000000, random_state=1).fit(train_x_scaled, train_y)
predicted = logistic.predict(test_x_scaled)
test_accuracy = accuracy(predicted, test_y)
print("4. Standardized no regularization: accuracy = ", "%.3f" % test_accuracy)

4. Standardized no regularization: accuracy =  0.897


## 5. Aplicando PCA
Descobrimos que são necessários 3 componentes para se obter uma variância explicada de pelo menos 90% (2 componentes explicam cerca de 85%; 3 componentes, cerca de 99%).

In [51]:
# 5. PCA
# Report the total explained-variance ratio for 2 and for 3 components, both
# fit on the (unscaled) training features.
pca = PCA(n_components=2).fit(train_x)
print("With 2 components, variance:", pca.explained_variance_ratio_.sum())

pca = PCA(n_components=3).fit(train_x)
print("With 3 components, variance:", pca.explained_variance_ratio_.sum())

# Project both splits with the 3-component model fit on the training split.
train_x_pca = pca.transform(train_x)
test_x_pca = pca.transform(test_x)

With 2 components, variance: 0.846865964225
With 3 components, variance: 0.990993418446


## 6. PCA + sem regularização

In [52]:
# 6. Train on the PCA-projected features with C=1000000 (no regularization).
logistic = linear_model.LogisticRegression(C=1000000, random_state=1).fit(train_x_pca, train_y)
predicted = logistic.predict(test_x_pca)
test_accuracy = accuracy(predicted, test_y)
print("6. PCA without regularization: accuracy = ", "%.3f" % test_accuracy)

6. PCA without regularization: accuracy =  0.884


## 7. PCA + com regularização

In [53]:
# 7. Train on the PCA-projected features with C=1 (with regularization).
logistic = linear_model.LogisticRegression(C=1, random_state=1).fit(train_x_pca, train_y)
predicted = logistic.predict(test_x_pca)
test_accuracy = accuracy(predicted, test_y)
print("7. PCA with regularization: accuracy = ", "%.3f" % test_accuracy)

7. PCA with regularization: accuracy =  0.884


## 8. Abalone missing data
Sem estandardização, sem PCA, sem regularização

In [54]:
# 8. Read abalone-missing.csv, which has missing data in columns 2 through
# the second-to-last. Apply the preprocessing described in step 1 and impute
# the missing values with the mean. Run the regression without
# regularization, without PCA and without standardization.
# NOTE: this cell rebinds df / train_x / test_x / train_y / test_y, so
# re-running earlier cells afterwards would use the imputed data.
df = pd.read_csv('abalone-missing.csv', header=None)

train_x, test_x, train_y, test_y = preprocess(df)

# Fill NAs with the per-column mean. preprocessing.Imputer no longer exists
# in scikit-learn; SimpleImputer is its replacement. The means are learned
# from the training split only and then applied to both splits.
imputer = SimpleImputer(strategy="mean")
imputer.fit(train_x)
train_x = imputer.transform(train_x)
test_x = imputer.transform(test_x)

# C=1000000 is the notebook's "no regularization" setting.
logistic = linear_model.LogisticRegression(random_state=1, C=1000000)
logistic.fit(train_x, train_y)
predicted = logistic.predict(test_x)
print("8. Accuracy = ", "%.3f" % accuracy(predicted,test_y))

8. Accuracy =  0.887
