In [1]:
import pandas as pd
import numpy as np

In [2]:
# load data for classification

titanic_train = pd.read_csv('../../data/titanic_train.csv')
titanic_test = pd.read_csv('../../data/titanic_test.csv')
pd.set_option('max_columns', None)
pd.set_option('max_rows', 5)

In [3]:
# load data for regression

melb = pd.read_csv('../../data/melb_data.csv')

hp_train = pd.read_csv('../../data/house-prices-advanced-regression-techniques/train.csv')
hp_test = pd.read_csv('../../data/house-prices-advanced-regression-techniques/test.csv')

pd.set_option('max_columns', None)
pd.set_option('max_rows', 5)

---
# **Preprocessing**
---

In [8]:
from sklearn.model_selection import train_test_split

y_melb = melb.Price
X_melb = melb.drop(['Price'], axis=1)

In [10]:
X_melb = melb.select_dtypes(exclude=['object'])

In [11]:
X_melb

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,2,1480000.0,2.5,3067.0,2.0,1.0,1.0,202.0,,,-37.79960,144.99840,4019.0
1,2,1035000.0,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,-37.80790,144.99340,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13578,4,2500000.0,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,-37.85908,144.89299,6380.0
13579,4,1285000.0,6.3,3013.0,4.0,1.0,1.0,362.0,112.0,1920.0,-37.81188,144.88449,6543.0


---
## **Missing values**

three approaches:
1. drop columns with missing values
2. imputation
3. an extension to imputation

### **drop columns with missing values**

In [5]:
melb_dmv = melb.dropna(axis=0)

### **imputation**

In [7]:
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()

In [14]:
pd.DataFrame(my_imputer.fit_transform(X_melb))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,2.0,1480000.0,2.5,3067.0,2.0,1.0,1.0,202.0,151.96765,1964.684217,-37.79960,144.99840,4019.0
1,2.0,1035000.0,2.5,3067.0,2.0,1.0,0.0,156.0,79.00000,1900.000000,-37.80790,144.99340,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13578,4.0,2500000.0,6.8,3016.0,4.0,1.0,5.0,866.0,157.00000,1920.000000,-37.85908,144.89299,6380.0
13579,4.0,1285000.0,6.3,3013.0,4.0,1.0,1.0,362.0,112.00000,1920.000000,-37.81188,144.88449,6543.0


In [15]:
help(my_imputer.fit_transform)

Help on method fit_transform in module sklearn.base:

fit_transform(X, y=None, **fit_params) method of sklearn.impute._base.SimpleImputer instance
    Fit to data, then transform it.
    
    Fits transformer to X and y with optional parameters fit_params
    and returns a transformed version of X.
    
    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]
        Training set.
    
    y : numpy array of shape [n_samples]
        Target values.
    
    **fit_params : dict
        Additional fit parameters.
    
    Returns
    -------
    X_new : numpy array of shape [n_samples, n_features_new]
        Transformed array.



### **an extension to imputation**

---
## **Categorical values**

three approaches:

1. drop categorical variables
2. label encoding
3. one-hot encoding

### **drop categorical variables**

### **label encoding**

### **one-hot encoding**

# **Classification**
---

## Decision Tree Classifier
## Nearest Neighbors Method
## Logistic Regression
## Perceptron
## Ridge Classifier
## SGD Classifier

# **Regression**
---

In [11]:
0.85**4 * 0.15**3

0.0017617710937499994

## **Decision Tree Regressor**

In [8]:
# drop all rows with NA variables
melb = melb.dropna(axis=0)

In [10]:
y = melb.Price
features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude']
X = melb[features]

In [12]:
from sklearn.model_selection import train_test_split

train_X, valid_X, train_y, valid_y = train_test_split(X, y)

In [13]:
from sklearn.tree import DecisionTreeRegressor

# Define model
melb_model = DecisionTreeRegressor()

# fit model
melb_model.fit(train_X, train_y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [16]:
from sklearn.metrics import mean_absolute_error

pred = melb_model.predict(valid_X)
mean_absolute_error(valid_y, pred)

247056.3034215623

## **Linear Regression**

In [17]:
from sklearn.linear_model import LinearRegression

In [97]:
model_LR = LinearRegression()

In [19]:
model_LR.fit(train_X, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [20]:
pred_LR = model_LR.predict(valid_X)
mean_absolute_error(valid_y, pred_LR)

304712.5332312765

## **Ridge**

In [21]:
from sklearn.linear_model import Ridge

model_R = Ridge()

In [22]:
model_R.fit(train_X, train_y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [23]:
pred_R = model_R.predict(valid_X)
mean_absolute_error(valid_y, pred_R)

304795.2030165817

## **SGD Regressor**

In [195]:
from sklearn.linear_model import SGDRegressor

model_SGD = SGDRegressor(alpha=0.0001, max_iter=10000, loss='huber')

In [196]:
model_SGD.fit(train_X, train_y)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='huber', max_iter=10000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [197]:
pred_SGD = model_SGD.predict(valid_X)
mean_absolute_error(valid_y, pred_SGD)

377898.50926290144

# **Metrics**

In [11]:
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y)

In [None]:
train_X

In [None]:
train_y

In [12]:
from sklearn.tree import DecisionTreeRegressor

# Define
melb_model = DecisionTreeRegressor()

# fit model
melb_model.fit(train_X, train_y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [None]:
from sklearn.metrics import mean_absolute_error

pred = melb_model.predict(val_X)
mean_absolute_error(val_y, pred)

In [None]:
dir()

In [13]:
from basic import get_mae_ensemble

In [None]:
good_mln = 1
min_my_mae = 10**100
for max_leaf_nodes in range(100, 600):
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    if my_mae < min_my_mae:
        good_mln = max_leaf_nodes
        min_my_mae = my_mae
    # print('Max leaf nodes: %d \t\t Mean Absolute Error: %d' %(max_leaf_nodes, my_mae))
    
print(good_mln, min_my_mae)

In [16]:
good_n = 1
min_my_mae_ensemble = 10**100
for n_estimators in range(80, 120):
    my_mae = get_mae_ensemble(n_estimators, train_X, val_X, train_y, val_y)
    if my_mae < min_my_mae_ensemble:
        good_n = n_estimators
        min_my_mae_ensemble = my_mae
    print('Max leaf nodes: %d \t\t Mean Absolute Error: %d' %(n_estimators, my_mae))
    
print(good_n, min_my_mae_ensemble)

Max leaf nodes: 80 		 Mean Absolute Error: 182561
Max leaf nodes: 81 		 Mean Absolute Error: 181543
Max leaf nodes: 82 		 Mean Absolute Error: 180642
Max leaf nodes: 83 		 Mean Absolute Error: 181976
Max leaf nodes: 84 		 Mean Absolute Error: 179568
Max leaf nodes: 85 		 Mean Absolute Error: 180730
Max leaf nodes: 86 		 Mean Absolute Error: 180650
Max leaf nodes: 87 		 Mean Absolute Error: 182023
Max leaf nodes: 88 		 Mean Absolute Error: 181346
Max leaf nodes: 89 		 Mean Absolute Error: 180210
Max leaf nodes: 90 		 Mean Absolute Error: 180528
Max leaf nodes: 91 		 Mean Absolute Error: 180160
Max leaf nodes: 92 		 Mean Absolute Error: 181462
Max leaf nodes: 93 		 Mean Absolute Error: 181119
Max leaf nodes: 94 		 Mean Absolute Error: 179397
Max leaf nodes: 95 		 Mean Absolute Error: 182538
Max leaf nodes: 96 		 Mean Absolute Error: 180202
Max leaf nodes: 97 		 Mean Absolute Error: 179243
Max leaf nodes: 98 		 Mean Absolute Error: 180389
Max leaf nodes: 99 		 Mean Absolute Error: 180847
