In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import pandas as pd

# Name of the column every model in this notebook predicts.
target = 'SalePrice'

# Read the tab-separated Ames housing file and split it by row position:
# the first 1460 rows are the training set, the remaining rows the test set.
data = pd.read_csv('AmesHousing.txt', sep="\t")
train, test = data.iloc[:1460], data.iloc[1460:]

# Print column names, dtypes and non-null counts for the full frame.
data.info()

### Model fitting criteria:
* Residual Sum Of Squares

To find the optimal parameters for a linear regression model, we want to minimize the model's residual sum of squares (RSS). If you recall, a residual (often referred to as an error) is the difference between the predicted value for the target column ($\hat{y}$) and the true value ($y$):

$$RSS = \sum_{i=1}^{n} (y_i - \hat{y}_i)^2$$

In [None]:
## Use Scikit learn
from sklearn.linear_model import LinearRegression

# Fit a one-feature linear model: SalePrice as a function of 'Gr Liv Area'.
# Other single features that were tried in this cell: 'Garage Area', 'Overall Cond'.
reg = LinearRegression()
reg.fit(X=train[['Gr Liv Area']], y=train["SalePrice"])

a0 = reg.intercept_  # intercept
a1 = reg.coef_       # coefficient array; with one feature its only entry is the slope

In [None]:
## Test RMSE
import numpy as np
from sklearn.metrics import mean_squared_error
# Fit once on the training rows, then score the same model on both splits.
lr = LinearRegression()
lr.fit(X=train[['Gr Liv Area']], y=train['SalePrice'])

# RMSE on the rows the model was fitted on.
train_rmse = mean_squared_error(train['SalePrice'], lr.predict(train[['Gr Liv Area']])) ** .5

# RMSE on the held-out rows; `predictions` ends up holding the test predictions.
predictions = lr.predict(test[['Gr Liv Area']])
test_rmse = mean_squared_error(test['SalePrice'], predictions) ** .5

In [None]:
### More than one feature:
import numpy as np
from sklearn.metrics import mean_squared_error

# Same train/test RMSE comparison as the single-feature model, now with two columns.
cols = ['Overall Cond', 'Gr Liv Area']

lr = LinearRegression()
lr.fit(X=train[cols], y=train['SalePrice'])

# RMSE on the rows the model was fitted on.
train_rmse_2 = mean_squared_error(train['SalePrice'], lr.predict(train[cols])) ** .5

# RMSE on the held-out rows; `predictions` ends up holding the test predictions.
predictions = lr.predict(test[cols])
test_rmse_2 = mean_squared_error(test['SalePrice'], predictions) ** .5

### Feature Selection

In [None]:
import pandas as pd
# Reload the raw file and redo the positional 1460-row train/test split.
data = pd.read_csv('AmesHousing.txt', delimiter="\t")
train, test = data[0:1460], data[1460:]

# Numeric training columns, minus 'PID' and the year/month columns.
numerical_train = (
    train
    .select_dtypes(include=['int', 'float'])
    .drop(columns=['PID', 'Year Built', 'Year Remod/Add', 'Garage Yr Blt', 'Mo Sold', 'Yr Sold'])
)

# Per-column count of missing values; keep only the columns with none.
null_series = numerical_train.isnull().sum()
full_cols_series = null_series.loc[null_series.eq(0)]
print(full_cols_series)



In [None]:
# Restrict to the fully populated numeric columns, then rank every column by
# the absolute value of its correlation with SalePrice (ascending).
train_subset = train.loc[:, full_cols_series.index]
correlation = train_subset.corr()
sorted_corrs = correlation['SalePrice'].abs().sort_values()
print(sorted_corrs)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Keep the columns whose absolute correlation with SalePrice exceeds 0.3 and
# draw the pairwise correlation matrix of just those columns as a heat map.
strong_corrs = sorted_corrs.loc[sorted_corrs.gt(0.3)]
corrmat = train_subset.loc[:, strong_corrs.index].corr()
sns.heatmap(data=corrmat)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Strongly correlated columns minus 'Garage Cars' and 'TotRms AbvGrd'.
# NOTE(review): presumably these two are dropped because the heat map shows
# them to be collinear with other retained columns — confirm.
final_corr_cols = strong_corrs.drop(['Garage Cars', 'TotRms AbvGrd'])
features = final_corr_cols.drop(['SalePrice']).index
target = 'SalePrice'

# Select the feature columns *and* the target before dropping incomplete rows.
# Selecting only `features` here would leave clean_test without 'SalePrice',
# and the clean_test[target] lookup below would raise a KeyError.
clean_test = test[final_corr_cols.index].dropna()

lr = LinearRegression()
lr.fit(train[features], train[target])

train_predictions = lr.predict(train[features])
test_predictions = lr.predict(clean_test[features])

# mean_squared_error expects (y_true, y_pred).
train_mse = mean_squared_error(train[target], train_predictions)
test_mse = mean_squared_error(clean_test[target], test_predictions)

train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

print(train_rmse)
print(test_rmse)

In [None]:
# Min-max rescale each feature column: subtract the column minimum and divide
# by the column range, so every column spans 0 to 1.
rescaled = (train[features] - train[features].min()) / (train[features].max() - train[features].min())
print(rescaled)

# Variance of each rescaled column, smallest first.
sorted_vars = rescaled.var().sort_values(ascending=True)
print(sorted_vars)



In [None]:
# Rebuild the feature list from final_corr_cols instead of dropping from the
# existing `features` in place, so re-running this cell gives the same result
# rather than raising a KeyError on the second drop.
# NOTE(review): 'Open Porch SF' is presumably removed for its low rescaled
# variance — confirm against the variance listing printed earlier.
features = final_corr_cols.drop(['SalePrice', 'Open Porch SF']).index

# Test rows with a value in every selected column (target included).
clean_test = test[final_corr_cols.index].dropna()

lr = LinearRegression()
lr.fit(train[features], train['SalePrice'])

train_predictions = lr.predict(train[features])
test_predictions = lr.predict(clean_test[features])

# mean_squared_error expects (y_true, y_pred).
train_mse = mean_squared_error(train['SalePrice'], train_predictions)
test_mse = mean_squared_error(clean_test['SalePrice'], test_predictions)

train_rmse_2 = np.sqrt(train_mse)
test_rmse_2 = np.sqrt(test_mse)

print(train_rmse_2)
print(test_rmse_2)


### Notes
* Feature selection based on correlation with the target
* Collinearity between features
* Use a correlation-matrix heat map to spot collinear features
* Rescaling (min-max) before comparing feature variances

### Gradient Descent
#### Select initial values for the parameter: $a_1$
* repeat until convergence (usually implemented with a max number of iterations):
* calculate the error (MSE) of the model that uses the current parameter value: $MSE(a_1) = \frac{1}{n} \sum_{i=1}^{n} (\hat{y}^{(i)} - y^{(i)})^2$
* calculate the derivative of the error (MSE) at the current parameter value: $\frac{d}{da_1} MSE(a_1)$
* update the parameter value by subtracting the derivative times a constant ($\alpha$, called the learning rate): $a_1 := a_1 - \alpha \frac{d}{da_1} MSE(a_1)$


#### For every iteration of gradient descent:
* this derivative is computed using the current $a_1$ value
* the derivative is multiplied by the learning rate ($\alpha$): $\alpha \frac{d}{da_1} MSE(a_1)$
* the result is subtracted from the current parameter value and assigned as the new parameter value: $a_1 := a_1 - \alpha \frac{d}{da_1} MSE(a_1)$


In [None]:
def derivative(a1, xi_list, yi_list):
    """Return the derivative of the MSE with respect to ``a1``.

    The model is the intercept-free line ``y_hat = a1 * x``; the value
    computed is ``2/n * sum(x_i * (a1 * x_i - y_i))``.

    Parameters
    ----------
    a1 : number
        Current slope value.
    xi_list, yi_list : list, numpy array or pandas Series
        Feature and target values of equal length. They are converted to
        float arrays first, so plain Python lists are accepted and values
        are paired by position rather than by pandas index label.

    Returns
    -------
    float
        The derivative evaluated at ``a1``.
    """
    x = np.asarray(xi_list, dtype=float)
    y = np.asarray(yi_list, dtype=float)
    result = 2 / len(x) * (x * (a1 * x - y)).sum()
    return result

def gradient_descent(xi_list, yi_list, max_iterations, alpha, a1_initial):
    """Run a fixed number of gradient-descent steps on the single parameter a1.

    Each step evaluates ``derivative`` at the latest a1 and moves a1 against
    it by ``alpha`` times the derivative.

    Returns the list of every a1 value visited, beginning with ``a1_initial``
    (``max_iterations + 1`` entries in total).
    """
    history = [a1_initial]
    for _ in range(max_iterations):
        current = history[-1]
        step = alpha * derivative(current, xi_list, yi_list)
        history.append(current - step)
    return history

# One-parameter fit of SalePrice on 'Gr Liv Area': 20 steps with learning
# rate 3e-7, starting from a1 = 150. The last history entry is the final slope.
param_iterations = gradient_descent(
    xi_list=train['Gr Liv Area'],
    yi_list=train['SalePrice'],
    max_iterations=20,
    alpha=.0000003,
    a1_initial=150,
)
final_param = param_iterations[-1]

In [None]:
### Two parameters (a0 and a1)
def a1_derivative(a0, a1, xi_list, yi_list):
    """Return the partial derivative of the MSE with respect to the slope ``a1``.

    The model is ``y_hat = a0 + a1 * x``; the value computed is
    ``2/n * sum(x_i * (a0 + a1 * x_i - y_i))``.

    Parameters
    ----------
    a0, a1 : number
        Current intercept and slope.
    xi_list, yi_list : list, numpy array or pandas Series
        Feature and target values of equal length. They are converted to
        float arrays so elements are paired by position; indexing a pandas
        Series with ``[i]`` is a label lookup and fails when the index does
        not start at 0.

    Returns
    -------
    float
        The partial derivative evaluated at ``(a0, a1)``.
    """
    x = np.asarray(xi_list, dtype=float)
    y = np.asarray(yi_list, dtype=float)
    error = (x * (a0 + a1 * x - y)).sum()
    deriv = 2 * error / len(x)
    return deriv

def a0_derivative(a0, a1, xi_list, yi_list):
    """Return the partial derivative of the MSE with respect to the intercept ``a0``.

    The model is ``y_hat = a0 + a1 * x``; the value computed is
    ``2/n * sum(a0 + a1 * x_i - y_i)``.

    Parameters
    ----------
    a0, a1 : number
        Current intercept and slope.
    xi_list, yi_list : list, numpy array or pandas Series
        Feature and target values of equal length. They are converted to
        float arrays so elements are paired by position; indexing a pandas
        Series with ``[i]`` is a label lookup and fails when the index does
        not start at 0.

    Returns
    -------
    float
        The partial derivative evaluated at ``(a0, a1)``.
    """
    x = np.asarray(xi_list, dtype=float)
    y = np.asarray(yi_list, dtype=float)
    error = (a0 + a1 * x - y).sum()
    deriv = 2 * error / len(x)
    return deriv

def gradient_descent(xi_list, yi_list, max_iterations, alpha, a1_initial, a0_initial):
    """Run a fixed number of gradient-descent steps on both a0 and a1.

    Both partial derivatives are evaluated at the same (a0, a1) pair before
    either parameter is moved, so the two updates are simultaneous.

    Note: this definition reuses the name ``gradient_descent`` and replaces
    the one-parameter version defined earlier in the notebook.

    Returns ``(a0_list, a1_list)``: the full history of each parameter,
    each beginning with its initial value.
    """
    a0_history = [a0_initial]
    a1_history = [a1_initial]

    for _ in range(max_iterations):
        a0_now, a1_now = a0_history[-1], a1_history[-1]

        slope_grad = a1_derivative(a0_now, a1_now, xi_list, yi_list)
        intercept_grad = a0_derivative(a0_now, a1_now, xi_list, yi_list)

        a1_history.append(a1_now - alpha * slope_grad)
        a0_history.append(a0_now - alpha * intercept_grad)

    return (a0_history, a1_history)

# Two-parameter fit of SalePrice on 'Gr Liv Area': 20 steps with learning
# rate 3e-7, starting from a1 = 150 and a0 = 1000.
a0_params, a1_params = gradient_descent(
    xi_list=train['Gr Liv Area'],
    yi_list=train['SalePrice'],
    max_iterations=20,
    alpha=.0000003,
    a1_initial=150,
    a0_initial=1000,
)

### Ordinary Least Squares (OLS)
* Use linear algebra instead of gradient descent to find the final coefficients: $a = (X^T X)^{-1} X^T y$

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Reload the raw file and redo the positional 1460-row train/test split.
data = pd.read_csv('AmesHousing.txt', delimiter="\t")
train, test = data[0:1460], data[1460:]

features = [
    'Wood Deck SF', 'Fireplaces', 'Full Bath', '1st Flr SF', 'Garage Area',
    'Gr Liv Area', 'Overall Qual',
]

X = train[features]
y = train['SalePrice']

# Normal equation a = (X^T X)^(-1) X^T y, built in two pieces.
# No column of ones is added to X, so the fit has no intercept term.
first_term = np.linalg.inv(np.dot(X.T, X))   # (X^T X)^(-1)
second_term = np.dot(X.T, y)                 # X^T y
a = np.dot(first_term, second_term)
print(a)

#### Notes:
* Derivation of the OLS formula: https://eli.thegreenplace.net/2015/the-normal-equation-and-matrix-calculus/


### Processing and Transforming Features (Feature Engineering)
* transform categorical columns to the correct type (the pandas `category` dtype)
* dummy encoding: split each categorical column into multiple indicator columns, one per category value
* Having some domain knowledge can help with determining an acceptable cutoff value (e.g. how many missing values a column may have before it is dropped), and with feature engineering in general.
* Two ways to deal with missing data: (1) remove, (2) impute

In [None]:
import pandas as pd

# Reload the raw file and redo the positional 1460-row train/test split.
data = pd.read_csv('AmesHousing.txt', delimiter="\t")
train, test = data[0:1460], data[1460:]

# Number of missing values in each training column.
train_null_counts = train.isnull().sum()
print(train_null_counts)

# Training columns that have no missing values at all.
df_no_mv = train.loc[:, train_null_counts.eq(0)]
df_no_mv.columns

In [None]:
# Object-typed (text) columns among those with no missing values.
text_cols = df_no_mv.select_dtypes(include=['object']).columns

# Convert all of them to the categorical dtype in one call. astype returns a
# new frame, so nothing is assigned into `train` while it is still a slice of
# `data` (column-by-column assignment there triggers SettingWithCopyWarning).
train = train.astype({col: 'category' for col in text_cols})

# How often each integer category code occurs in 'Utilities'.
print(train['Utilities'].cat.codes.value_counts())

In [None]:
# Build the indicator (dummy) columns for every text column and attach them in
# a single concat, instead of growing `train` with one concat per column.
# The resulting column order is the same as before: the remaining original
# columns first, then the dummies in text_cols order.
# NOTE(review): get_dummies is called without a prefix, so the new columns are
# named after the raw category values; two text columns sharing a value would
# yield duplicate column names — confirm whether that matters downstream.
if len(text_cols) > 0:
    dummy_cols = pd.concat([pd.get_dummies(train[col]) for col in text_cols], axis=1)
else:
    dummy_cols = pd.DataFrame(index=train.index)

train = pd.concat([train.drop(columns=text_cols), dummy_cols], axis=1)

In [None]:
# New feature: 'Year Remod/Add' minus 'Year Built', row by row.
train['years_until_remod'] = train['Year Remod/Add'].sub(train['Year Built'])

In [None]:
import pandas as pd

# Reload the raw file and redo the positional 1460-row train/test split.
data = pd.read_csv('AmesHousing.txt', delimiter="\t")
train, test = data[0:1460], data[1460:]

# Training columns with at least one but fewer than 584 missing values
# (584 is 40% of the 1460 training rows).
train_null_counts = train.isnull().sum()
df_missing_values = train.loc[:, train_null_counts.gt(0) & train_null_counts.lt(584)]

print(df_missing_values.isnull().sum())
print(df_missing_values.dtypes)

In [None]:
# Take the float-typed columns that have gaps and fill every missing entry
# with the mean of its own column.
float_cols = df_missing_values.select_dtypes(include=['float']).pipe(
    lambda frame: frame.fillna(frame.mean())
)

### Tips
* As we mentioned earlier, succeeding in predictive modeling (and competitions like Kaggle) is highly dependent on the quality of features the model has.