In [1]:
import sys

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn

%matplotlib inline

In [2]:
# Read both house-prices CSV files; the 'Id' column becomes the row index of each frame.
train, test = (
    pd.read_csv(csv_path, index_col=['Id'])
    for csv_path in ('data/house-prices/train.csv', 'data/house-prices/test.csv')
)

In [3]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictor columns.
train_sale_price = train['SalePrice']
train_no_sale_price = train.drop(columns=['SalePrice'])

# Hold out 30% of the rows; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train_no_sale_price,
    train_sale_price,
    test_size=0.3,
    random_state=4330,
)

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
"""
    accommodate_strings(data) performs the following steps:
    1. Fill missing values: numerical columns get 0, categorical columns get the placeholder string "NAN".
    2. Encode every categorical column as integer labels using a LabelEncoder.
    3. One-hot encode those integer labels using a OneHotEncoder.
    4. Return the numerical columns followed by the one-hot columns as a single array.
"""
    
def accommodate_strings(data):
    """Turn a mixed-type DataFrame into a purely numeric feature matrix.

    Numerical columns are kept as they are, with missing values replaced
    by 0. Object-dtype (categorical) columns have missing values replaced
    by the placeholder string "NAN", are integer-coded with one
    LabelEncoder per column and are then one-hot encoded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding numerical and/or object-dtype columns. Columns of
        any other dtype are ignored.

    Returns
    -------
    numpy.ndarray
        2-D float array with one row per row of ``data``: the numerical
        columns first (in their original order), followed by the one-hot
        columns.

    Notes
    -----
    The encoders are fitted on ``data`` itself, so the number and order of
    the one-hot columns depend on the categories present in this frame.
    Encoding two frames separately does not guarantee matching columns;
    encode once and split afterwards.
    """
    # select_dtypes picks columns by dtype directly; describe() computes
    # full statistics and, when there are no numeric columns at all,
    # silently describes the object columns instead.
    numerical_columns = data.select_dtypes(include="number").columns
    categorical_columns = data.select_dtypes(include="object").columns

    numeric_part = data[numerical_columns].fillna(0).to_numpy(dtype=float)

    # Nothing to encode: OneHotEncoder cannot be fitted on zero columns.
    if len(categorical_columns) == 0:
        return numeric_part

    cat_data = data[categorical_columns].fillna("NAN")

    # Integer-code each categorical column on its own, then expand the
    # codes into indicator columns.
    label_codes = pd.DataFrame(
        {col: LabelEncoder().fit_transform(cat_data[col]) for col in categorical_columns}
    )
    one_hot_part = OneHotEncoder().fit_transform(label_codes).toarray()

    # Stack positionally with NumPy. pd.concat would align on index labels,
    # and the encoded matrix does not carry the index of `data`.
    return np.hstack([numeric_part, one_hot_part])

# Encode the full predictor frame first, then hold out 30% of the rows
# using the fixed random_state.
new_tr = accommodate_strings(train_no_sale_price)
x_tr, x_te, y_tr, y_te = train_test_split(
    new_tr,
    train_sale_price,
    test_size=0.3,
    random_state=4330,
)

In [7]:
# Baseline model: ordinary least squares fitted on the encoded training split.
# fit() returns the estimator itself, so the name stays bound to the fitted model.
regressor = LinearRegression()
regressor = regressor.fit(x_tr, y_tr)

In [8]:
# Score the baseline on the held-out split (displayed as the cell output).
regressor.score(X=x_te, y=y_te)

0.84688815242815063

In [12]:
from sklearn.linear_model import Ridge

# Candidate regularisation strengths to compare.
alpha = [0.01, 0.1, 1, 10, 12]

# Fit one Ridge model per strength and report its score on both splits.
for a in alpha:
    all_data_lr = Ridge(alpha=a)
    all_data_lr.fit(x_tr, y_tr)
    print('alpha:', a)
    print("Train: {0}, Test: {1}".format(
        all_data_lr.score(x_tr, y_tr),
        all_data_lr.score(x_te, y_te),
    ))
    print()

alpha: 0.01
Train: 0.9374607177132872, Test: 0.8488214740142501

alpha: 0.1
Train: 0.9364454642419521, Test: 0.85962824282679

alpha: 1
Train: 0.922736065601494, Test: 0.8824267159987822

alpha: 10
Train: 0.8933107135132801, Test: 0.89224047848205

alpha: 12
Train: 0.8907684953353702, Test: 0.8924903654460451

