# Submission 05 -- Exploratory Data Analysis PIPES

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import train_test_split, cross_val_score

In [13]:
df = pd.read_csv('../datasets/train_clean.csv')

df.shape

(2051, 263)

## First attempt: Throw all columns into a LASSO regression

In [14]:
X = df.drop(columns='saleprice')
y = df['saleprice']

X_train, X_test, y_train, y_test = train_test_split(X, y)

### Scale my feature dataframe

In [31]:
l_alpha = np.logspace(-3, 10, 100)

pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lasso', LassoCV(alphas=l_alpha, cv=5, max_iter=5000))
])

In [32]:
pipe.get_params()

{'memory': None,
 'steps': [('ss', StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('lasso',
   LassoCV(alphas=array([1.00000e-03, 1.35305e-03, ..., 7.39072e+09, 1.00000e+10]),
       copy_X=True, cv=5, eps=0.001, fit_intercept=True, max_iter=5000,
       n_alphas=100, n_jobs=None, normalize=False, positive=False,
       precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
       verbose=False))],
 'ss': StandardScaler(copy=True, with_mean=True, with_std=True),
 'lasso': LassoCV(alphas=array([1.00000e-03, 1.35305e-03, ..., 7.39072e+09, 1.00000e+10]),
     copy_X=True, cv=5, eps=0.001, fit_intercept=True, max_iter=5000,
     n_alphas=100, n_jobs=None, normalize=False, positive=False,
     precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
     verbose=False),
 'ss__copy': True,
 'ss__with_mean': True,
 'ss__with_std': True,
 'lasso__alphas': array([1.00000000e-03, 1.35304777e-03, 1.83073828e-03, 2.47707636e-03,
        3.35160265e-03, 4.5

In [30]:
pipe.fit_transform(X_train, y_train,{
    'ss__X' : X_train,
})

TypeError: fit_transform() takes from 2 to 3 positional arguments but 4 were given

In [16]:
pipe.transform()

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lasso', LassoCV(alphas=array([1.00000e-03, 1.35305e-03, ..., 7.39072e+09, 1.00000e+10]),
    copy_X=True, cv=5, eps=0.001, fit_intercept=True, max_iter=5000,
    n_alphas=100, n_jobs=None, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False))])

In [17]:
pipe.score(X_train, y_train)

  Xt = transform.transform(Xt)


0.9245608970766497

In [18]:
pipe.

{'memory': None,
 'steps': [('ss', StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('lasso',
   LassoCV(alphas=array([1.00000e-03, 1.35305e-03, ..., 7.39072e+09, 1.00000e+10]),
       copy_X=True, cv=5, eps=0.001, fit_intercept=True, max_iter=5000,
       n_alphas=100, n_jobs=None, normalize=False, positive=False,
       precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
       verbose=False))],
 'ss': StandardScaler(copy=True, with_mean=True, with_std=True),
 'lasso': LassoCV(alphas=array([1.00000e-03, 1.35305e-03, ..., 7.39072e+09, 1.00000e+10]),
     copy_X=True, cv=5, eps=0.001, fit_intercept=True, max_iter=5000,
     n_alphas=100, n_jobs=None, normalize=False, positive=False,
     precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
     verbose=False),
 'ss__copy': True,
 'ss__with_mean': True,
 'ss__with_std': True,
 'lasso__alphas': array([1.00000000e-03, 1.35304777e-03, 1.83073828e-03, 2.47707636e-03,
        3.35160265e-03, 4.5

In [None]:
pipe.fit_transform()

In [None]:
ss = StandardScaler()
Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

In [None]:
# Set up a list of Lasso alphas to check.
l_alpha = np.logspace(-3, 10, 100)

# Cross-validate over our list of Lasso alphas.
lasso = LassoCV(alphas=l_alpha, cv=5, max_iter=5000)

# Fit model using best ridge alpha!
lasso.fit(Z_train, y_train)

In [None]:
lasso.alpha_

In [None]:
lasso.score(Z_train, y_train)

In [None]:
lasso.score(Z_test, y_test)

In [None]:
coef_list = pd.Series({col:coef for col,coef in zip(X_train.columns,lasso.coef_)})

In [None]:
coef_list_not_zero = coef_list[round(coef_list,8) > 0.0]

In [None]:
coef_list_not_zero.index

### Create submission: `sub01.csv`
Do all the above for the test set

In [None]:
test = pd.read_csv('../datasets/test_clean.csv')
test.shape

In [None]:
test_trans = ss.transform(test)

In [None]:
test['SalePrice'] = lasso.predict(test_trans)

In [None]:
test[['id','SalePrice']].rename({'id':'Id'},axis=1).to_csv('../submissions/sub_01.csv'
                                                           ,index=False)

## Second Attempt
Using the Lasso model fit to a log transformation of the `saleprice`.

In [None]:
X = df.drop(columns='saleprice')
y = np.log1p(df['saleprice'])

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
ss = StandardScaler()
Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

In [None]:
# Set up a list of Lasso alphas to check.
l_alpha = np.logspace(-3, 10, 100)

# Cross-validate over our list of Lasso alphas.
lasso = LassoCV(alphas=l_alpha, cv=5, max_iter=5000)

# Fit model using best ridge alpha!
lasso.fit(Z_train, y_train)

In [None]:
lasso.alpha_

In [None]:
cross_val_score(lasso, Z_train, y_train).mean()

In [None]:
lasso.score(Z_train, y_train)

In [None]:
lasso.score(Z_test, y_test)

### Create submission: `sub02.csv`
Do all the above for the test set

In [None]:
test = pd.read_csv('../datasets/test_clean.csv')
test.shape

In [None]:
test_trans = ss.transform(test)

In [None]:
test['SalePrice'] = np.expm1(lasso.predict(test_trans))

In [None]:
test[['id','SalePrice']].rename({'id':'Id'},axis=1).to_csv('../submissions/sub_02.csv'
                                                           ,index=False)

## Third Attempt
Using the Lasso coefficients. Narrow in on better features.

In [None]:
# Create a series of column names zipped with their lasso coefficients
coef_list = pd.Series({col:coef for col,coef in zip(X_train.columns,lasso.coef_)})

# Pull the columns that did not have 0 coefficients as my new features.
features = coef_list[round(coef_list,2) > 0.0].index

### Use `PolynomialFeatures()` to create interaction columns.
Using the list of columns from the submission above, we are going to create polynomial columns and see how they perform.

In [None]:
from sklearn.preprocessing import PolynomialFeatures

### Import Train, Polynomial Transform, Scale and Model

In [None]:
train = pd.read_csv('../datasets/train_clean.csv')
X = train[features]
y = np.log1p(train['saleprice'])

#### Create Polynomial Features

In [None]:
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

#### Train Test Split the Data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_poly, y)

#### Scale the data

In [None]:
ss = StandardScaler()
Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

#### Instantiate and fit model.

In [None]:
lasso = LassoCV()

In [None]:
cross_val_score(lasso, Z_train, y_train, cv=5).mean()

In [None]:
lasso.fit(Z_train, y_train)

In [None]:
lasso.score(Z_train, y_train)

In [None]:
lasso.score(Z_test, y_test)

### Create submission: `sub03.csv`
Do all the above for the test set

In [None]:
# Pull clean csv file.
test = pd.read_csv('../datasets/test_clean.csv')

# Pull our features from the testing dataframe.
X = test[features]

# Transform our data
X_poly = poly.transform(X)

# Scale our data
Z_test = ss.transform(X_poly)

# Predict our prices, and convert into interpretable values.
test['SalePrice'] = np.expm1(lasso.predict(Z_test))

# Create csv
test[['id','SalePrice']].rename({'id':'Id'},axis=1).to_csv('../submissions/sub_03.csv'
                                                           ,index=False)

In [None]:
# Create a series of column names zipped with their lasso coefficients
coef_list = pd.Series({col:coef for col,coef in zip(poly.get_feature_names(features),lasso.coef_)})

# Pull the columns that did not have 0 coefficients as my new features.
features = coef_list[round(coef_list,2) > 0.0]
# features #.sort_values(ascending=False)

In [None]:
features[features > 0.01].sort_values(ascending=False)

## Fourth Attempt: Leaving the Lasso
Our model is performing, but our lasso is adding an error feature that may be making it worse. Using the data we have from the previous models, we will attempt some more transformations to improve out model, but leave the Lasso model behind.

In [None]:
train, X, y = fix_data_get_X_y('../datasets/data_fillna.csv',features)
X.head(), y.head()

In [None]:
lr = LinearRegression()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [None]:
cross_val_score(lr, X_train, y_train, cv=5).mean()

In [None]:
lr.fit(X_train,y_train)

In [None]:
lr.score(X_train,y_train)

In [None]:
lr.score(X_test,y_test)

In [None]:
def create_submission(model=lr,feat=features,test_name='sub_04'):
    
    df, X_test, y_test = fix_data_get_X_y('./datasets/test_fillna.csv',feat,model=model,test=True)
    print(X_test.columns)
    df['SalePrice'] = y_test
    submission = df[['Id','SalePrice']]
    
    submission.to_csv(f'./datasets/{test_name}.csv',index=False)
    return submission.shape

In [None]:
create_submission(lr,features,'sub_04')

In [None]:
df[['Overall Qual', 'Exter Qual','Gr Liv Area', 'Kitchen Qual','Bsmt Qual']].dtypes

In [None]:
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer
poly = PolynomialFeatures(degree=3)

df.select_dtypes(include='number').columns

features = ['Overall Qual', 'Exter Qual', 'Gr Liv Area', 'Kitchen Qual','Bsmt Qual']
X_train = poly.fit_transform(df[features])

print(X_train)

data = pd.DataFrame(X_train,columns=poly.get_feature_names(features))

data['SalePrice'] = df['SalePrice']

plt.figure(figsize=(7,100))
sns.heatmap(data.corr()[['SalePrice']].sort_values(by='SalePrice',ascending=False)
            ,annot=True
            ,cmap='viridis')



In [None]:
X_train

In [None]:
X_train.shape

In [None]:
df['Exter Qual'].value_counts()