# Submission 01 -- Exploratory Data Analysis

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import train_test_split, cross_val_score

In [4]:
df = pd.read_csv('../datasets/train_clean.csv')

df.shape

(2051, 263)

## First attempt: Throw all columns into a LASSO regression

Our first attempt will be to throw everything into a Lasso Regression. Hopefully, we will also see which parameters have the greatest effect on the target. 

In [5]:
X = df.drop(columns='saleprice')
y = df['saleprice']

X_train, X_test, y_train, y_test = train_test_split(X, y)

### Scale my feature dataframe
We are going to scale our data before fitting our LASSO model.

In [6]:
ss = StandardScaler()
Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  This is separate from the ipykernel package so we can avoid doing imports until


### Fit our Lasso Model 

In [7]:
# Set up a list of Lasso alphas to check.
l_alpha = np.logspace(-3, 10, 100)

# Cross-validate over our list of Lasso alphas.
lasso = LassoCV(alphas=l_alpha, cv=5, max_iter=5000)

# Fit model using best ridge alpha!
lasso.fit(Z_train, y_train)

LassoCV(alphas=array([1.00000e-03, 1.35305e-03, ..., 7.39072e+09, 1.00000e+10]),
    copy_X=True, cv=5, eps=0.001, fit_intercept=True, max_iter=5000,
    n_alphas=100, n_jobs=None, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [8]:
lasso.alpha_

811.130830789689

In [9]:
lasso.score(Z_train, y_train)

0.9224362435835176

In [10]:
lasso.score(Z_test, y_test)

0.9194965504111114

### Filter out irrelavant features
We can see that our model has a lot of error due to variance, since our training score is so much higher than our test score.

In a future iteration, we can take out any features that the LASSO regression has given zero coefficients to simplify our model.

### Create submission: `sub01.csv`
Do all the above for the test set to generate predictions.

In [11]:
test = pd.read_csv('../datasets/test_clean.csv')
test.shape

(879, 262)

In [12]:
# Scale the testing data using the standard scaler 
# fit from our training data.
test_trans = ss.transform(test)

  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
# Generate our predictions based of the LassoCV
test['SalePrice'] = lasso.predict(test_trans)

In [14]:
# Rename the id column and store submission.
test[['id','SalePrice']].rename({'id':'Id'},axis=1).to_csv('../submissions/sub_01.csv'
                                                           ,index=False)

### Kaggle Result
- This model scored a 22618.10395 on the Kaggle Leaderboard.