In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
def _snake_case(column_name):
    """Lower-case a column label and replace spaces with underscores."""
    return column_name.lower().replace(' ', '_')


# Read the two CSV files that the rest of the notebook works on.
train = pd.read_csv('train.csv')
train.head()

test = pd.read_csv('test.csv')
test.head()

# Apply the same snake_case labels to both frames.
train = train.rename(columns=_snake_case)
test = test.rename(columns=_snake_case)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,Normal,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,,,,0,4,2009,WD,Normal,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,,,,0,1,2010,WD,Abnorml,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,Normal,138500


In [5]:
# Share of each raw sale-condition label.
train['sale_condition'].value_counts(normalize=True)

# Binary target: 1 for 'Abnorml', 0 for every other sale condition.
# (The variable keeps its original spelling in case other cells reference it.)
encoded_conditon = {
    'Abnorml': 1,
    'Partial': 0,
    'AdjLand': 0,
    'Family': 0,
    'Normal': 0,
    'Alloca': 0
}

# Encode only while the column still holds the raw labels. Re-running this
# cell on an already-encoded column would push 0/1 through the dict and turn
# the whole target into NaN.
if not pd.api.types.is_numeric_dtype(train['sale_condition']):
    encoded_target = train['sale_condition'].map(encoded_conditon)
    # .map() yields NaN for any label missing from the dict; fail loudly
    # instead of carrying NaN targets into the models.
    unmapped = train.loc[encoded_target.isnull(), 'sale_condition'].unique()
    assert len(unmapped) == 0, 'Unmapped sale_condition labels: {}'.format(list(unmapped))
    train['sale_condition'] = encoded_target

train.head()

# Share of each encoded class (0 / 1).
train.sale_condition.value_counts(normalize=True)

# Our baseline is about 93.6% (always predicting 0, i.e. "not Abnorml")

# Per-column flags: True where a numeric column contains no missing values.
train_numeric_complete = train._get_numeric_data().isnull().sum() == 0
test_numeric_complete = test._get_numeric_data().isnull().sum() == 0

# A column passes only when it is flagged True in both frames.
nonnull_numeric_shared = train_numeric_complete & test_numeric_complete
nonnull_numeric_shared

# Hand-written feature list (v1).
# NOTE(review): 'id' and 'pid' look like row identifiers - confirm they
# are meant to be model inputs.
features = [
    '1st_flr_sf',
    '2nd_flr_sf',
    '3ssn_porch',
    'bedroom_abvgr',
    'enclosed_porch',
    'fireplaces',
    'full_bath',
    'gr_liv_area',
    'half_bath',
    'id',
    'kitchen_abvgr',
    'lot_area',
    'low_qual_fin_sf',
    'misc_val',
    'mo_sold',
    'ms_subclass',
    'open_porch_sf',
    'overall_cond',
    'overall_qual',
    'pid',
    'pool_area',
    'screen_porch',
    'totrms_abvgrd',
    'wood_deck_sf',
    'year_built',
    'year_remod/add',
    'yr_sold',
]

# LOOK AT THESE FOR THE ONES YOU CARE ABOUT

# Seven columns with the largest absolute correlation to the encoded target
# (the target itself normally comes first with 1.0).
# Select numeric/bool columns explicitly: pandas >= 2.0 no longer drops string
# columns inside DataFrame.corr() and raises instead.
train.select_dtypes(include=['number', 'bool']).corr()['sale_condition'].abs().sort_values(ascending=False).head(7)

# How many rows are missing bsmt_unf_sf?
train.bsmt_unf_sf.isnull().sum()

# Show the affected rows. NaN never compares equal to anything (not even
# itself), so `== np.nan` always selected nothing; isnull() is the correct test.
train.loc[train['bsmt_unf_sf'].isnull(), :]

train.shape

# Let's just drop it! YOLO!

train = train.dropna(subset=['bsmt_unf_sf'], how='any')

train.shape

train.garage_yr_blt.isnull().sum()

# Let's get really drop happy

train = train.dropna(subset=['garage_yr_blt'], how='any')

train.shape

train.yr_sold.isnull().sum()

# Alternative (v2) feature set.
# NOTE(review): 'overall_cond' and 'total_bsmt_sf' are not null-checked in
# this cell - confirm they are complete before modelling with this list.
corr_features = ['bsmt_unf_sf', 'overall_cond', 'garage_yr_blt', 'total_bsmt_sf', 'yr_sold']

Normal     0.826914
Partial    0.079961
Abnorml    0.064359
Family     0.014139
Alloca     0.009264
AdjLand    0.005363
Name: sale_condition, dtype: float64

# Model Time:

In [25]:
# Two feature sets were tried: v1 = `features`, v2 = `corr_features`.
# Both X assignments used to run back to back, so the v2 selection was a dead
# store that v1 overwrote straight away. Choose the set explicitly instead;
# v1 stays the active one, matching what the cell actually trained on.
feature_sets = {'v1': features, 'v2': corr_features}
ACTIVE_FEATURE_SET = 'v1'

X = train[feature_sets[ACTIVE_FEATURE_SET]]
y = train['sale_condition']

# Stratify on y so both splits keep the same class proportions.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, random_state=69, stratify=y)

# Fit the scaler on the training split only, then apply it to both splits.
ss = StandardScaler()
X_ts = ss.fit_transform(X_train)
X_hs = ss.transform(X_holdout)

knn = KNeighborsClassifier()

# NOTE(review): X_ts was scaled before cross-validation, so every CV fold's
# validation rows contributed to the scaler statistics.
cross_val_score(knn, X_ts, y_train).mean()

# So like about the same as the baseline...

# Grid over neighbour count, Minkowski power (1 or 2) and vote weighting.
params = {
    'n_neighbors': range(2, 30),
    'p': [1, 2],
    'weights': ['uniform', 'distance']
}
gs = GridSearchCV(knn, params)
gs.fit(X_ts, y_train)

print(gs.best_params_)
print(gs.best_score_)

# Still like the same as baseline...

# Scoring it on the holdout set to make sure we're not overfitting
# Compare to the cross_val_score

gs.score(X_hs, y_holdout)

# Slightly better?

test.head(3)

# Build the scaled test matrix with the same columns and the same fitted
# scaler as the training data. These lines used to be commented out, so
# X_test_scaled existed only as leftover kernel state and the cell raised
# NameError on a fresh run.
# If the model was fitted on corr_features instead, first fill
# test['garage_yr_blt'] with train['garage_yr_blt'].median() and select
# corr_features here.
X_test = test[features]
X_test_scaled = ss.transform(X_test)

# Keep the predictions in a separate submission frame. Renaming 'id' -> 'Id'
# on `test` itself breaks any later `test[features]` lookup, because
# `features` contains 'id'.
knn_submission = pd.DataFrame({
    'Id': test['id'].values,
    'Sale Condition': gs.predict(X_test_scaled),
})
knn_submission = knn_submission[['Id', 'Sale Condition']]

knn_submission.head()

# Earlier runs were written to 'p2_class_1.csv' and 'p2_class_2.csv'.
knn_submission.to_csv('p2_class_3.csv', index=False)

# Read the file back to sanity-check what was written.
submission = pd.read_csv('p2_class_3.csv')
submission.head(2)
submission['Sale Condition'].value_counts()

# Let's give it a shot with Logistic Regression

In [24]:
# Design matrix and binary target for the logistic-regression run.
X, y = train[features], train['sale_condition']

In [26]:
# Stratified train/holdout split with a fixed seed.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    stratify=y,
    random_state=69
)

In [27]:
# Learn the scaling statistics from the training split only, then apply
# them to both splits.
ss = StandardScaler()
ss.fit(X_train)
X_ts = ss.transform(X_train)
X_hs = ss.transform(X_holdout)

In [28]:
# Pin the solver explicitly. The grid search further down tries
# penalty='l1', which liblinear supports but the lbfgs default of
# scikit-learn >= 0.22 rejects. liblinear was the default when this notebook
# was first run, so results there are unchanged.
logreg = LogisticRegression(solver='liblinear')

# Mean cross-validated score on the scaled training split.
cross_val_score(logreg, X_ts, y_train).mean()

0.9421514415931922

In [29]:
# Ugh about the same

In [31]:
# Search over regularisation constant C and penalty type.
params = dict(C=[1, 10, 100, 1000], penalty=['l1', 'l2'])
gs = GridSearchCV(estimator=logreg, param_grid=params)
gs.fit(X_ts, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1, 10, 100, 1000], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [33]:
# Best hyper-parameters, then their mean cross-validated score.
for summary in (gs.best_params_, gs.best_score_):
    print(summary)

{'C': 1, 'penalty': 'l1'}
0.942148760331


In [34]:
# Score the refit best estimator on the untouched holdout split.
gs.score(X=X_hs, y=y_holdout)

0.94421487603305787

In [35]:
# Pick the model's feature columns from test and reuse the scaler that was
# fitted on the training split.
X_test = test.loc[:, features]
X_test_scaled = ss.transform(X_test)
X_test_scaled

array([[-0.68140572,  1.60405663, -0.10080925, ..., -2.21609559,
        -1.70103413, -1.36634201],
       [ 1.95901846, -0.77382207, -0.10080925, ...,  0.10086261,
        -0.39331098, -1.36634201],
       [-1.2897754 ,  1.16578095, -0.10080925, ...,  1.10372511,
         1.01128055, -1.36634201],
       ..., 
       [ 0.07406976, -0.77382207, -0.10080925, ..., -0.21037058,
        -0.8292187 ,  0.16387673],
       [-0.79111172, -0.77382207, -0.10080925, ..., -0.10662619,
        -0.68391613, -0.60123264],
       [-0.6390193 , -0.77382207, -0.10080925, ..., -0.65992964,
        -1.45886318, -0.60123264]])

In [36]:
# Store the grid search's predictions as a new column on test.
predicted_condition = gs.predict(X_test_scaled)
test['sale_condition'] = predicted_condition

In [37]:
# Switch to the column labels used in the submission file.
submission_labels = {'id': 'Id', 'sale_condition': 'Sale Condition'}
test = test.rename(columns=submission_labels)
test.head()

Unnamed: 0,Id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,Sale Condition
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,0
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,,,,0,8,2006,WD,0
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,,,,0,9,2006,New,0
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,,,,0,7,2007,WD,0
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,185,0,,,,0,7,2009,WD,0


In [38]:
# Write only the id and prediction columns, without the index.
test.loc[:, ['Id', 'Sale Condition']].to_csv('p2_logreg_1.csv', index=False)

In [39]:
# Read the written file back and count the predictions per class.
submission = pd.read_csv('p2_logreg_1.csv')
submission.head(2)
submission.loc[:, 'Sale Condition'].value_counts()

0    879
Name: Sale Condition, dtype: int64

In [40]:
# Same exact prediction as above... not worth submitting.