## Import data

In [34]:
import pandas as pd
import numpy as np
title= pd.read_csv('Title_Defect_Dataset.csv')
title.head()

Unnamed: 0,Property_ID,StatusOwnerOccupiedFlag,TaxAssessedValueTotal,TaxAssessedImprovementsPerc,TaxMarketValueTotal,TaxMarketImprovementsPerc,AreaBuilding,BedroomsCount,StoriesCount,DeckFlag,...,X_PERC_CDPD60,X_PERC_CForeclosure,X_PERC_CDerogatory,X_PERC_CBankruptcy,X_PERC_MDPD60,X_PERC_MDerogatory,X_PERC_MBankruptcy,X_Bankruptcy_Index,X_Risk_Score,X_Vantage_Score
0,244558691,,18500.0,0.0,18450.0,,1716,4,2.0,0.0,...,11.0,11.0,11.0,0.0,11.0,11.0,0.0,287.0,727.0,817.0
1,168532803,1.0,110000.0,90.0,183333.0,,1192,3,1.0,0.0,...,19.0,0.0,16.0,0.0,3.0,3.0,0.0,295.0,744.0,825.0
2,170243135,1.0,190000.0,81.0,316667.0,,3484,3,2.0,0.0,...,19.0,0.0,16.0,0.0,3.0,3.0,0.0,295.0,744.0,825.0
3,50276190,1.0,191000.0,91.0,318333.0,,2400,4,1.0,0.0,...,19.0,0.0,16.0,0.0,3.0,3.0,0.0,295.0,744.0,825.0
4,38057038,1.0,110000.0,75.0,183333.0,,1200,3,1.0,0.0,...,19.0,0.0,16.0,0.0,3.0,3.0,0.0,295.0,744.0,825.0


In [35]:
data = title[['StatusOwnerOccupiedFlag', 'TaxAssessedValueTotal',
       'TaxMarketValueTotal', 'dr_Title_Defect_Ind', 'dr_MktValueToIncome',
       'dr_Condo_Ind', 'dr_Ownership_Period', 'dr_House_Age',
       'dr_Value_Change', 'X_PERC_CDPD60', 'X_PERC_CForeclosure',
       'X_PERC_CDerogatory', 'X_PERC_CBankruptcy', 'X_PERC_MDPD60','X_PERC_MDerogatory', 'X_PERC_MBankruptcy',
       'X_Bankruptcy_Index', 'X_Risk_Score', 'X_Vantage_Score']]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500250 entries, 0 to 500249
Data columns (total 19 columns):
StatusOwnerOccupiedFlag    334401 non-null float64
TaxAssessedValueTotal      500173 non-null float64
TaxMarketValueTotal        500173 non-null float64
dr_Title_Defect_Ind        500250 non-null int64
dr_MktValueToIncome        496926 non-null float64
dr_Condo_Ind               500250 non-null int64
dr_Ownership_Period        361899 non-null float64
dr_House_Age               447048 non-null float64
dr_Value_Change            500250 non-null object
X_PERC_CDPD60              454631 non-null float64
X_PERC_CForeclosure        454631 non-null float64
X_PERC_CDerogatory         454631 non-null float64
X_PERC_CBankruptcy         454631 non-null float64
X_PERC_MDPD60              454631 non-null float64
X_PERC_MDerogatory         454631 non-null float64
X_PERC_MBankruptcy         454631 non-null float64
X_Bankruptcy_Index         454631 non-null float64
X_Risk_Score               

## Data Cleaning

In [36]:
#Replacing INC, DEC, NC with integers
data = data.replace(to_replace=['INC', 'DEC', 'NC'],
           value= [1,0,-1])



In [37]:
data.isna().sum()

StatusOwnerOccupiedFlag    165849
TaxAssessedValueTotal          77
TaxMarketValueTotal            77
dr_Title_Defect_Ind             0
dr_MktValueToIncome          3324
dr_Condo_Ind                    0
dr_Ownership_Period        138351
dr_House_Age                53202
dr_Value_Change                 0
X_PERC_CDPD60               45619
X_PERC_CForeclosure         45619
X_PERC_CDerogatory          45619
X_PERC_CBankruptcy          45619
X_PERC_MDPD60               45619
X_PERC_MDerogatory          45619
X_PERC_MBankruptcy          45619
X_Bankruptcy_Index          45619
X_Risk_Score                45619
X_Vantage_Score             45619
dtype: int64

In [38]:
#Only dropping rows with missing values in the following columns
cleaned_data = data.dropna(subset=['StatusOwnerOccupiedFlag','dr_Ownership_Period','dr_Condo_Ind','X_PERC_CForeclosure'])
cleaned_data.isna().sum()

StatusOwnerOccupiedFlag       0
TaxAssessedValueTotal         2
TaxMarketValueTotal           2
dr_Title_Defect_Ind           0
dr_MktValueToIncome           2
dr_Condo_Ind                  0
dr_Ownership_Period           0
dr_House_Age               2347
dr_Value_Change               0
X_PERC_CDPD60                 0
X_PERC_CForeclosure           0
X_PERC_CDerogatory            0
X_PERC_CBankruptcy            0
X_PERC_MDPD60                 0
X_PERC_MDerogatory            0
X_PERC_MBankruptcy            0
X_Bankruptcy_Index            0
X_Risk_Score                  0
X_Vantage_Score               0
dtype: int64

In [39]:
#Drop all rows with a missing value in all columns
revised_data= data.dropna()
revised_data

Unnamed: 0,StatusOwnerOccupiedFlag,TaxAssessedValueTotal,TaxMarketValueTotal,dr_Title_Defect_Ind,dr_MktValueToIncome,dr_Condo_Ind,dr_Ownership_Period,dr_House_Age,dr_Value_Change,X_PERC_CDPD60,X_PERC_CForeclosure,X_PERC_CDerogatory,X_PERC_CBankruptcy,X_PERC_MDPD60,X_PERC_MDerogatory,X_PERC_MBankruptcy,X_Bankruptcy_Index,X_Risk_Score,X_Vantage_Score
2,1.0,190000.0,316667.0,0,3.98,0,19.0,18.0,-1,19.0,0.0,16.0,0.0,3.0,3.0,0.0,295.0,744.0,825.0
3,1.0,191000.0,318333.0,0,4.00,0,0.0,39.0,0,19.0,0.0,16.0,0.0,3.0,3.0,0.0,295.0,744.0,825.0
4,1.0,110000.0,183333.0,0,2.30,0,14.0,12.0,-1,19.0,0.0,16.0,0.0,3.0,3.0,0.0,295.0,744.0,825.0
7,1.0,142400.0,167529.0,0,2.10,0,11.0,62.0,-1,28.0,0.0,28.0,0.0,7.0,7.0,0.0,264.0,714.0,783.0
9,1.0,124400.0,146353.0,0,1.84,0,6.0,84.0,-1,28.0,0.0,28.0,0.0,7.0,7.0,0.0,264.0,714.0,783.0
12,1.0,88600.0,147667.0,0,1.85,0,16.0,45.0,-1,19.0,0.0,16.0,0.0,3.0,3.0,0.0,295.0,744.0,825.0
13,1.0,80400.0,134000.0,0,1.68,0,8.0,43.0,-1,19.0,0.0,16.0,0.0,3.0,3.0,0.0,295.0,744.0,825.0
20,1.0,116500.0,137059.0,0,1.72,0,12.0,78.0,-1,28.0,0.0,28.0,0.0,7.0,7.0,0.0,264.0,714.0,783.0
23,1.0,113500.0,189167.0,0,2.37,0,10.0,53.0,-1,19.0,0.0,16.0,0.0,3.0,3.0,0.0,295.0,744.0,825.0
32,1.0,110500.0,184167.0,0,2.31,0,9.0,88.0,-1,19.0,0.0,16.0,0.0,3.0,3.0,0.0,295.0,744.0,825.0


## Feature Selection

Lasso Method

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
X = revised_data.loc[:, revised_data.columns != 'dr_Title_Defect_Ind']
y = revised_data['dr_Title_Defect_Ind']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [41]:
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [42]:
sel_ = SelectFromModel(Lasso(alpha=0))
sel_.fit(scaler.transform(X_train), y_train)
sel_.get_support()

  self.estimator_.fit(X, y, **fit_params)
  positive)
  positive)


array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

In [43]:
selected_feat = X_train.columns[(sel_.get_support())]
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
      np.sum(sel_.estimator_.coef_ == 0)))

total features: 18
selected features: 17
features with coefficients shrank to zero: 1


In [44]:
selected_feat

Index(['TaxAssessedValueTotal', 'TaxMarketValueTotal', 'dr_MktValueToIncome',
       'dr_Condo_Ind', 'dr_Ownership_Period', 'dr_House_Age',
       'dr_Value_Change', 'X_PERC_CDPD60', 'X_PERC_CForeclosure',
       'X_PERC_CDerogatory', 'X_PERC_CBankruptcy', 'X_PERC_MDPD60',
       'X_PERC_MDerogatory', 'X_PERC_MBankruptcy', 'X_Bankruptcy_Index',
       'X_Risk_Score', 'X_Vantage_Score'],
      dtype='object')

Ridge Method

In [95]:
sel_r = SelectFromModel(Ridge(alpha=0))
sel_r.fit(scaler.transform(X_train), y_train)
sel_r.get_support()

array([False,  True, False, False,  True,  True,  True, False,  True,
       False, False, False, False, False, False, False,  True,  True])

In [98]:
selected_feat = X_train.columns[(sel_r.get_support())]
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
      np.sum(sel_r.estimator_.coef_ == 0)))

total features: 18
selected features: 7
features with coefficients shrank to zero: 1


In [97]:
selected_feat

Index(['TaxAssessedValueTotal', 'dr_Condo_Ind', 'dr_Ownership_Period',
       'dr_House_Age', 'X_PERC_CDPD60', 'X_Risk_Score', 'X_Vantage_Score'],
      dtype='object')

## Models

#### KNN

In [46]:
#Create x and y for training
y = revised_data['dr_Title_Defect_Ind']
X = revised_data[['TaxAssessedValueTotal', 'TaxMarketValueTotal', 'dr_MktValueToIncome',
       'dr_Condo_Ind', 'dr_Ownership_Period', 'dr_House_Age',
       'dr_Value_Change', 'X_PERC_CDPD60', 'X_PERC_CForeclosure',
       'X_PERC_CDerogatory', 'X_PERC_CBankruptcy', 'X_PERC_MDPD60',
       'X_PERC_MDerogatory', 'X_PERC_MBankruptcy', 'X_Bankruptcy_Index',
       'X_Risk_Score', 'X_Vantage_Score']]

X.head()

Unnamed: 0,TaxAssessedValueTotal,TaxMarketValueTotal,dr_MktValueToIncome,dr_Condo_Ind,dr_Ownership_Period,dr_House_Age,dr_Value_Change,X_PERC_CDPD60,X_PERC_CForeclosure,X_PERC_CDerogatory,X_PERC_CBankruptcy,X_PERC_MDPD60,X_PERC_MDerogatory,X_PERC_MBankruptcy,X_Bankruptcy_Index,X_Risk_Score,X_Vantage_Score
2,190000.0,316667.0,3.98,0,19.0,18.0,-1,19.0,0.0,16.0,0.0,3.0,3.0,0.0,295.0,744.0,825.0
3,191000.0,318333.0,4.0,0,0.0,39.0,0,19.0,0.0,16.0,0.0,3.0,3.0,0.0,295.0,744.0,825.0
4,110000.0,183333.0,2.3,0,14.0,12.0,-1,19.0,0.0,16.0,0.0,3.0,3.0,0.0,295.0,744.0,825.0
7,142400.0,167529.0,2.1,0,11.0,62.0,-1,28.0,0.0,28.0,0.0,7.0,7.0,0.0,264.0,714.0,783.0
9,124400.0,146353.0,1.84,0,6.0,84.0,-1,28.0,0.0,28.0,0.0,7.0,7.0,0.0,264.0,714.0,783.0


In [50]:
#Train test split and gridserach to tune model
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

param_grid = {'n_neighbors': np.arange(1, 15, 2)} 

grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=10)

grid.fit(X_train, y_train)

#extract best score and parameter by calling objects "best_score_" and "best_params_"
print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))
print("test-set score: {:.3f}".format(grid.score(X_test, y_test)))

best mean cross-validation score: 0.998
best parameters: {'n_neighbors': 7}
test-set score: 0.998
