In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline



In [2]:
training_df =pd.read_csv('housing-classification-iter6.csv')

In [3]:
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   Expensive      1460 non-null   int64  
 10  MSZoning       1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Heating        1460 non-null   object 
 13  Street         1460 non-null   object 
 14  CentralAir     1460 non-null   object 
 15  Foundation     1460 non-null   object 
 16  ExterQual      1460 non-null   object 
 17  ExterCond      1460 non-null   object 
 18  BsmtQual

In [4]:
training_df['Expensive'].value_counts()

0    1243
1     217
Name: Expensive, dtype: int64

In [5]:
# Setting X and y
#X: columns that help us make a prediction.
#y: the column that we want to predict.

#Define X and y
X = training_df
y = X.pop("Expensive")

In [6]:
#Data Splitting (Train - Test)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [7]:
# Feature Engineering
X.loc.columns = X.columns.str[0]

In [8]:
#Categorical encoding - "Automated" approach (Using Pipelines)
#Creating the "numeric pipe" and the "categoric pipe"

# select categorical and numerical column names
from sklearn.preprocessing import OneHotEncoder
X_cat_columns = X.select_dtypes(exclude="number").copy().columns
X_num_columns = X.select_dtypes(include="number").copy().columns

# create numerical pipeline, only with the SimpleImputer(strategy="mean")
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="median"))
 
 # create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(handle_unknown='ignore')
)

In [9]:
#Using ColumnTransformer a pipeline with 2 branches (the preprocessor)
#We simply tell the pipeline the following:

#One branch, called "num_pipe", will apply the steps in the numeric_pipe to the columns named in X_num_columns
#The second branch, called "cat_pipe", will apply the steps in the categoric_pipe to the columns named in X_cat_columns


from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num_columns),
        ("cat_pipe", categoric_pipe, X_cat_columns),
    ]
)

In [10]:
#Creating the full_pipeline (preprocessor + Decision Tree)
#Pipelines are modular. The preprocessor we created above with the ColumnTransformer 
#can become now a step in a new pipeline, that we'll call full_piepline and will include,
#as a last step, a Decision Tree model:

# full_pipeline = make_pipeline(preprocessor, DecisionTreeClassifier())

full_pipeline = make_pipeline(preprocessor, 
                              DecisionTreeClassifier(max_depth=3, min_samples_leaf=6))

In [11]:
from sklearn import set_config
set_config(display="diagram") #or 'text'

In [12]:
set_config

<function sklearn._config.set_config(assume_finite=None, working_memory=None, print_changed_only=None, display=None)>

In [13]:
full_pipeline.fit(X_train, y_train)

In [14]:
full_pipeline.predict(X_train)

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [15]:
full_pipeline.score(X_train,y_train)

0.9443493150684932

In [16]:
full_pipeline.score(X_test,y_test)

0.9315068493150684

In [17]:
#Using GridsearchCV to find the best parameters

from sklearn.model_selection import GridSearchCV

full_pipeline = make_pipeline(preprocessor, 
                              DecisionTreeClassifier())

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    "decisiontreeclassifier__max_depth": range(4, 14, 4),
    "decisiontreeclassifier__min_samples_leaf": range(1, 12, 5)
}

search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1,
                      scoring='accuracy')

search.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [18]:
search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'mean',
 'decisiontreeclassifier__max_depth': 8,
 'decisiontreeclassifier__min_samples_leaf': 1}

In [19]:
search.best_score_

0.9280583984446645

In [20]:
#Prediction_section


In [21]:
predict_df=pd.read_csv('test.csv')


In [22]:
prediction_y = search.predict(predict_df)

In [23]:
prediction_y

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [24]:
prediction_df = pd.DataFrame(prediction_y,columns=['Expensive'])

In [25]:
prediction_df.head(1000)

Unnamed: 0,Expensive
0,0
1,0
2,0
3,0
4,0
...,...
995,0
996,0
997,0
998,0


In [26]:
#prediction_df.reset_index(drop=True, inplace=True)
prediction_df = prediction_df.rename_axis('Id').reset_index()


In [27]:
#prediction_df['id']= prediction_df

In [28]:
prediction_df

Unnamed: 0,Id,Expensive
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
1454,1454,0
1455,1455,0
1456,1456,1
1457,1457,0


In [29]:
prediction_df.to_csv('osazee.csv')

In [30]:
prediction_df

Unnamed: 0,Id,Expensive
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
1454,1454,0
1455,1455,0
1456,1456,1
1457,1457,0
