In [2]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Importing built-in dataset
from sklearn.datasets import fetch_california_housing

# Renaming the dataset
housing = fetch_california_housing()

In [4]:
housing.data

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [5]:
# Checking the datset
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [6]:
import pandas as pd

housing_features = pd.DataFrame(housing.data, columns = housing.feature_names)

housing_target = pd.DataFrame(housing.target, columns= housing.target_names)


In [7]:
housing_features.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [8]:
housing_target.head()

Unnamed: 0,MedHouseVal
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    housing_features, housing_target, test_size=0.2, random_state=77
)

X_test.shape

(4128, 8)

# BASIC MODEL

In [10]:
%pip install linear_model

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement linear_model (from versions: none)
ERROR: No matching distribution found for linear_model


In [12]:
from sklearn.linear_model import LinearRegression

# Loading a blank linear regression model
model = LinearRegression()

# Fitting the modl
model.fit(X_train, y_train)

# Predicting using the model
predictions = model.predict(X_test)

predictions

array([[ 2.90809618],
       [ 2.14979624],
       [-0.14219725],
       ...,
       [ 2.09127012],
       [ 3.81936894],
       [ 2.85069818]])

In [13]:
from sklearn.metrics import r2_score

print(r2_score(y_test, predictions))

0.5977433933712177


## Basic Model with Feature Selection


In [14]:
# Reduction of Variables
# 1-2 omtiiotopm / redundant variables

housing_features_fs1 = housing_features.drop(['Latitude', 'Longitude'], axis=1)

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    housing_features_fs1, housing_target, test_size=0.2, random_state=77
)

model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(r2_score(y_test, predictions))

0.5359111365234248


In [17]:
# Reduction of Variables
# Random Testing
housing_features_fs2 = housing_features.drop(['Population'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    housing_features_fs2, housing_target, test_size=0.2, random_state=77
)

model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(r2_score(y_test, predictions))

0.5977294182527497


In [19]:
# Reduction of Variables
# Using test

# Scale first for heatmap

dataset = pd.concat([housing_features, housing_target], axis=1)
dataset_scaled = (dataset - dataset.min()) / (dataset.max() - dataset.min())
dataset_scaled

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,0.539668,0.784314,0.043512,0.020469,0.008941,0.001499,0.567481,0.211155,0.902266
1,0.538027,0.392157,0.038224,0.018929,0.067210,0.001141,0.565356,0.212151,0.708247
2,0.466028,1.000000,0.052756,0.021940,0.013818,0.001698,0.564293,0.210159,0.695051
3,0.354699,1.000000,0.035241,0.021929,0.015555,0.001493,0.564293,0.209163,0.672783
4,0.230776,1.000000,0.038534,0.022166,0.015752,0.001198,0.564293,0.209163,0.674638
...,...,...,...,...,...,...,...,...,...
20635,0.073130,0.470588,0.029769,0.023715,0.023599,0.001503,0.737513,0.324701,0.130105
20636,0.141853,0.333333,0.037344,0.029124,0.009894,0.001956,0.738576,0.312749,0.128043
20637,0.082764,0.313725,0.030904,0.023323,0.028140,0.001314,0.732200,0.311753,0.159383
20638,0.094295,0.333333,0.031783,0.024859,0.020684,0.001152,0.732200,0.301793,0.143713


In [20]:
%pip install seaborn

Collecting seaborn
  Using cached seaborn-0.12.2-py3-none-any.whl (293 kB)
Collecting matplotlib!=3.6.1,>=3.1 (from seaborn)
  Obtaining dependency information for matplotlib!=3.6.1,>=3.1 from https://files.pythonhosted.org/packages/01/50/0d8d8f044e2a0d8151e9ed59fe50924e9e697ba43a8b12d5ff9b45adb871/matplotlib-3.8.0-cp39-cp39-win_amd64.whl.metadata
  Using cached matplotlib-3.8.0-cp39-cp39-win_amd64.whl.metadata (5.9 kB)
Collecting contourpy>=1.0.1 (from matplotlib!=3.6.1,>=3.1->seaborn)
  Obtaining dependency information for contourpy>=1.0.1 from https://files.pythonhosted.org/packages/87/2b/9b49451f7412cc1a79198e94a771a4e52d65c479aae610b1161c0290ef2c/contourpy-1.1.1-cp39-cp39-win_amd64.whl.metadata
  Using cached contourpy-1.1.1-cp39-cp39-win_amd64.whl.metadata (5.9 kB)
Collecting cycler>=0.10 (from matplotlib!=3.6.1,>=3.1->seaborn)
  Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0 (from matplotlib!=3.6.1,>=3.1->seaborn)
  Obtaining dependency informa

In [25]:
import seaborn as sns

sns.heatmap(dataset_scaled.corr(), cmap="virdis", annot=True)

KeyError: "'virdis' is not a known colormap name"

In [27]:
housing_features_fs3 = housing_features.drop(['AveBedrms'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    housing_features_fs3, housing_target, test_size=0.2, random_state=77
)

model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
#print(r2_score(y_test, predictions))

0.5850029859403384


# Basic Model with Feature Engineering

In [28]:
# Handlin of Missing Data

# Chained methods
housing_features.isnull().values.any()

False

In [None]:
# Creating missing values for example
sample = {'MedInc': 2.03, }

# Hyperparameter Tuning

In [29]:
from sklearn.datasets import load_iris  
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [30]:
iris_features = pd.DataFrame(iris.data, columns = iris.feature_names)
iris_target = pd.DataFrame(iris.target, columns = ['Class'])
iris_target

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0
...,...
145,2
146,2
147,2
148,2


In [31]:
X_train, X_test, y_train, y_test = train_test_split(iris_features, iris_target, test_size=0.2, random_state=77)

DTmodel = DecisionTreeClassifier(random_state=77, criterion='entropy', max_depth=2, max_features=0.5, min_samples_leaf=5, min_samples_split=3)
DTmodel.fit(X_train, y_train)

predictions = DTmodel.predict(X_test)

In [32]:
predictions

array([1, 2, 1, 1, 0, 1, 2, 1, 0, 1, 0, 1, 0, 0, 0, 2, 2, 2, 0, 1, 0, 2,
       2, 2, 1, 1, 2, 0, 2, 1])

In [33]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, predictions))

0.7666666666666667


## Hyperparameter Tuning with GridSearch CV

In [36]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'max_depth':[1,2,3,4,5,10,20,30,40,50,60,70,80,90,100,None],
    'criterion': ['gini', 'entropy'],
    'max_features': [0.3, 0.5, 0.7, 0.9, None],
    'min_samples_leaf': [1,2,3,5,6,7,8,9,10,15],
    'min_samples_split': [2,3,4,5,6,10]
}

# Creating a blank model
grid_search_model = DecisionTreeClassifier(random_state=77)

# Gridsearch on the model
grid_search_model = GridSearchCV(
    grid_search_model,
    parameters,
    cv=6,
    scoring='accuracy', n_jobs=-1)

grid_results = grid_search_model.fit(X_train, y_train)

# Finding the best hyperparameters
print('Best params: ', grid_results.best_params_)

Best params:  {'criterion': 'gini', 'max_depth': 4, 'max_features': 0.9, 'min_samples_leaf': 1, 'min_samples_split': 2}


## Final Model

In [37]:
hyper_grid_model = DecisionTreeClassifier(random_state=77, criterion= 'gini', max_depth = 4, max_features=0.9, min_samples_leaf= 1, min_samples_split= 2)

hyper_grid_model.fit(X_train, y_train)


predictions = hyper_grid_model.predict(X_test)

print(accuracy_score(y_test, predictions))

0.9
