# Logistic Regression models are for CLASSIFICATION not Regression
## Data preparation and cleaning steps are the same for regression or classification models
## Classification is used to predict categorical data (e.g. what is this an image of, what type is it)
## Logistic regression models the probability of the label Y given the features X
## $p(y_i) = \dfrac{1}{1 + e^{-(A + Bx_i)}}$
#### A is the intercept; B is the regression coefficient; e is the constant 2.71828 (Euler's number)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the housing dataset (path is relative to the notebook's working directory)
housing_data = pd.read_csv('Data_Files/housing.csv')
# Preview 5 randomly chosen rows; no seed is set, so the sample differs on every run
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
420,-122.26,37.89,52,3706,531.0,1205,504,6.6828,370900,NEAR BAY
2047,-119.72,36.72,15,1713,246.0,766,232,6.8162,127200,INLAND
1730,-122.35,37.98,34,3756,726.0,2237,686,3.7562,132900,NEAR BAY
2276,-119.79,36.8,27,2462,484.0,852,449,3.32,124700,INLAND
1847,-122.28,37.91,41,3009,482.0,1053,490,5.828,324400,NEAR BAY


In [3]:
# (rows, columns) of the raw data, before any cleaning
housing_data.shape

(2382, 10)

In [4]:
# Drop every row that has a missing value in any column
housing_data = housing_data.dropna()

In [5]:
# Row count after dropping rows with missing values
housing_data.shape

(2365, 10)

In [6]:
# Per-column non-null count of the rows whose median_house_value is exactly 500001.
# NOTE(review): 500001 is presumably the dataset's upper cap rather than a real price -- confirm.
housing_data.loc[housing_data['median_house_value']==500001].count()

longitude             28
latitude              28
housing_median_age    28
total_rooms           28
total_bedrooms        28
population            28
households            28
median_income         28
median_house_value    28
ocean_proximity       28
dtype: int64

In [7]:
# Remove every row whose median_house_value equals 500001.
is_capped = housing_data['median_house_value'] == 500001
housing_data = housing_data.drop(index=housing_data[is_capped].index)

In [8]:
# Row count after removing the rows valued at 500001
housing_data.shape

(2337, 10)

In [9]:
# List the distinct ocean_proximity categories (the only non-numeric column)
housing_data['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [10]:
# pd.get_dummies one-hot encodes ocean_proximity: the column is replaced by
# one indicator column per category, named ocean_proximity_<category>.
housing_data = pd.get_dummies(housing_data, columns=['ocean_proximity'])

In [11]:
# Spot-check 10 random rows to confirm the one-hot columns look right
housing_data.sample(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
2089,-119.79,36.76,52,2408,498.0,1361,465,2.1055,61300,0,1,0,0,0
1399,-122.08,37.96,21,9135,1534.0,3748,1502,6.0859,266000,0,0,0,1,0
1922,-120.81,38.89,17,1438,324.0,675,268,2.9444,119300,0,1,0,0,0
1504,-122.03,37.94,21,5541,776.0,2214,737,5.5777,279300,0,0,0,1,0
648,-122.13,37.72,25,1134,153.0,340,171,6.5095,371200,0,0,0,1,0
596,-122.08,37.7,32,2718,447.0,1156,410,5.2497,259300,0,0,0,1,0
1175,-121.57,39.48,15,202,54.0,145,40,0.8252,42500,0,1,0,0,0
2241,-119.81,36.83,10,5780,922.0,2712,883,5.6445,135500,0,1,0,0,0
1968,-120.7,38.69,13,4492,821.0,2093,734,4.0709,151700,0,1,0,0,0
1140,-121.59,39.74,17,1646,330.0,750,344,2.3798,83800,0,1,0,0,0


In [12]:
# Column count grows: ocean_proximity was replaced by one indicator column per category
housing_data.shape

(2337, 14)

In [13]:
# Median of the target column; used as the threshold that turns the
# continuous house value into a binary (above / not above) label.
median = housing_data['median_house_value'].median()
median

156600.0

In [14]:
# Binary label: True when the row's house value is strictly above the median
housing_data['above_median'] = housing_data['median_house_value'] > median

In [15]:
# Spot-check the new above_median label against median_house_value
housing_data.sample(15)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
1356,-121.92,38.02,16,1840,355.0,1288,338,4.2067,125000,0,1,0,0,0,False
2186,-120.0,36.7,33,1902,370.0,1168,358,2.6852,70800,0,1,0,0,0,False
1205,-121.71,39.42,21,1432,284.0,862,275,2.2813,57600,0,1,0,0,0,False
769,-122.09,37.65,35,1184,200.0,572,194,4.7143,193800,0,0,0,1,0,True
1600,-122.12,37.89,30,3227,733.0,1260,684,4.125,257100,0,0,0,1,0,True
950,-121.93,37.72,26,2806,459.0,1453,444,4.9107,213800,1,0,0,0,0,True
2063,-119.73,36.59,31,1551,296.0,1058,287,3.3438,92600,0,1,0,0,0,False
1647,-121.82,37.81,12,4711,659.0,2089,621,8.3209,485400,0,1,0,0,0,True
506,-122.29,37.84,35,1872,419.0,1017,414,2.2106,132500,0,0,0,1,0,False
2365,-119.5,36.74,20,1089,208.0,531,212,4.5938,106900,0,1,0,0,0,False


In [16]:
# Split the frame into features (X) and labels (Y) for the classifier.
# median_house_value is dropped from the features as well, because the label
# was derived directly from it.
label_column = 'above_median'
X = housing_data.drop(columns=['median_house_value', label_column])
Y = housing_data[label_column]  # boolean Series: True / False

In [17]:
# Confirm the feature matrix contains neither median_house_value nor above_median
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [18]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split. random_state pins the shuffle so the split -- and
# every score computed from it -- is identical on each run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [19]:
# Check the feature split sizes (expect roughly 80% / 20% of the rows)
x_train.shape, x_test.shape

((1869, 13), (468, 13))

In [20]:
# Label split sizes must match the row counts of x_train / x_test
y_train.shape, y_test.shape

((1869,), (468,))

In [22]:
# Ready to perform logistic regression to classify our data
from sklearn.linear_model import LogisticRegression

# The liblinear solver is a good fit for small datasets and binary classification.
classifier = LogisticRegression(solver='liblinear')
# fit() runs the training process and returns the fitted estimator.
logistic_model = classifier.fit(x_train, y_train)

In [23]:
# For a classifier, score() reports accuracy on the data it is given
training_score = logistic_model.score(x_train, y_train)
print("Training_score:", training_score)

Training_score: 0.8651685393258427


### For Classification models/problems, the default score is determined by accuracy
#### 86.5% of the predictions were correct on TRAINING data. How does it perform on TEST data?

In [25]:
# Predict the above_median label for the held-out test features
y_pred = logistic_model.predict(x_test)

In [28]:
# Side-by-side view of predicted vs. actual labels for the test rows.
# The row labels are taken from y_test, i.e. the rows' original dataframe indices.
df_pred_actual = pd.DataFrame({'Predicted':y_pred, 'Actual':y_test})
df_pred_actual.head(15)

Unnamed: 0,Predicted,Actual
1305,False,False
31,False,False
195,False,False
24,False,False
1734,True,False
601,True,True
1462,False,True
1864,True,False
2157,False,False
999,True,True


In [29]:
from sklearn.metrics import accuracy_score
# Accuracy on the test set: fraction of test rows whose predicted label matches the actual label
print("Testing_score:", accuracy_score(y_test,y_pred))

Testing_score: 0.8568376068376068


### Testing Score of 85.68% is only slightly lower than the Training Score of 86.52%