# Logistic Regression Models in Python

In [1]:
# Classification vs Regression

# - Classification involves a categoical outcome variable (ex. trying to predict some 0 or 1)

# - Regression involves a continuous outcome variable     (ex. trying to predict inches of rainfall)

# - "logistic regression" is used for modeling categorical data

In [2]:
# We still need to do the thing of breaking all data down into training and test data

# adult Data

In [13]:
# - popular census data from UCI ML repository

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


# Data Exploration

In [14]:
df = pd.read_csv("adult.csv", skipinitialspace = True)    # i.e. ignore the white space character after the comma

df.columns = df.columns.str.replace(" ", "")    # clearing white space in the column names

In [15]:
print(df.shape)
df.head()

(32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [16]:
# Dummy code outcome variable 

df["income_gt_50"] = [1 if x == ">50K" else 0 for x in df["income"]]

np.mean(df["income_gt_50"])     # proportion that is greater than 50k

0.2408095574460244

In [17]:
xvars = ["age", "education_num", "capital_gain", "hours_per_week"]

X = df.loc[:, xvars].values     # get X values (i.e. predictors/features) and casting into a NumPy array
y = df.loc[:, "income_gt_50"].values   # get y values (i.e. outcome/target variable)

# Train/Test Split

In [18]:
# SPlit training/test data at random

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Fitting Logistic Regression Model

In [19]:
mod = LogisticRegression()  # create model object

mod.fit(X_train, y_train)   # fit model



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
print(mod.coef_)  # regression coefficients for age, eductation number, capital gain, and hours per week respectively

[[0.03105533 0.2435741  0.00030158 0.02867336]]


In [22]:
# Use fitted model to make predictions 

y_pred = mod.predict(X_test)

In [26]:
print("F1 Score:", metrics.f1_score(y_test, y_pred))           # a measure of performance for a logistic regression
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))     # proportion of time that you made the correct prediction

F1 Score: 0.42530282637954236
Accuracy: 0.8033164440350069


In [27]:
# accuracy not always a great metric; depending on distribution of outcome variable, accuracy may be high but not indicative of a good model

# for example, we can tell by looking at the data that people make over 50k 24% of the time. Thus, even if we didn't have a model and just guessed 0 for every person

# (i.e. saying each person makes under 50k), our accuracy would be 76%. Therefore, our model does only slightly better than pure chance.

# if we did more feature engineering, or added more variables, our accuracy may increase.