# Logistic Regression Project Tutorial



In [102]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [51]:
url = "https://raw.githubusercontent.com/4GeeksAcademy/logistic-regression-project-tutorial/main/bank-marketing-campaign-data.csv"
df_raw = pd.read_csv(url, sep = ";")

In [52]:
df_raw.info()
#Info sobre data types y nulls 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [53]:
df_raw.head(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


* Drop duplicate rows:

In [54]:
df_raw = df_raw.drop_duplicates()

* Chequeamos en cada variable de type 'object' si hay 'unknown':

In [55]:
df_raw['job'].value_counts()

admin.           10419
blue-collar       9253
technician        6739
services          3967
management        2924
retired           1718
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
unknown            330
Name: job, dtype: int64

In [56]:
df_raw['marital'].value_counts()

married     24921
single      11564
divorced     4611
unknown        80
Name: marital, dtype: int64

In [57]:
df_raw['education'].value_counts()

university.degree      12164
high.school             9512
basic.9y                6045
professional.course     5240
basic.4y                4176
basic.6y                2291
unknown                 1730
illiterate                18
Name: education, dtype: int64

In [58]:
df_raw['default'].value_counts()

no         32577
unknown     8596
yes            3
Name: default, dtype: int64

In [59]:
df_raw['housing'].value_counts()

yes        21571
no         18615
unknown      990
Name: housing, dtype: int64

In [60]:
df_raw['loan'].value_counts()

no         33938
yes         6248
unknown      990
Name: loan, dtype: int64

In [61]:
df_raw['contact'].value_counts()

cellular     26135
telephone    15041
Name: contact, dtype: int64

In [62]:
df_raw['month'].value_counts()

may    13767
jul     7169
aug     6176
jun     5318
nov     4100
apr     2631
oct      717
sep      570
mar      546
dec      182
Name: month, dtype: int64

In [63]:
df_raw['day_of_week'].value_counts()

thu    8618
mon    8512
wed    8134
tue    8086
fri    7826
Name: day_of_week, dtype: int64

In [64]:
df_raw['poutcome'].value_counts()

nonexistent    35551
failure         4252
success         1373
Name: poutcome, dtype: int64

In [65]:
df_raw['y'].value_counts()

no     36537
yes     4639
Name: y, dtype: int64

* Reemplazamos los unknown por los valores más frecuentes en las variables categóricas:

In [66]:
df_interim = df_raw.copy()

In [67]:
df_interim.loc[df_interim["marital"] == "unknown", "marital"] = "married"
df_interim.loc[df_interim["job"] == "unknown", "job"] = "admin."
df_interim.loc[df_interim["education"] == "unknown", "education"] = "university.degree"
df_interim.loc[df_interim["default"] == "unknown", "default"] = "no"
df_interim.loc[df_interim["housing"] == "unknown", "housing"] = "yes"
df_interim.loc[df_interim["loan"] == "unknown", "loan"] = "no"

In [None]:
#Lo del chunk anterior también se puede hacer con boolean condition
#Just leaving it here for educational purposes :)

#boolean_condition = df.marital == 'Unknown'
#column_name = "marital"
#new_value = "married"

#dfraw.loc[boolean_condition, column_name] = new_value

* Reemplazamos los unknown por la media en las variables numéricas:

In [68]:
df_interim = df_interim.replace('unknown', np.nan) #Reemplazo los uknown por NA para poder usar fillna
for var in df_interim.columns[df_interim.dtypes == 'int64']:
    df_interim[var] = df_interim[var].fillna(df_raw[var].mean())

In [69]:
df_interim.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41176 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41176 non-null  int64  
 1   job             41176 non-null  object 
 2   marital         41176 non-null  object 
 3   education       41176 non-null  object 
 4   default         41176 non-null  object 
 5   housing         41176 non-null  object 
 6   loan            41176 non-null  object 
 7   contact         41176 non-null  object 
 8   month           41176 non-null  object 
 9   day_of_week     41176 non-null  object 
 10  duration        41176 non-null  int64  
 11  campaign        41176 non-null  int64  
 12  pdays           41176 non-null  int64  
 13  previous        41176 non-null  int64  
 14  poutcome        41176 non-null  object 
 15  emp.var.rate    41176 non-null  float64
 16  cons.price.idx  41176 non-null  float64
 17  cons.conf.idx   41176 non-null 

* Convertir age en categorial con grupos de edad de 10 años: 

In [70]:
df_interim['age_bins'] = pd.cut(x=df_interim['age'], bins=[10,20,30,40,50,60,70,80,90,100])
df_interim[['age_bins','age']].head()

Unnamed: 0,age_bins,age
0,"(50, 60]",56
1,"(50, 60]",57
2,"(30, 40]",37
3,"(30, 40]",40
4,"(50, 60]",56


* Insertamos las categorías 'basic.9y','basic.6y','basic4y' en 'middle_school':

In [71]:
df_interim['education'] = df_interim['education'].replace({'basic.9y': 'middle_school', 'basic.6y': 'middle_school', 'basic.4y': 'middle_school'})


* Convertir la target variable y las categóricas en dummmies:

In [72]:
df_interim = pd.get_dummies(df_interim, columns=['y','age_bins','job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome'], drop_first=True)


In [73]:
df_interim.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41176 entries, 0 to 41187
Data columns (total 54 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            41176 non-null  int64  
 1   duration                       41176 non-null  int64  
 2   campaign                       41176 non-null  int64  
 3   pdays                          41176 non-null  int64  
 4   previous                       41176 non-null  int64  
 5   emp.var.rate                   41176 non-null  float64
 6   cons.price.idx                 41176 non-null  float64
 7   cons.conf.idx                  41176 non-null  float64
 8   euribor3m                      41176 non-null  float64
 9   nr.employed                    41176 non-null  float64
 10  y_yes                          41176 non-null  uint8  
 11  age_bins_(20, 30]              41176 non-null  uint8  
 12  age_bins_(30, 40]              41176 non-null 

* Scale data:

In [75]:
df_interim.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
count,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,...,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0
mean,40.0238,258.315815,2.567879,962.46481,0.173013,0.081922,93.57572,-40.502863,3.621293,5167.03487,...,0.334345,0.099573,0.017413,0.013843,0.206722,0.209297,0.196377,0.197542,0.863391,0.033345
std,10.42068,259.305321,2.770318,186.937102,0.494964,1.570883,0.578839,4.62786,1.734437,72.251364,...,0.471767,0.299433,0.130806,0.116841,0.40496,0.406812,0.397261,0.39815,0.343438,0.179537
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [76]:
df_interim = df_interim.drop(['duration','pdays'], axis=1)
#Dropping cols that are not useful

In [77]:
scaler = MinMaxScaler()
df_scaler = scaler.fit(df_interim[['age','campaign','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed']])
df_interim[['age','campaign','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed']] = df_scaler.transform(df_interim[['age','campaign','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed']])

In [78]:
df_interim.head()

Unnamed: 0,age,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y_yes,"age_bins_(20, 30]",...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,0.481481,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0,0,...,1,0,0,0,1,0,0,0,1,0
1,0.493827,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0,0,...,1,0,0,0,1,0,0,0,1,0
2,0.246914,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0,0,...,1,0,0,0,1,0,0,0,1,0
3,0.283951,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0,0,...,1,0,0,0,1,0,0,0,1,0
4,0.481481,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0,0,...,1,0,0,0,1,0,0,0,1,0


In [79]:
df = df_interim.copy()

In [80]:
df.to_csv('../data/processed/df_proccesed.csv')

* Modelling:

In [87]:
#Select features
X = df[['age', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'age_bins_(20, 30]', 'age_bins_(30, 40]', 'age_bins_(40, 50]', 'age_bins_(50, 60]', 'age_bins_(60, 70]', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed', 'marital_married', 'marital_single', 'education_middle_school', 'education_professional.course', 'education_university.degree', 'default_yes', 'housing_yes', 'loan_yes', 'contact_telephone', 'month_aug','poutcome_nonexistent', 'poutcome_success']]
y = df['y_yes']

In [88]:
#Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=25)

In [91]:
#Initialize model
model = LogisticRegression() 

In [92]:
#Model fit
model.fit(X_train, y_train) 

In [93]:
#Predict
y_pred = model.predict(X_test)

In [96]:
#Accuracy score
accuracy_score(y_test, y_pred) #El modelo tiene una accuracy de 0.89%

0.8975230694511899

In [98]:
print(classification_report(y_test, y_pred))

#Le está yendo relativamente bien con los que dice que son 0 (0.91 precision)
#con el 1 tiene solo 0.63

#Note to self:
#La precisión es de todos los que predice como 1, cuántos son 1 de verdad
#El recall es de todos los que son 1, cuántos diagnostico como 1

              precision    recall  f1-score   support

           0       0.91      0.98      0.94      7308
           1       0.63      0.22      0.32       928

    accuracy                           0.90      8236
   macro avg       0.77      0.60      0.63      8236
weighted avg       0.88      0.90      0.87      8236



* Hypertuning:

In [114]:
#Grid search
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2'] 
c_values = [100, 10, 1.0, 0.1, 0.01]

grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='recall',error_score=0)
grid_result = grid_search.fit(X, y)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best: 0.211397 using {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.211326 (0.016772) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.211397 (0.016859) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.211326 (0.016772) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.210895 (0.017090) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.210895 (0.016944) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.210895 (0.016981) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.207445 (0.017958) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.207445 (0.017958) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.207733 (0.018135) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.194869 (0.017637) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.194726 (0.017679) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.195876 (0.017745) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.156931 (0.015855) with: {

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best: 0.211397 using {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}:


In [115]:
optimized_model = LogisticRegression(C= 100, penalty='l2', solver= 'lbfgs')

In [116]:
optimized_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [117]:
#Predict
y_pred_2 = optimized_model.predict(X_test)

In [118]:
#Score
accuracy_score(y_pred_2, y_test)

#Question: por qué no está cambiando el accuracy score? Las métricas son iguales a las del modelo anterior

0.8975230694511899

In [119]:
print(classification_report(y_test, y_pred_2))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94      7308
           1       0.63      0.22      0.33       928

    accuracy                           0.90      8236
   macro avg       0.77      0.60      0.64      8236
weighted avg       0.88      0.90      0.87      8236

