In [105]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [106]:
df = pd.read_csv('data-week-3.csv')

In [107]:
df.columns = df.columns.str.lower().str.replace(' ','_') #cambiamos el nombre de columnas a uno estandarizado

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index) #retorna el nombre de las columnas categoricas

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ','_') #estandarizamos los datos de cada columna

In [108]:
tc = pd.to_numeric(df.totalcharges, errors = 'coerce') #ignora los datos faltas en el caso de totalcharges

In [109]:
df.totalcharges = pd.to_numeric(df.totalcharges, errors = 'coerce') #convertimos los valores en el dataframe
df.totalcharges = df.totalcharges.fillna(0) #replazamos los na 

In [110]:
df.churn = (df.churn == 'yes').astype(int) #si es True, convierte en 1, de lo contrario, en 0

In [111]:
from sklearn.model_selection import train_test_split #divide el data frame de manera aleatoria

In [112]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1) #se pasa el porcentaje y semilla de random
len(df_full_train), len(df_test) #tamaños de los dataframes

(5634, 1409)

In [113]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
len(df_test),len(df_train),len(df_val)

(1409, 4225, 1409)

In [114]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [115]:
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values

In [116]:
numerical = ['tenure','monthlycharges','totalcharges']
categorical = [col for col in df_full_train.columns if col not in numerical + ['customerid', 'churn']]
categorical, numerical

(['gender',
  'seniorcitizen',
  'partner',
  'dependents',
  'phoneservice',
  'multiplelines',
  'internetservice',
  'onlinesecurity',
  'onlinebackup',
  'deviceprotection',
  'techsupport',
  'streamingtv',
  'streamingmovies',
  'contract',
  'paperlessbilling',
  'paymentmethod'],
 ['tenure', 'monthlycharges', 'totalcharges'])

In [117]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records') #orient lo ordena mejor
train_dicts[0]

{'gender': 'female',
 'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'yes',
 'onlinebackup': 'yes',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'two_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'electronic_check',
 'tenure': 72,
 'monthlycharges': 115.5,
 'totalcharges': 8425.15}

In [118]:
dv = DictVectorizer(sparse=False)

In [119]:
dv.fit(train_dicts) #se le enseña la forma en la que se van a entregar los datos
X_train = dv.transform(train_dicts)
X_train.shape

(4225, 45)

In [120]:
val_dicts = df_val[categorical + numerical].to_dict(orient='records') 
X_val = dv.transform(val_dicts)
X_val.shape

(1409, 45)

In [121]:
model = LogisticRegression()
y_train.shape

(4225,)

In [122]:
model.fit(X_train, y_train) #ajustamos el módelo para hallar los pesos

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [123]:
def train(df_train, y_train, C = 1.0):
    dicts = df_train[categorical + numerical].to_dict(orient = 'records')

    dv = DictVectorizer(sparse = False)
    X_train = dv.fit_transform(dicts)

    model = LogisticRegression(C=C, max_iter =10000)
    model.fit(X_train, y_train)

    return dv, model

In [124]:
dv, model = train(df_train, y_train, C = 0.00010)

In [125]:
def predict(df, dv, model):
    dicts = df[categorical + numerical].to_dict(orient='records')

    X = dv.transform(dicts)

    y_pred = model.predict_proba(X)[:, 1]

    return y_pred

In [126]:
y_pred = predict(df_val, dv, model)
y_pred

array([0.03807743, 0.24553116, 0.24466683, ..., 0.27936154, 0.75199467,
       0.74491185], shape=(1409,))

In [127]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score

In [128]:
kfold = KFold(n_splits=10, shuffle=True, random_state=1)

In [129]:
from tqdm.auto import tqdm

In [130]:
n_splits = 5

for C in [0.0001, 0.01, 0.1, 0.5, 1, 10]:
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    
    scores = []

    for train_idx, val_idx in tqdm(kfold.split(df_full_train), total = n_splits):
        df_train = df_full_train.iloc[train_idx]
        df_val = df_full_train.iloc[val_idx]
    
        y_train = df_train.churn.values
        y_val = df_val.churn.values
    
        dv, model = train(df_train, y_train, C)
        y_pred = predict(df_val, dv, model)
        
        auc = roc_auc_score(y_val, y_pred)
        scores.append(auc)

    print(C, np.mean(scores), np.std(scores))

  0%|          | 0/5 [00:00<?, ?it/s]

0.0001 0.8105262961888036 0.009166223110867528


  0%|          | 0/5 [00:00<?, ?it/s]

0.01 0.8399527427063243 0.008235658006889298


  0%|          | 0/5 [00:00<?, ?it/s]

0.1 0.841870829966315 0.0072342930187319285


  0%|          | 0/5 [00:00<?, ?it/s]

0.5 0.8419397414804113 0.006870623407195754


  0%|          | 0/5 [00:00<?, ?it/s]

1 0.8418460182682992 0.0069071206359891


  0%|          | 0/5 [00:00<?, ?it/s]

10 0.8417855457979412 0.0067108721219644495


In [131]:
scores

[np.float64(0.8439646442841331),
 np.float64(0.8448115315852205),
 np.float64(0.833694561509131),
 np.float64(0.8348128637206625),
 np.float64(0.8516441278905589)]

In [132]:
dv, model = train(df_full_train, df_full_train.churn, C=1.0)
y_pred = predict(df_test, dv, model)

y_test = df_test.churn.values
auc = roc_auc_score(y_test, y_pred)
auc

np.float64(0.8584032088573997)

## 5.1 Subir el modelo a servicio web para usarlo en cualquier momento

In [133]:
df_test

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,8879-zkjof,female,0,no,no,41,yes,no,dsl,yes,...,yes,yes,yes,yes,one_year,yes,bank_transfer_(automatic),79.85,3320.75,0
1,0201-mibol,female,1,no,no,66,yes,yes,fiber_optic,yes,...,no,no,yes,yes,two_year,yes,bank_transfer_(automatic),102.40,6471.85,0
2,1600-dilpe,female,0,no,no,12,yes,no,dsl,no,...,no,no,no,no,month-to-month,yes,bank_transfer_(automatic),45.00,524.35,0
3,8601-qacrs,female,0,no,no,5,yes,yes,dsl,no,...,no,no,no,no,month-to-month,yes,mailed_check,50.60,249.95,1
4,7919-zodzz,female,0,yes,yes,10,yes,no,dsl,no,...,yes,no,no,yes,one_year,yes,mailed_check,65.90,660.05,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404,5130-iekqt,male,1,no,no,25,yes,yes,fiber_optic,no,...,yes,no,yes,yes,month-to-month,no,mailed_check,105.95,2655.25,1
1405,4452-rohmo,female,0,no,no,15,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.60,331.60,0
1406,6164-haqtx,male,0,no,no,71,no,no_phone_service,dsl,yes,...,yes,yes,yes,no,two_year,no,bank_transfer_(automatic),53.95,3888.65,0
1407,3982-dqlus,male,1,yes,yes,65,yes,yes,fiber_optic,yes,...,no,no,no,no,month-to-month,yes,electronic_check,85.75,5688.45,0


### Guardar el modelo

In [134]:
import pickle #para guardarlo

In [135]:
output_file = f'model_C={C}.bin'
output_file

'model_C=10.bin'

In [139]:
f_out = open(output_file, 'wb') #write-binary = wb
pickle.dump((dv, model), f_out) #guardamos el dict_vectorizer para la feature matrix y el modelo
f_out.close() #cerramos el archivo IMPORTANTE

In [140]:
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out) #se cierra automáticamente  

### Cargar el modelo

In [None]:
import pickle 

In [143]:
with open(output_file, 'rb') as f_in: #read-binary = rb
    (dv, model) = pickle.load(f_in) #se cierra automáticamente 

In [144]:
dv, model

(DictVectorizer(sparse=False), LogisticRegression(max_iter=10000))

In [152]:
customer = {
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'yes',
    'dependents': 'no',
    'phoneservice': 'no',
    'multiplelines': 'no_phone_service',
    'internetservice': 'dsl',
    'onlinesecurity': 'no',
    'onlinebackup': 'yes',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'no',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 29.85,
    'totalcharges': 29.85
}

In [154]:
X = dv.transform([customer])

In [156]:
model.predict_proba(X)[0, 1]

np.float64(0.6275953527536646)

## 5.3 Guardar el modelo en un web service

#### VSC ping.py