### Importando Dependências

In [249]:
import os 
from google.cloud import bigquery
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'credentials/service_account.json'
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import joblib
import numpy as np
import pickle

### Consultando dados e Tratando

In [247]:
# Read the SQL text. The context manager closes the file handle, and the
# forward-slash path avoids the invalid "\i" escape sequence while working on
# Windows, Linux and macOS alike.
with open('queries/interactions_conversions.sql') as query_file:
    query = query_file.read()
df = pd.read_gbq(query, project_id='capim-404203', dialect='standard')

# Keep only users with at least one active day, i.e. drop the rows that have
# no interactions at all.
df = df.loc[df['ACTIVE_USER_DAYS'] > 0].reset_index(drop=True)

# Integer-typed columns.
df['TRIAL_DURATION'] = df['TRIAL_DURATION'].astype('int')
df['CLINIC_ID'] = df['CLINIC_ID'].astype('int')

# Map the boolean conversion flag to 0/1 before casting it to int.
df['IS_CONVERTED'] = df['IS_CONVERTED'].replace({True: 1, False: 0})
df['IS_CONVERTED'] = df['IS_CONVERTED'].astype('int')

# Per-module and overall activity-day counts are cast to float.
for day_column in (
    'ACTIVE_PATIENT_DAYS',
    'ACTIVE_FINANCE_DAYS',
    'ACTIVE_SETUP_DAYS',
    'ACTIVE_SCHEDULE_DAYS',
    'ACTIVE_USER_DAYS',
):
    df[day_column] = df[day_column].astype('float')

# Number of modules (patient, finance, setup, schedule) with at least one
# active day; each comparison contributes 0 or 1 to the sum.
df['ACTIVE_MODULES_USED'] = (
    (df['ACTIVE_PATIENT_DAYS'] > 0).astype('int')
    + (df['ACTIVE_FINANCE_DAYS'] > 0).astype('int')
    + (df['ACTIVE_SETUP_DAYS'] > 0).astype('int')
    + (df['ACTIVE_SCHEDULE_DAYS'] > 0).astype('int')
)


### Definindo variáveis e transformando

In [250]:
# Target column and the single feature column used by the model.
Y = df['IS_CONVERTED']
X = df[['ACTIVE_USER_DAYS']]

# Standardize the feature.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split done later in the notebook, so test rows influence the
# scaling statistics — confirm this is acceptable.
scaler = StandardScaler()
scaler.fit(X)
X_transformed = scaler.transform(X)

# Persist the fitted scaler. The directory is created if missing so the cell
# runs on a fresh checkout, and the forward-slash path avoids the invalid "\s"
# escape sequence and matches the separator used when saving the model.
os.makedirs('models', exist_ok=True)
joblib.dump(scaler, 'models/scaler.save')

### Treinando Modelos

In [261]:
# Split the scaled feature and the target into train and test partitions
# (fixed random_state so the split is reproducible).
x_train, x_test, y_train, y_test = train_test_split(
    X_transformed,
    Y,
    random_state=35,
)

# Create the classifier and fit it in one chained call (fit() returns the
# estimator itself).
# NOTE(review): with fit_intercept=False the predicted probability is exactly
# 0.5 wherever the scaled feature equals 0 — confirm this is intended.
log_reg = LogisticRegression(C=1, fit_intercept=False).fit(x_train, y_train)

# Class predictions for the held-out test partition.
y_pred = log_reg.predict(x_test)



### Avaliando o modelo

In [258]:
# Mean accuracy of the fitted model on the held-out test partition.
log_reg.score(X=x_test, y=y_test)

0.7772727272727272

In [222]:
# Mean accuracy of the fitted model on the training partition.
log_reg.score(X=x_train, y=y_train)

0.7496598639455783

In [None]:
# Confusion matrix of the true test labels against the model's predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Show the fitted coefficient and intercept, one per line.
print(log_reg.coef_, log_reg.intercept_, sep='\n')

In [171]:
# Simulate the predicted conversion probability over a grid of active-user-day
# values (0 to 10 in steps of 0.5) for visualisation.
active_days_grid = np.arange(0, 10.5, .5)

# The scaler was fitted on a frame whose column is named 'ACTIVE_USER_DAYS';
# passing a frame with that same column name avoids scikit-learn's
# feature-name mismatch warning.
scaled_grid = scaler.transform(pd.DataFrame({'ACTIVE_USER_DAYS': active_days_grid}))

new_x = pd.DataFrame(scaled_grid, columns=['ACTIVE_USER_DAYS_TRANSFORMED'])
new_x['ACTIVE_USER_DAYS'] = active_days_grid

# The model was fitted on a plain array (the output of scaler.transform), so
# it is given a plain array here as well. The last column of predict_proba is
# the probability of the last class in log_reg.classes_.
probability_of_conversion = log_reg.predict_proba(
    new_x[['ACTIVE_USER_DAYS_TRANSFORMED']].to_numpy()
)[:, -1]

nex_table = new_x.copy()
nex_table['probability'] = probability_of_conversion

### Salvando o modelo

In [256]:
# Serialize the fitted logistic-regression model to disk with pickle.
with open("models/logit.pickle", mode='wb') as model_file:
    pickle.dump(log_reg, model_file)