In [None]:
#### to install skopt (scikit-optimize) #######
#%pip install scikit-optimize

In [None]:
#### These lines are needed on macOS so that cross-validation multiprocessing works ###
# force=True makes the cell re-runnable: without it, a second execution raises
# RuntimeError("context has already been set").
import multiprocessing as mp; mp.set_start_method('forkserver', force=True)

In [None]:
# Silence DeprecationWarning messages.
# (Does not work as expected yet; this needs another pass.)
import warnings

warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [None]:
from ipynb.fs.defs.pipeline import dataset_pipeline

import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import dill
import pickle
import gzip

#Librerias
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

# Classifiers
#import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
#from sklearn.naive_bayes import MultinomialNB
#from sklearn.linear_model import LogisticRegression

#Optimizador bayesiano
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [None]:
#### LOAD DATASET WITH PICKLE #######
#### This dataset already contains the preprocessed text #######
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager guarantees the file is closed even if loading fails.
with open('./data/dataframe_preproc.pkl', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

# Shuffle the rows (fixed seed) so that the subsample taken below is random.
df = df.sample(frac=1, random_state=12).copy()

# Keep only the first 3000 shuffled rows.
df = df[:3000]

In [None]:
# Instantiate the feature pipeline and a Random Forest classifier using the
# hyperparameters obtained from the optimizer run.
data_set_pipeline = dataset_pipeline(
    LSA_K=83,
    LDA_TOPICS=214,
    max_df_tfidf=0.7,
    min_df_tfidf=0.3,
    max_df_cv=0.7,
    min_df_cv=0.3,
)

# random_state is fixed so that the forest (bootstrap samples and feature
# sub-sampling) is reproducible across kernel restarts.
rf_clf = RandomForestClassifier(
    n_estimators=52,
    max_depth=6,
    max_features='sqrt',
    n_jobs=-1,
    random_state=42,
)

# Full model: feature extraction followed by the classifier.
clf_pipeline = Pipeline([
    ("dataset", data_set_pipeline),
    ("clf", rf_clf),
])

In [None]:
FEATURE_NAME = "preproc_text"  # alternative: "raw_text"
CLASS_NAME = "rating"

X = df[FEATURE_NAME]
y = df[CLASS_NAME]

# Binary target: True where the rating equals "Buy cinema ticket".
is_buy_ticket = y.isin(["Buy cinema ticket"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    is_buy_ticket,
    test_size=0.20,
    random_state=42,
)

# Training and testing
fit_pipeline = clf_pipeline.fit(X_train, y_train)
test_predict_proba = clf_pipeline.predict_proba(X_test)
test_predict = clf_pipeline.predict(X_test)

In [None]:
# Build a dataframe holding the predicted probabilities and the real class.
# y_test is flattened once so its index (tconst) and values (rating) become columns.
y_test_flat = DataFrame(y_test).reset_index()

df_test_pred_proba = DataFrame(test_predict_proba, columns=['false_prob', 'true_prob'])
df_test_pred_proba['tconst'] = y_test_flat.tconst
df_test_pred_proba['real_value'] = y_test_flat.rating


In [None]:
from sklearn.metrics import confusion_matrix

# Inspect the test results at the selected probability cut-off.
PROB_CUTOFF = 0.3  # selected cut-off point
y_test_pred_pc = df_test_pred_proba.true_prob > PROB_CUTOFF
confusion_matrix(df_test_pred_proba.real_value, y_test_pred_pc)

In [None]:
from sklearn.metrics import classification_report

# Text report of the classification metrics for the thresholded predictions.
report_text = classification_report(
    df_test_pred_proba.real_value,
    y_test_pred_pc,
)
print(report_text)

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC on the test split, computed from the predicted positive-class probability.
test_roc_auc = roc_auc_score(
    df_test_pred_proba.real_value,
    df_test_pred_proba.true_prob,
)
test_roc_auc

In [None]:
# Save the test-set predictions to an Excel workbook.
predictions_xlsx = 'rf_pred_3000.xlsx'
df_test_pred_proba.to_excel(predictions_xlsx)

In [None]:
plot_clf_evaluation(df_test_pred_proba.real_value, y_test_pred_pc,df_test_pred_proba.true_prob)

In [None]:
############################
#### distribucion de probabilidades
df_test_pred_proba.groupby("real_value")["true_prob"].hist(bins=100)

In [None]:
%matplotlib inline
from sklearn.metrics import roc_curve, auc, confusion_matrix , roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
def plot_clf_evaluation(y,y_pred_value,y_pred_proba, title=None):
    
    title="Random Forest"
    fig, (ax1,ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
    fpr, tpr, thresholds = roc_curve(y, y_pred_proba)
    roc_auc = roc_auc_score(y,y_pred_proba)
    print(roc_auc)
    label = "AUC: %.2f"%(roc_auc )

    ax1.plot(fpr, tpr, label=label )
    ax1.plot([0, 1], [0, 1], 'k--')
    ax1.set_xlim([-0.05, 1.05])
    ax1.set_ylim([-0.05, 1.05])
    ax1.set_xlabel('False Positive Rate')
    ax1.set_ylabel('True Positive Rate')
    ax1.set_title('ROC')
    ax1.legend(loc="lower right")

    cfn_matrix = confusion_matrix(y, y_pred_value)

    cfn_matrix = cfn_matrix.astype(float) / cfn_matrix.sum(axis=1)[:, np.newaxis]

    sns.heatmap(cfn_matrix, annot=True, ax=ax2 , fmt=".2f", vmin=0, vmax=1, cmap="Blues")
    ax2.set_xticklabels([False,True])
    ax2.set_yticklabels([False,True])
    _=ax2.set_ylabel("True Values")
    _=ax2.set_xlabel("Predicted Values")


    if title:
        fig.suptitle(title)


############################
#### distribucion de probabilidades


#pred_df.groupby("real_values")["True_prob"].hist(bins=100)

In [None]:
%%time
#### Simple cross-validated training

FEATURE_NAME = "preproc_text"
CLASS_NAME = "rating"

X = df[FEATURE_NAME]
y = df[CLASS_NAME]

# Same hold-out split as before; the target is True where rating is "Buy cinema ticket".
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y.isin(["Buy cinema ticket"]),
    test_size=0.20,
    random_state=42,
)

# Cross-validate the full pipeline on the training split, keeping train scores too.
cv_metrics = ["f1", "roc_auc", "accuracy", "recall"]
cv_results = cross_validate(
    clf_pipeline,
    X_train,
    y_train,
    scoring=cv_metrics,
    return_train_score=True,
)
simple_cv_result_df = DataFrame(cv_results)
simple_cv_result_df

In [None]:
######### BAYESIAN OPTIMIZATION
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager guarantees the file is closed even if loading fails.
with open('./data/dataframe_preproc.pkl', 'rb') as pickle_file_preproc:
    df_preproc = pickle.load(pickle_file_preproc)

# Shuffle the rows (fixed seed) so that the subsample taken below is random.
df_preproc = df_preproc.sample(frac=1, random_state=12).copy()
df_preproc = df_preproc[:3000]

FEATURE_NAME = "preproc_text"  # alternative: "raw_text"
CLASS_NAME = "rating"

X = df_preproc[FEATURE_NAME]
y = df_preproc[CLASS_NAME]
# Binary target: True where the rating equals "Buy cinema ticket".
X_train, X_test, y_train, y_test = train_test_split(X, y.isin(["Buy cinema ticket"]), test_size=0.20, random_state=42)

# Search space for the Bayesian optimizer. Keys follow the
# "<pipeline step>__...__<parameter>" naming of the pipeline parameters.
# Every key must be unique: a repeated key in a dict literal silently
# replaces the earlier entry, which would drop that dimension from the search.
# NOTE(review): the step path "dataset__text_features__topics__tf_vec" comes from the
# original keys; confirm it against clf_pipeline.get_params().
skopt_grid = {
    ###### Term-frequency vectorizer ###############
    'dataset__text_features__topics__tf_vec__max_df': Real(0.7, 1.0),
    'dataset__text_features__topics__tf_vec__min_df': Real(0.0, 0.3),
    ###### LDA ###############
    'dataset__text_features__topics__topic_features__lda__learning_decay': Real(0.5, 1.0),
    'dataset__text_features__topics__topic_features__lda__max_iter': Integer(50, 500),
    'dataset__text_features__topics__topic_features__lda__n_components': Integer(100, 1000),
    #### RF parameters #####
    'clf__n_estimators': Integer(20, 200),
    'clf__max_depth': Integer(4, 12),
    # "auto" is omitted: for classifiers it is an alias of "sqrt" and is
    # rejected by recent scikit-learn releases.
    'clf__max_features': Categorical(["sqrt", "log2"]),
}


# Callback that saves the intermediate results and a backup of the optimizer after each step.
def on_step(optim_result):
    """Persist the optimizer's progress and print the best score so far.

    Reads the module-level ``opt`` search object rather than the argument.

    Parameters
    ----------
    optim_result
        Value passed in by the optimizer on each step; not used here.

    Side effects
    ------------
    Writes "rf_opt_result_3000.xlsx", optionally "rf_opt_grid_scores_3000.xlsx",
    and a pickle backup of ``opt`` to "rf_opt_skpot_2000.pkl".
    """
    DataFrame(opt.cv_results_).to_excel("rf_opt_result_3000.xlsx")

    # grid_scores_ only exists on old scikit-learn versions; skip the export
    # when it is unavailable instead of aborting the whole optimisation.
    grid_scores = getattr(opt, "grid_scores_", None)
    if grid_scores is not None:
        DataFrame(grid_scores).to_excel("rf_opt_grid_scores_3000.xlsx")

    # NOTE(review): the backup file name says "2000" while the other outputs say "3000";
    # kept unchanged in case something else reads this path.
    with open("rf_opt_skpot_2000.pkl", "wb") as backup_file:
        pickle.dump(opt, backup_file, protocol=2)

    score = opt.best_score_
    print("best score: %s" % score)



# Bayesian hyperparameter search over skopt_grid: 100 iterations, 10 candidate
# points evaluated per step, 3-fold cross-validation scored by ROC AUC.
# random_state is fixed so that the sequence of sampled candidates is reproducible.
opt = BayesSearchCV(
    clf_pipeline,
    skopt_grid,
    n_iter=100,
    n_jobs=-1,
    n_points=10,
    scoring="roc_auc",
    cv=3,
    random_state=42,
)

# on_step runs after each optimisation step to save the intermediate results.
res = opt.fit(X_train, y_train, callback=on_step)