# Model training

#### Requirements

In [None]:
!pip install pandas
!pip install google-cloud-bigquery
!pip install google-cloud-storage
!pip install pandas-gbq

En este apartado vamos a entrenar nuestros modelos. Necesitaremos tres: mensaje de cookies, botón aceptar cookies y botón cerrar cookies.


## Mensaje de cookies

Este modelo tiene que predecir la probabilidad de que un elemento HTML sea de un mensaje de cookies

#### Get the Data
Como los datos los hemos preparado en el Notebook anterior ahora los cargamos.

In [None]:
import pickle

# Load the train/test frames prepared by the previous notebook.
# Context managers close the file handles deterministically instead of
# leaving them open until garbage collection.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('df_message_train.pkl', 'rb') as f:
    df_train = pickle.load(f)
with open('df_message_test.pkl', 'rb') as f:
    df_test = pickle.load(f)


#### Vectorize
Convertimos nuestros datasets de train y test en vectores.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Texts of the training set; the vectorizer is fitted on these only.
message_train = df_train['text']

# Fit on the training texts and build the training matrix in one step,
# then encode the test texts with the same fitted vectorizer.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(message_train)
X_test = vectorizer.transform(df_test['text'])
X_train


#### Model fit
Entrenamos nuestro modelo

In [None]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so the fitted model can be bound directly.
classifier = LogisticRegression().fit(X_train, df_train['label'])
classifier  # show the fitted estimator as the cell output



#### Model test
Evaluamos nuestro modelo sobre el conjunto de test.

In [None]:

# Mean accuracy of the fitted classifier on the test set.
score = classifier.score(X=X_test, y=df_test['label'])
print("Accuracy:", score)



#### Confusion matrix
Vemos la matriz de confusión de nuestro modelo

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predicted labels for the test set, compared against the true labels below.
y_pred = classifier.predict(X_test)

# Print the confusion matrix first, then the per-class report.
for report in (confusion_matrix, classification_report):
    print(report(df_test['label'], y_pred))


Debido al buen funcionamiento de este modelo, no probamos otros tipos de clasificador.

In [None]:
# Persist the fitted classifier and its vectorizer, one pickle file each.
import pickle

artifacts = (
    ('classifier_message.pkl', classifier),
    ('vectorizer_message.pkl', vectorizer),
)
for filename, artifact in artifacts:
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)

## Aceptar cookies

Este modelo tiene que predecir la probabilidad de que un elemento HTML sea el botón de aceptar las cookies.

#### Get the Data
Como los datos los hemos preparado en el Notebook anterior ahora los cargamos.

In [None]:
import pickle

# Load the "accept cookies" train/test features and labels prepared by the
# previous notebook. Context managers close each file handle deterministically.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('df_aceptar_train.pkl', 'rb') as f:
    df_aceptar_train = pickle.load(f)
with open('df_aceptar_train_label.pkl', 'rb') as f:
    df_aceptar_train_label = pickle.load(f)

with open('df_aceptar_test.pkl', 'rb') as f:
    df_aceptar_test = pickle.load(f)
with open('df_aceptar_test_label.pkl', 'rb') as f:
    df_aceptar_test_label = pickle.load(f)


### Test multiple classifiers

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, log_loss

# Logistic regression with default hyperparameters.
clf = LogisticRegression().fit(df_aceptar_train, df_aceptar_train_label)

# Mean accuracy on the test set.
score = clf.score(df_aceptar_test, df_aceptar_test_label)
print("Accuracy:", score)

# Confusion matrix and per-class metrics for the test predictions.
y_pred = clf.predict(df_aceptar_test)
print(confusion_matrix(df_aceptar_test_label, y_pred))
print(classification_report(df_aceptar_test_label, y_pred))


#### Stochastic Gradient Descent (SGD) 

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Linear classifier trained with SGD (log loss, L2 penalty).
# SGD shuffles the training data, so random_state is fixed to make the metrics
# below, and the model pickled later, reproducible across runs.
# NOTE(review): scikit-learn 1.1 deprecated loss="log" in favour of
# loss="log_loss" (removed in 1.3); switch the value when upgrading.
clf_SGD = SGDClassifier(loss="log", penalty="l2", max_iter=10, random_state=42)
clf_SGD.fit(df_aceptar_train, df_aceptar_train_label)

# Mean accuracy on the test set.
score = clf_SGD.score(df_aceptar_test, df_aceptar_test_label)
print("Accuracy:", score)

# Confusion matrix and per-class metrics for the SGD model's test predictions.
y_pred = clf_SGD.predict(df_aceptar_test)
print(confusion_matrix(df_aceptar_test_label, y_pred))
print(classification_report(df_aceptar_test_label, y_pred))

#### Support Vector Classification (SVC)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

# Linear-kernel support vector classifier with probability estimates enabled.
clf = SVC(kernel='linear', probability=True).fit(df_aceptar_train, df_aceptar_train_label)

# Mean accuracy and predicted labels on the test set.
score = clf.score(df_aceptar_test, df_aceptar_test_label)
y_pred = clf.predict(df_aceptar_test)

print("Accuracy:", score)
print(confusion_matrix(df_aceptar_test_label, y_pred))
print(classification_report(df_aceptar_test_label, y_pred))

#### Choose best estimator

Escogemos el estimador SGDClassifier debido a que tiene el mejor accuracy y, sobre todo, el mejor recall.

In [None]:
# Persist the chosen SGD classifier for the "accept cookies" button.
import pickle

with open('classifier_aceptar.pkl', 'wb') as model_file:
    pickle.dump(clf_SGD, model_file)

## Cerrar cookies

Este modelo tiene que predecir la probabilidad de que un elemento HTML sea el botón de cerrar las cookies.

#### Get the Data
Como los datos los hemos preparado en el Notebook anterior ahora los cargamos.

In [None]:
import pickle

# Load the "close cookies" train/test features and labels prepared by the
# previous notebook. Context managers close each file handle deterministically.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('df_cerrar_train.pkl', 'rb') as f:
    df_cerrar_train = pickle.load(f)
with open('df_cerrar_train_label.pkl', 'rb') as f:
    df_cerrar_train_label = pickle.load(f)

with open('df_cerrar_test.pkl', 'rb') as f:
    df_cerrar_test = pickle.load(f)
with open('df_cerrar_test_label.pkl', 'rb') as f:
    df_cerrar_test_label = pickle.load(f)


### Test multiple classifiers

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, log_loss

# Logistic regression with default hyperparameters.
clf = LogisticRegression().fit(df_cerrar_train, df_cerrar_train_label)

# Mean accuracy on the test set.
score = clf.score(df_cerrar_test, df_cerrar_test_label)
print("Accuracy:", score)

# Confusion matrix and per-class metrics for the test predictions.
y_pred = clf.predict(df_cerrar_test)
print(confusion_matrix(df_cerrar_test_label, y_pred))
print(classification_report(df_cerrar_test_label, y_pred))


#### Stochastic Gradient Descent (SGD) 

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Linear classifier trained with SGD (log loss, L2 penalty).
# SGD shuffles the training data, so random_state is fixed to make the metrics
# below, and the model pickled later, reproducible across runs.
# NOTE(review): scikit-learn 1.1 deprecated loss="log" in favour of
# loss="log_loss" (removed in 1.3); switch the value when upgrading.
clf_SGD = SGDClassifier(loss="log", penalty="l2", max_iter=100, random_state=42)
clf_SGD.fit(df_cerrar_train, df_cerrar_train_label)

# Mean accuracy on the test set.
score = clf_SGD.score(df_cerrar_test, df_cerrar_test_label)
print("Accuracy:", score)

# Predict with the SGD model itself. The previous code called clf.predict,
# which reported the metrics of the estimator left over from the prior cell.
y_pred = clf_SGD.predict(df_cerrar_test)
print(confusion_matrix(df_cerrar_test_label, y_pred))
print(classification_report(df_cerrar_test_label, y_pred))

#### Support Vector Classification (SVC)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

# Linear-kernel support vector classifier with probability estimates enabled.
clf = SVC(kernel='linear', probability=True).fit(df_cerrar_train, df_cerrar_train_label)

# Mean accuracy and predicted labels on the test set.
score = clf.score(df_cerrar_test, df_cerrar_test_label)
y_pred = clf.predict(df_cerrar_test)

print("Accuracy:", score)
print(confusion_matrix(df_cerrar_test_label, y_pred))
print(classification_report(df_cerrar_test_label, y_pred))

#### Choose best estimator

In [None]:
# Persist the SGD classifier for the "close cookies" button.
import pickle

with open('classifier_cerrar.pkl', 'wb') as model_file:
    pickle.dump(clf_SGD, model_file)