In [113]:
import joblib
import mlflow
import numpy as np
import pandas as pd
from pickle import dump
from kedro.io import PickleLocalDataSet
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report, f1_score

In [121]:
X_train = pd.read_pickle("data/05_model_input/X_train.pkl")
X_test = pd.read_pickle("data/05_model_input/X_test.pkl")
y_train = pd.read_pickle("data/05_model_input/y_train.pkl")
y_test = pd.read_pickle("data/05_model_input/y_test.pkl")

In [115]:
def run_logreg(X_train: PickleLocalDataSet, X_test: PickleLocalDataSet, y_train: PickleLocalDataSet, y_test: PickleLocalDataSet, log=False) -> PickleLocalDataSet:
    model = LogisticRegression(random_state=0).fit(StandardScaler().fit_transform(X_train), y_train)
    y_pred = model.predict(StandardScaler().fit_transform(X_test))
    params = {
    }

    f1 = f1_score(y_test, y_pred, average="weighted")

    joblib.dump(model, 'data/06_models/logreg.pkl')
    
    
    print(classification_report(y_test, y_pred, digits=5))
    
    if log:
        mlflow.set_tracking_uri("databricks")
        mlflow.set_experiment("/Users/firefly.eugene@gmail.com/twitter-bot-detection")

        run_id = mlflow.search_runs(experiment_ids="3889491181315524", filter_string="tags.`mlflow.runName`='logreg'", run_view_type=1)["run_id"][0]    
        mlflow.start_run(run_id=run_id, nested=False)

        with mlflow.start_run(nested=True):
            mlflow.set_tags({
                "lib": "sklearn",
                "features": X_train.columns.values,
                "description": "Standard scaler added"
            })

            mlflow.log_params(params)
            mlflow.log_metric("f1", f1, 1)
            mlflow.log_artifact('data/05_model_input/X_test.pkl')
        mlflow.end_run()
    return model

In [116]:
%%time
m = run_logreg(X_train, X_test, y_train, y_test, log=True);

              precision    recall  f1-score   support

           0    0.89904   0.89721   0.89812      4903
           1    0.79156   0.79485   0.79320      2408

    accuracy                        0.86349      7311
   macro avg    0.84530   0.84603   0.84566      7311
weighted avg    0.86364   0.86349   0.86357      7311

CPU times: user 2.08 s, sys: 1.69 s, total: 3.77 s
Wall time: 14.8 s


In [128]:
from sklearn.linear_model import RidgeClassifierCV
clf = RidgeClassifierCV().fit(StandardScaler().fit_transform(X_train), y_train)
pred = clf.predict(StandardScaler().fit_transform(X_test))
print(classification_report(y_test, pred, digits=5))

              precision    recall  f1-score   support

           0    0.89145   0.88436   0.88789      4903
           1    0.76829   0.78073   0.77446      2408

    accuracy                        0.85023      7311
   macro avg    0.82987   0.83254   0.83117      7311
weighted avg    0.85088   0.85023   0.85053      7311

