In [1]:
import os
import numpy as np
import pandas as pd
import shap

from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    balanced_accuracy_score,
    make_scorer,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from xgboost import XGBClassifier
import ast

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
def get_x_y(df: pd.DataFrame) -> (pd.Series, pd.Series):
    """
    Get predictors and target from dataframe
    :param df: dataframe
    :return: predictors, target
    """
    x = df["text"]
    y = df["account.type"]
    y = np.where(y == "bot", 1, 0)
    return x, y

In [3]:
DATA_PATH = os.path.join("..", "data", "preprocessed_url_simple")
train = pd.read_csv(os.path.join(DATA_PATH, "train.csv"))
test = pd.read_csv(os.path.join(DATA_PATH, "test.csv"))

In [4]:
x_train, y_train = get_x_y(train)
x_test, y_test = get_x_y(test)

In [5]:
df = pd.read_csv("results/tfidf_optuna.csv")

In [6]:
MODELS_MAPPING = {
    "LGBM": LGBMClassifier,
    "XGB": RandomForestClassifier,
    "RF": LogisticRegression,
    "SVC": SVC,
    "LR": XGBClassifier,
}

In [7]:
KEYS_MAPPINGS = {
    "lgbm": "model__",
    "xgb": "model__",
    "rf": "model__",
    "svc": "model__",
    "tfidf": "encoder__",
}

In [10]:
NGRAMS_MAPPING = {"unigram": (1, 1), "digram": (1, 2), "trigram": (1, 3)}

In [8]:
_series = df.iloc[0, :]

In [9]:
_series

type                                          lemmatization
model                                                  LGBM
score                                              0.829501
params    {'lgbm_boosting_type': 'gbdt', 'lgbm_max_depth...
Name: 0, dtype: object

In [11]:
def load_model(
    metadata: pd.Series | dict,
    models_mapping: dict = MODELS_MAPPING,
    keys_mappings: dict = KEYS_MAPPINGS,
    ngrams_mapping: dict = NGRAMS_MAPPING,
) -> Pipeline:
    pipeline = Pipeline(
        [("encoder", TfidfVectorizer()), ("model", models_mapping[metadata["model"]]())]
    )

    params = ast.literal_eval(metadata["params"])
    params = {
        keys_mappings[k.split("_")[0]]
        + "_".join(k.split("_")[1:]): (
            v if v not in ngrams_mapping.keys() else ngrams_mapping[v]
        )
        for k, v in params.items()
    }
    pipeline.set_params(**params)
    return pipeline

In [12]:
pipeline = load_model(_series)

In [None]:
pipeline.fit(x_train, y_train)

In [14]:
encoder = pipeline.named_steps["encoder"]
model = pipeline.named_steps["model"]

In [15]:
explainer = shap.KernelExplainer(
    model.predict_proba, encoder.transform(x_train)
)  # model.predict for SVC, otherwise we can use predict_proba

Using 20712 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.




In [16]:
shap_values = explainer.shap_values(encoder.transform(x_test[:5]))

  0%|          | 0/5 [00:00<?, ?it/s]Converting data to scipy sparse matrix.




: 