# Build BentoML model for Twitter bot classification problem


## Load libraries

In [1]:
import datetime

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier

import bentoml

# Load and prepare the data

In [2]:
# Location of the dataset CSV (hosted in the project's GitHub repository).
data = "https://raw.githubusercontent.com/darkcorpd/ml-zoomcamp/main/twitter-bots-classification/data/twitter_profiles_dataset.csv"

# Read the whole CSV into a DataFrame; every later cell works from `df`.
df = pd.read_csv(filepath_or_buffer=data)

In [3]:
# Parse the `created_at` column into pandas datetime values.
df['created_at'] = pd.to_datetime(df['created_at'])

# Take "now" with the same tzinfo as the first parsed timestamp so that the
# subtraction below operates on compatible (both aware or both naive) values.
created_tz = df.loc[0, 'created_at'].tzinfo
today = datetime.datetime.now(created_tz)

# Whole days elapsed between `created_at` and the moment this cell runs.
# NOTE(review): the value depends on the execution time, so re-running the
# notebook later yields different `life_span` values.
age = today - df['created_at']
df['life_span'] = age.dt.days

In [4]:
# Columns fed to the model as numeric values.
numerical = [
    'statuses_count',
    'followers_count',
    'friends_count',
    'favourites_count',
    'listed_count',
    'media_count',
    'life_span',
]

# Columns treated as categorical flags / labels.
categorical = [
    'default_profile_image',
    'geo_enabled',
    'protected',
    'verified',
    'has_custom_timelines',
    'advertiser_account_type',
]

# Full feature list: numerical columns first, then categorical ones.
features = [*numerical, *categorical]

In [5]:
# Keep only the model features plus the `bot` target column.
selected_columns = [*features, 'bot']
df = df[selected_columns]

# Setting up the validation framework

In [6]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical across runs.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=2,
)

# Renumber the training rows from 0 after the shuffle performed by the split.
df_full_train = df_full_train.reset_index(drop=True)

# Remove the target column from the training frame and keep its values as a
# NumPy array, so that only feature columns remain in `df_full_train`.
y_full_train = df_full_train.pop('bot').values

## One-hot encoding

In [7]:
# One dict per training row (column name -> value), the input format
# expected by DictVectorizer.
dicts_full_train = df_full_train.to_dict('records')

# Fit the vectorizer on the training rows and encode them in one step;
# `sparse=False` requests a dense array rather than a sparse matrix.
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(X=dicts_full_train)

## Training Random forest
The random forest is used here because it was selected as the best model.

In [8]:
# Hyperparameters of the random forest, gathered in one place.
rf_params = {
    'n_estimators': 90,
    'max_depth': 10,
    'min_samples_leaf': 1,
    'random_state': 1,  # fixed seed so repeated fits give the same forest
}

model = RandomForestClassifier(**rf_params)

# Fit on the full training matrix; left as the last expression so the cell
# displays the fitted estimator.
model.fit(X_full_train, y_full_train)

RandomForestClassifier(max_depth=10, n_estimators=90, random_state=1)

## Save the model to the local BentoML model store

In [9]:
# The fitted DictVectorizer is stored alongside the model under this key.
model_custom_objects = {'dictVectorizer': dv}

# User-defined labels for managing models in Yatai.
model_labels = {
    "owner": "darkcorp",
    "project": "ml-zoomcamp",
}

# Declare `predict` as batchable along dimension 0.
model_signatures = {
    'predict': {
        'batchable': True,
        'batch_dim': 0,
    },
}

# Save the model under its name in the local model store; left as the last
# expression so the cell displays the returned model reference.
bentoml.sklearn.save_model(
    'twitter_bot_classify_model',
    model,
    custom_objects=model_custom_objects,
    labels=model_labels,
    signatures=model_signatures,
)

Model(tag="twitter_bot_classify_model:7lt5hulbdgrrvodq", path="C:\Users\darkcorp\bentoml\models\twitter_bot_classify_model\7lt5hulbdgrrvodq\")