In [None]:
# %pip install matplotlib pandas scikit-learn imbalanced-learn catboost hyperopt seaborn numpy==1.19

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from catboost import CatBoostClassifier, cv, Pool
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Suppress every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Ask pandas to treat +/-inf as missing values.
# NOTE(review): `use_inf_as_na` is deprecated in recent pandas releases (2.1+) — confirm against the installed version.
pd.set_option('use_inf_as_na', True)

In [None]:
# Load the modelling dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/model.csv')

In [None]:
# Parse both timestamp columns as timezone-aware (UTC) datetimes using one fixed format.
df['created_at'] = pd.to_datetime(df['created_at'], utc=True, format='%Y-%m-%d %H:%M:%S')
df['observed_at'] = pd.to_datetime(df['observed_at'], utc=True, format='%Y-%m-%d %H:%M:%S')

# Cast the flag columns to integers with a vectorised cast rather than a
# row-wise `apply(..., axis=1)`, which builds a Series per row just to call int().
# 'int64' is spelled out so the resulting dtype does not depend on the platform.
df['default_profile'] = df['default_profile'].astype('int64')
df['verified'] = df['verified'].astype('int64')

In [None]:
# DATA_V1 is a second name for the same DataFrame object as `df`; no copy is made.
DATA_V1 = df
# Bare last expression: the frame is rendered as this cell's output.
DATA_V1

In [None]:
def plot_feature_importance(importance, names, model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Parameters
    ----------
    importance : array-like
        One importance value per feature.
    names : array-like
        Feature names, in the same order as ``importance``.
    model_type : str
        Prefix for the chart title; surrounding whitespace is ignored.

    Returns
    -------
    None
        The chart is drawn on a newly created matplotlib figure.
    """
    # Apply the seaborn theme before the figure exists, so the figure and every
    # artist on it are created under the themed rcParams.
    sns.set_theme()

    # Pair names with importances and order them from most to least important.
    # sort_values returns a new frame, so nothing is mutated in place.
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by='feature_importance', ascending=False)

    # Use an explicit figure/axes pair instead of pyplot's implicit "current axes".
    _, ax = plt.subplots(figsize=(10, 8))
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'], ax=ax)

    # Join prefix and suffix with exactly one space, whether or not the caller
    # supplied a trailing space (an empty prefix yields just the suffix).
    ax.set_title((model_type.strip() + ' FEATURE IMPORTANCE').strip())
    ax.set_xlabel('FEATURE IMPORTANCE')
    ax.set_ylabel('FEATURE NAMES')

In [None]:
# Target: select 'label' with a list so the result stays a one-column DataFrame.
labels_v1 = DATA_V1.loc[:, ['label']]
# Names of the target columns as a plain Python list.
labels_list_v1 = labels_v1.columns.tolist()

In [None]:
# Select the model input columns. Each trailing comment gives the feature's
# origin or formula as recorded by the author.
# NOTE(review): '(undefined)' presumably marks features without a cited source — confirm.
features_v1 = DATA_V1[[
    'tweets', # twitter
    'followers', # twitter
    'following', # twitter
    'favorites', # twitter
    'listed', # twitter
    'default_profile', # twitter
    'verified', # twitter
    'actions_frequency', # tweets + favorites / dates_since (undefined)
    'tweets_freqquency', # tweets / dates_since (Yang et al.); the spelling is the actual column key, presumably matching the CSV header
    'reputation', # followers / followers + friends (Feng et al.)
    'followers_growth_rate', # followers / dates_since (Yang et al.)
    'following_growth_rate', # following / dates_since (Yang et al.)
    'favorites_growth_rate', # favorites / dates_since (Yang et al.)
    'listed_growth_rate', # listed / dates_since (Yang et al.)
    'followers_following_ratio', # followers / following (undefined)
    'credibility', # listed / followers + listed (undefined)
    'tweets_favorites_ratio' # tweets / favorites (undefined)
]]
# Feature names in selection order.
feature_list_v1 = list(features_v1.columns)

In [None]:
# Hold out 10% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): no `stratify=` is passed, so class proportions are not forced to match between the two splits.
train_features_v1, test_features_v1, train_labels_v1, test_labels_v1 = train_test_split(
    features_v1, labels_v1, train_size = 0.9, random_state = 42
)

In [None]:
# Report the (rows, columns) shape of each split, one line per frame.
for caption, frame in (
    ('Training Features Shape:', train_features_v1),
    ('Training Labels Shape:', train_labels_v1),
    ('Testing Features Shape:', test_features_v1),
    ('Testing Labels Shape:', test_labels_v1),
):
    print(caption, frame.shape)

In [None]:
# Wrap the splits in CatBoost Pool objects (no cat_features argument is passed).
train_pool_v1 = Pool(train_features_v1, train_labels_v1)
# The evaluation pool is built from the test split.
eval_pool_v1 = Pool(test_features_v1, test_labels_v1)

In [None]:
def hyperopt_objective(params):
    """Hyperopt objective: cross-validated error of a CatBoost classifier.

    Parameters
    ----------
    params : dict
        Sampled hyperparameters with the keys 'l2_leaf_reg',
        'learning_rate' and 'depth'.

    Returns
    -------
    float
        ``1 - best mean test accuracy`` over the CV iterations; hyperopt
        minimises its objective, so lower is better.
    """
    model = CatBoostClassifier(
        l2_leaf_reg = params['l2_leaf_reg'],
        learning_rate = params['learning_rate'],
        # Quantised samplers such as hp.quniform yield floats (e.g. 7.0), while
        # tree depth is an integer setting, so cast explicitly.
        depth = int(params['depth']),
        iterations = 1000,
        eval_metric = 'Accuracy',
        random_seed = 42,
        loss_function = 'MultiClassOneVsAll',
    )

    # The classifier is only used as a validated container for the parameter
    # dict; cross-validation runs on the module-level training pool.
    cv_data = cv(
        train_pool_v1,
        model.get_params(),
        logging_level = 'Silent',
        plot = True
    )
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])

    return 1 - best_accuracy # as hyperopt minimises

In [None]:
# Search space: hp.quniform(label, low, high, q) samples on a grid of step q
# (values come back as floats); hp.uniform samples uniformly from [low, high].
params_space = {
    'l2_leaf_reg':   hp.quniform('l2_leaf_reg', 2, 30, 2),
    'depth':         hp.quniform('depth', 6, 10, 1),
    'learning_rate': hp.uniform('learning_rate', 1e-3, 5e-1),
}

# Run 18 TPE-guided evaluations of the objective; `trials` records every evaluation.
# NOTE(review): no `rstate` is passed to fmin, so the sampled points presumably
# differ from run to run — confirm if reproducibility matters.
trials = Trials()
best_hyperparams = fmin(
    fn = hyperopt_objective,
    space = params_space,
    algo = tpe.suggest,
    max_evals = 18,
    trials = trials,
)

In [None]:
# Show the best point returned by fmin.
print(best_hyperparams)

In [None]:
# Settings for the final model.
# NOTE(review): learning_rate / depth / l2_leaf_reg are hard-coded here rather
# than read from `best_hyperparams` — confirm they reflect the search result.
# NOTE(review): task_type='GPU' with devices='0' presumably requires a
# CUDA-capable GPU at index 0; this cell is expected to fail on CPU-only machines.
params = {
    'iterations': 5000,
    'learning_rate': 0.3,
    'depth': 8,
    'l2_leaf_reg': 4,
    'loss_function': 'MultiClassOneVsAll',
    'custom_loss': ['MultiClassOneVsAll','Accuracy', 'F1', 'Recall'],
    'eval_metric':'AUC:hints=skip_train~false',
    'gpu_ram_part': 0.95,
    'random_seed': 42,
    'task_type': 'GPU',
    'devices': '0',
    'logging_level': 'Silent',
    'use_best_model': True
}

# NOTE(review): eval_pool_v1 is built from the test split, and use_best_model=True
# keeps the iteration that scores best on it. Metrics later computed on that same
# split are therefore likely optimistic — consider a separate validation split.
model_v1 = CatBoostClassifier(**params)
model_v1.fit(train_pool_v1, eval_set = eval_pool_v1, plot = True)

In [None]:
# Print each heading followed by the value the corresponding model getter returns.
for heading, getter in (
    ('Model params', model_v1.get_params),
    ("Best Score", model_v1.get_best_score),
):
    print(heading)
    print(getter())

In [None]:
# Predict on the held-out feature frame.
predictions_cat_v1 = model_v1.predict(test_features_v1)

In [None]:
# Overall accuracy on the held-out split, printed as a raw fraction.
accuracy_v1 = accuracy_score(test_labels_v1, predictions_cat_v1)
print('Score:', accuracy_v1)

# Scale to a percentage first and round afterwards, on every line alike.
# Rounding before multiplying (round(x, 2) * 100) drops the decimals and can
# leave float noise such as 56.99999999999999.
# NOTE(review): if the target is single-label multiclass, micro-averaged
# precision, recall and F1 all coincide with accuracy; 'macro' or 'weighted'
# averaging would show per-class differences.
print('Precision Score:', round(precision_score(test_labels_v1, predictions_cat_v1, average = 'micro') * 100, 2), '%')
print('Recall Score:', round(recall_score(test_labels_v1, predictions_cat_v1, average = 'micro') * 100, 2), '%')
print('F1 Score:', round(f1_score(test_labels_v1, predictions_cat_v1, average = 'micro') * 100, 2), '%')

In [None]:
# Build scikit-learn's text classification report for the held-out split and print it.
report_v1 = classification_report(test_labels_v1, predictions_cat_v1)
print(report_v1)

In [None]:
# Chart the fitted model's feature importances; the trailing space in the prefix
# separates it from the 'FEATURE IMPORTANCE' suffix in the title.
plot_feature_importance(model_v1.get_feature_importance(), feature_list_v1, 'CATBOOST V1 ')

In [None]:
# Persist the trained model; no `format=` is given, so CatBoost's default format is used.
# NOTE(review): the path points to the parent directory (../models), unlike the
# './data' path used for loading; presumably the directory must already exist — confirm.
model_v1.save_model('../models/account_catboost_classifier_twitter_v1-no_digits.model')