In [1]:
import numpy as np
from scipy.stats import multivariate_normal as Normal
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from matplotlib.colors import LinearSegmentedColormap
from sklearn.naive_bayes import GaussianNB
from math import sqrt
from sklearn.metrics import f1_score, roc_curve
from tqdm import tqdm
import pandas as pd

In [2]:
class NaiveBayes_CostSensitive:
    def __init__(self, x=None,y=None,cost=None):
        """
        NaiveBayes creates a cost-sensitive binary classifier based on dataset {x,y} 
        applying a specified cost over the feature distributions. Assumes
        M features and binary class.
        
        x: Feature matrix. Should be size N x M
        y: Class vector. Should be size N x 1
        cost: Should be between 0 and 1. 
        
        """
        try:
            self.fit(x,y,cost)
        except:
            pass
    
    def fit(self, x, y, cost = None):
        
        number_features = np.shape(x)[1]
        
        # Get data for minority and majority class
        
        filter_indices_mi = np.where(y_train>0.5)[0]
        filter_indices_ma = np.where(y_train<0.5)[0]
        axis = 0
        

        x_mi = np.take(x, filter_indices_mi, axis)
        x_ma = np.take(x, filter_indices_ma, axis)
        
        
        y_mi, y_ma = y[np.where(y>0.5)[0]], y[np.where(y<0.5)[0]]
        self.y_mi, self.y_ma = y_mi[0], y_ma[0]
        
        # Get priors minority and majority class
        self.prior_mi = len(y_mi)/(len(y_mi)+len(y_ma))
        self.prior_ma = len(y_ma)/(len(y_mi)+len(y_ma))
        
        # Get mean and covariance based on data
        #self.cov_mi, self.cov_ma = np.zeros([number_features,number_features]), np.zeros([number_features,number_features])
        
        self.mean_mi = np.mean(x_mi, axis = 0).to_numpy() # Array. Mean for each of the M features
        self.mean_ma = np.mean(x_ma, axis = 0).to_numpy()# Array. Mean for each of the M features
#         print(x_mi.corr())
#         print(np.cov(x_mi))
#         print(np.diagonal(x_mi.corr()))

#         print(x_mi.var())
        self.cov_mi = x_mi.var().to_numpy()*np.identity(number_features)
        self.cov_ma = x_ma.var().to_numpy()*np.identity(number_features)
#         print(self.cov_mi)

        if cost is not None:
            if self.prior_mi > self.prior_mi:
                self.prior_mi *= (1-cost)
                self.prior_ma *= cost 
            else:
                self.prior_mi *= cost
                self.prior_ma *= (1-cost)
    
    def predict(self, x):
        print(np.shape(self.mean_mi))
        print(self.mean_mi)
        print(np.shape(self.cov_mi))
        print(self.cov_mi)
        print(Normal.pdf(x, self.mean_ma, self.cov_ma))
        x_mi = self.prior_mi * Normal.pdf(x, self.mean_mi, self.cov_mi)
        x_ma = self.prior_ma * Normal.pdf(x, self.mean_ma, self.cov_ma)
        y_pred = np.zeros([len(x),1])
        for i in range(len(x)):
            if x_mi[i] > x_ma[i]:
                y_pred[i] = self.y_mi
            else:
                y_pred[i] = self.y_ma
        return y_pred

In [3]:
# Read data

data = pd.read_pickle("./data_no_cathegorical.pkl")

# data = data.iloc[:,1:12]
# data=data.drop(columns=['twitter_id'])

# split data into X and y
# X = data.loc[:, data.columns != 'bot']
# Y = data.loc[:, data.columns == 'bot'].to_numpy()

X = data.loc[:, data.columns != 'account_type_bot']
Y = data.loc[:, data.columns == 'account_type_bot'].to_numpy()

X = X.iloc[:,1:3]

# split data into train and test sets
seed = 10
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [4]:
data.head()

Unnamed: 0,favorites_count,followers_count,friends_count,statuses_count,average_tweets_per_day,account_age_days,default_profile_True,default_profile_image_True,geo_enabled_True,lang_af,...,lang_th,lang_tl,lang_tr,lang_uk,lang_ur,lang_vi,lang_zh-cn,lang_zh-tw,verified_True,account_type_bot
0,4,1589,4,11041,7.87,1403,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,536,860,880,252,0.183,1379,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3307,172,594,1001,0.864,1159,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,8433,517,633,1324,0.889,1489,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,88,753678,116,4202,1.339,3138,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [5]:
# data.iloc[:,1:12].head()
# data.iloc[:,12:60].head()

In [6]:
# Without changing any cost
NBClassifier = NaiveBayes_CostSensitive()
NBClassifier.fit(X_train,y_train,cost=None)

y_pred_bayes_base = NBClassifier.predict(X_test)
f1_base_bayes = f1_score(y_test, y_pred_bayes_base, average='weighted')
print(f1_base_bayes)

(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
0.6309136404408857


In [7]:
# Changing costs

costs = [i for i in list(np.linspace(0.1,0.9,11))]
f1scores_bayes = []

homog_it = 10

for c in tqdm(costs):
    aux_f1_b = np.zeros([1,homog_it])
    
    for i in range(homog_it): # Homogenize results
       
        NBClassifier = NaiveBayes_CostSensitive()
        NBClassifier.fit(X_train, y_train, cost = c)
         
        y_pred_bayes = NBClassifier.predict(X_test)
        
        aux_f1_b[0,i] = f1_score(y_test, y_pred_bayes, average='weighted')
        
    f1scores_bayes.append(np.mean(aux_f1_b))

  0%|          | 0/11 [00:00<?, ?it/s]

(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 .

  9%|▉         | 1/11 [00:00<00:02,  3.97it/s]

(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 .

 18%|█▊        | 2/11 [00:00<00:02,  4.20it/s]

[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 

 27%|██▋       | 3/11 [00:00<00:01,  4.59it/s]

(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 .

 36%|███▋      | 4/11 [00:00<00:01,  4.93it/s]

(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]


 45%|████▌     | 5/11 [00:00<00:01,  5.07it/s]

(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 .

 55%|█████▍    | 6/11 [00:01<00:01,  4.67it/s]

(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 .

 64%|██████▎   | 7/11 [00:01<00:00,  4.27it/s]

(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 .

 73%|███████▎  | 8/11 [00:01<00:00,  4.63it/s]

(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]


 82%|████████▏ | 9/11 [00:01<00:00,  4.86it/s]

(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 .

 91%|█████████ | 10/11 [00:02<00:00,  4.97it/s]

(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 .

100%|██████████| 11/11 [00:02<00:00,  4.99it/s]

(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 ... 1.63753624e-12
 1.58994409e-12 1.60604643e-12]
(2,)
[111391.25645342   7628.69622147]
(2, 2)
[[1.64988616e+12 0.00000000e+00]
 [0.00000000e+00 4.68814277e+09]]
[1.60732505e-12 1.60727034e-12 1.60536417e-12 .




In [10]:
f1scores_bayes

[0.6270643566405145,
 0.6285453681271845,
 0.629252392172431,
 0.6295742545484713,
 0.6303339478941598,
 0.6309136404408857,
 0.6326711050304751,
 0.634347859008572,
 0.287142292594183,
 0.21307359442696053,
 0.18318263441848726]