In [10]:
# !pip install xgboost

In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import catboost
import xgboost

In [24]:
import glob

path = './data/tweets_partials/'
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files[:20]:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)

In [25]:
df.shape

(20000, 1413)

In [26]:
df.shape

(20000, 1413)

In [27]:
x_train, x_test, y_train, y_test = train_test_split(df.drop(['target'], axis=1), df['target'])

In [28]:
x_train.shape

(15000, 1412)

**xgboost**

In [29]:
clf = xgboost.XGBClassifier(random_state=42)
clf.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [30]:
pred = clf.predict(x_train)
print(classification_report(pred, y_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7682
           1       1.00      1.00      1.00      7318

    accuracy                           1.00     15000
   macro avg       1.00      1.00      1.00     15000
weighted avg       1.00      1.00      1.00     15000



In [31]:
pred = clf.predict(x_test)
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.80      0.74      0.77      2757
           1       0.71      0.77      0.74      2243

    accuracy                           0.75      5000
   macro avg       0.75      0.75      0.75      5000
weighted avg       0.76      0.75      0.75      5000



**catboost**

In [32]:
clf2 = catboost.CatBoostClassifier(n_estimators=100, random_state=42)
clf2.fit(x_train, y_train)

Learning rate set to 0.270474
0:	learn: 0.5548228	total: 128ms	remaining: 12.7s
1:	learn: 0.5053599	total: 315ms	remaining: 15.4s
2:	learn: 0.4938241	total: 468ms	remaining: 15.1s
3:	learn: 0.4770702	total: 619ms	remaining: 14.9s
4:	learn: 0.4659870	total: 780ms	remaining: 14.8s
5:	learn: 0.4609031	total: 934ms	remaining: 14.6s
6:	learn: 0.4565888	total: 1.08s	remaining: 14.4s
7:	learn: 0.4522134	total: 1.24s	remaining: 14.3s
8:	learn: 0.4485193	total: 1.39s	remaining: 14s
9:	learn: 0.4452880	total: 1.55s	remaining: 13.9s
10:	learn: 0.4420112	total: 1.71s	remaining: 13.9s
11:	learn: 0.4386577	total: 1.89s	remaining: 13.9s
12:	learn: 0.4353981	total: 2.05s	remaining: 13.7s
13:	learn: 0.4325643	total: 2.2s	remaining: 13.5s
14:	learn: 0.4294444	total: 2.36s	remaining: 13.4s
15:	learn: 0.4266016	total: 2.51s	remaining: 13.2s
16:	learn: 0.4240685	total: 2.65s	remaining: 13s
17:	learn: 0.4208205	total: 2.79s	remaining: 12.7s
18:	learn: 0.4189336	total: 2.95s	remaining: 12.6s
19:	learn: 0.416

<catboost.core.CatBoostClassifier at 0x7fede379e8d0>

In [33]:
pred = clf2.predict(x_train)
print(classification_report(pred, y_train))

              precision    recall  f1-score   support

           0       0.96      0.90      0.93      8138
           1       0.89      0.95      0.92      6862

    accuracy                           0.92     15000
   macro avg       0.92      0.93      0.92     15000
weighted avg       0.93      0.92      0.92     15000



In [34]:
pred = clf2.predict(x_test)
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.80      0.74      0.77      2762
           1       0.71      0.76      0.73      2238

    accuracy                           0.75      5000
   macro avg       0.75      0.75      0.75      5000
weighted avg       0.76      0.75      0.75      5000



**DNN**

In [15]:
# !pip install keras
# !pip install tensorflow

In [33]:
from keras import Sequential
from keras.layers import Dense, Dropout
from sklearn.preprocessing import Normalizer

In [34]:
normalizer = Normalizer().fit(x_train)

In [44]:
import pickle

with open('./msg_normalizer.pickle', 'wb') as f:
    pickle.dump(normalizer, f)

In [35]:
classifier = Sequential()
classifier.add(Dense(100, activation='relu', kernel_initializer='random_normal'))
classifier.add(Dense(100, activation='relu', kernel_initializer='random_normal'))
classifier.add(Dropout(0.5))
classifier.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))

In [36]:
classifier.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])

In [37]:
classifier.fit(normalizer.transform(x_train), y_train, batch_size=100, epochs=7)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras.callbacks.History at 0x7fb5b00c12b0>

In [38]:
pred = classifier.predict_classes(normalizer.transform(x_train))
print(classification_report(pred, y_train))

              precision    recall  f1-score   support

           0       0.88      0.77      0.82     21858
           1       0.72      0.85      0.78     15642

    accuracy                           0.80     37500
   macro avg       0.80      0.81      0.80     37500
weighted avg       0.81      0.80      0.80     37500



In [39]:
pred = classifier.predict_classes(normalizer.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.85      0.74      0.79      7358
           1       0.69      0.81      0.74      5142

    accuracy                           0.77     12500
   macro avg       0.77      0.78      0.77     12500
weighted avg       0.78      0.77      0.77     12500



In [38]:
from typing import List, Tuple

In [39]:
def create_bootstrap_samples(data: np.array, n_samples: int = 1000) -> np.array:
    """
    Создание бутстреп-выборок.

    Parameters
    ----------
    data: np.array
        Исходная выборка, которая будет использоваться для
        создания бутстреп выборок.

    n_samples: int, optional, default = 1000
        Количество создаваемых бутстреп выборок.
        Опциональный параметр, по умолчанию, равен 1000.

    Returns
    -------
    bootstrap_idx: np.array
        Матрица индексов, для создания бутстреп выборок.

    """
    bootstrap_idx = np.random.randint(
        low=0, high=len(data), size=(n_samples, len(data))
    )
    return bootstrap_idx


def create_bootstrap_metrics(y_true: np.array,
                             y_pred: np.array,
                             metric: callable,
                             n_samlpes: int = 1000) -> List[float]:
    """
    Вычисление бутстреп оценок.

    Parameters
    ----------
    y_true: np.array
        Вектор целевой переменной.

    y_pred: np.array
        Вектор прогнозов.

    metric: callable
        Функция для вычисления метрики.
        Функция должна принимать 2 аргумента: y_true, y_pred.

    n_samples: int, optional, default = 1000
        Количество создаваемых бутстреп выборок.
        Опциональный параметр, по умолчанию, равен 1000.

    Returns
    -------
    bootstrap_metrics: List[float]
        Список со значениями метрики качества на каждой бустреп выборке.

    """
    scores = []

    if isinstance(y_true, pd.Series):
        y_true = y_true.values

    bootstrap_idx = create_bootstrap_samples(y_true)
    for idx in bootstrap_idx:
        y_true_bootstrap = y_true[idx]
        y_pred_bootstrap = y_pred[idx]

        score = metric(y_true_bootstrap, y_pred_bootstrap)
        scores.append(score)

    return scores


def calculate_confidence_interval(scores: list, conf_interval: float = 0.95) -> Tuple[float]:
    """
    Вычисление доверительного интервала.

    Parameters
    ----------
    scores: List[float / int]
        Список с оценками изучаемой величины.

    conf_interval: float, optional, default = 0.95
        Уровень доверия для построения интервала.
        Опциональный параметр, по умолчанию, равен 0.95.

    Returns
    -------
    conf_interval: Tuple[float]
        Кортеж с границами доверительного интервала.

    """
    left_bound = np.percentile(
        scores, ((1 - conf_interval) / 2) * 100
    )
    right_bound = np.percentile(
        scores, (conf_interval + ((1 - conf_interval) / 2)) * 100
    )
    
    return left_bound, right_bound

In [40]:
scores = create_bootstrap_metrics(y_test, clf2.predict(x_test), roc_auc_score)

print(calculate_confidence_interval(scores))

(0.7386304795108573, 0.7624090920300678)


In [41]:
import pickle

In [42]:
with open('./msg_classifier.pickle', 'wb') as f:
    pickle.dump(clf2, f)

In [44]:
clf2.predict_proba

<bound method CatBoostClassifier.predict_proba of <catboost.core.CatBoostClassifier object at 0x7fede379e8d0>>