In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
from tqdm.auto import tqdm
import pandas as pd
from sklearn.utils import shuffle

## Классификаторы

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score, average_precision_score

In [None]:
def svc_classifier(X_train, y_train, X_test, y_test):
    """Fit a linear SVM with a cross-validated choice of C and score it on a test set.

    C is selected by a 10-fold ``GridSearchCV`` over 20 log-spaced values in
    [1e-5, 1e1], using the estimator's default score. The refit best estimator
    is then evaluated on both splits.

    Parameters
    ----------
    X_train, X_test : feature matrices accepted by ``LinearSVC``.
    y_train, y_test : labels as any array-like (single-column DataFrame,
        Series, ndarray, list); they are flattened to 1-D before use.

    Returns
    -------
    tuple
        ``(best_estimator, metrics)`` where ``metrics`` is a dict with the keys
        ``'train_acc'``, ``'test_acc'``, ``'auc_roc_test'`` and ``'auc_pr_test'``.
    """
    # np.asarray handles pandas objects as well as plain arrays/lists,
    # whereas .to_numpy() exists only on pandas objects.
    y_train = np.asarray(y_train).ravel()
    y_test = np.asarray(y_test).ravel()

    reg_Cs = np.logspace(-5, 1, 20)
    search = GridSearchCV(LinearSVC(dual=False), {"C": reg_Cs}, cv=10)
    search.fit(X_train, y_train)

    best_model_svc = search.best_estimator_
    train_score = best_model_svc.score(X_train, y_train)
    test_score = best_model_svc.score(X_test, y_test)

    # LinearSVC has no predict_proba; the signed distance to the hyperplane
    # is used as the ranking score for the threshold-free metrics.
    test_margin = best_model_svc.decision_function(X_test)
    auc_roc_svc_ = roc_auc_score(y_test, test_margin)
    auc_pr_svc_ = average_precision_score(y_test, test_margin)

    return best_model_svc, {
        'train_acc': train_score, 'test_acc': test_score,
        'auc_roc_test': auc_roc_svc_, 'auc_pr_test': auc_pr_svc_
    }

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, roc_auc_score, average_precision_score)

In [None]:
def decision_tree_simple(X_train_full, y_train, X_test_full, y_test, df, bots, max_depth=2):
    """Train one shallow decision tree per feature column and store its metrics in ``df``.

    For the i-th column of ``X_train_full`` / ``X_test_full`` a
    ``DecisionTreeClassifier(max_depth=max_depth, random_state=42)`` is fitted on
    that single feature, and row ``i`` of ``df`` receives four values.

    Parameters
    ----------
    X_train_full, X_test_full : DataFrames of features; columns are matched by position.
    y_train, y_test : labels as any array-like; flattened to 1-D before use.
    df : DataFrame that is filled in place with the columns
        ``'{bots}_tr_acc'``, ``'{bots}_tst_acc'``, ``'{bots}_auc_roc'``, ``'{bots}_auc_pr'``.
    bots : str prefix used in the result column names.
    max_depth : depth limit of every tree.

    Returns
    -------
    None
    """
    # Flatten once: labels may arrive as an (n, 1) DataFrame, while the
    # estimator and the metric functions work on 1-D targets.
    y_train = np.asarray(y_train).ravel()
    y_test = np.asarray(y_test).ravel()

    for i in range(X_train_full.shape[1]):
        X_train = X_train_full[[X_train_full.columns[i]]]
        X_test = X_test_full[[X_test_full.columns[i]]]
        model = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
        model.fit(X_train, y_train)

        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        # Only the test probabilities are needed (for ROC AUC / average precision);
        # column 1 of predict_proba is the second class in model.classes_.
        y_proba_test = model.predict_proba(X_test)[:, 1]

        df.loc[i, f'{bots}_tr_acc'] = accuracy_score(y_train, y_pred_train)
        df.loc[i, f'{bots}_tst_acc'] = accuracy_score(y_test, y_pred_test)
        df.loc[i, f'{bots}_auc_roc'] = roc_auc_score(y_test, y_proba_test)
        df.loc[i, f'{bots}_auc_pr'] = average_precision_score(y_test, y_proba_test)


### Create datasets

## Constant train-test

Все классификаторы обучаются и тестируются на одной и той же выборке

In [None]:
def train_test_files(input_file, lit=0, random_state=42):
    """Split one feature CSV into train/test CSVs written to ``train_test/``.

    A ``lit`` column holding the given label is added to every row.

    * ``lit == 1``: 1400 rows are sampled once; the first 1000 become the train
      split and the remaining 400 the test split, so the two never overlap.
    * otherwise: 450 train rows and 200 test rows are drawn independently and
      may overlap.

    Parameters
    ----------
    input_file : path to a CSV named like ``RU_<name>_<variant>_features.csv``;
        the output files are called ``<name>_<variant>_train.csv`` / ``_test.csv``.
    lit : label value written into the ``lit`` column.
    random_state : seed passed to ``DataFrame.sample``.

    Returns
    -------
    None
    """
    import os  # local import: keeps this cell runnable on its own

    data = pd.read_csv(input_file)
    data['lit'] = lit
    if lit == 1:
        lit_sample = data.sample(n=1400, random_state=random_state)
        # train and test must not overlap
        train = lit_sample.iloc[:1000]
        test = lit_sample.iloc[1000:]
    else:
        # train and test are allowed to overlap
        # NOTE(review): both draws reuse the same seed, so the test rows are most
        # likely a subset of the train rows - confirm that no experiment uses the
        # train and test split of the same corpus together.
        train = data.sample(n=450, random_state=random_state)
        test = data.sample(n=200, random_state=random_state)

    # 'RU_<name>_<variant>_features.csv' -> '<name>_<variant>'
    file_output = '_'.join(input_file.split('/')[-1].split('_')[1:3])

    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs('train_test', exist_ok=True)
    train.to_csv(f'train_test/{file_output}_train.csv', index=False)
    test.to_csv(f'train_test/{file_output}_test.csv', index=False)


In [None]:
# Write the train/test CSV pair for the bigbalaboba feature file; rows get lit=0.
train_test_files('/content/RU_bigbalaboba_diam995_features.csv', lit=0)

In [None]:
# Same split for the remaining bot feature files; every call writes its own
# <name>_diam995_train.csv / <name>_diam995_test.csv pair with lit=0.
train_test_files('/content/RU_biggpt2_diam995_features.csv', lit=0)
train_test_files('/content/RU_bigmGPT_diam995_features.csv', lit=0)
train_test_files('/content/RU_newlstm_diam995_features.csv', lit=0)


In [None]:
# lit=1 selects the non-overlapping 1000/400 split of a 1400-row sample.
train_test_files('/content/RU_lit_diam995_features.csv', lit=1)

In [None]:
d = pd.read_csv('/content/train_test/lit_diam995_test.csv')
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 22 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       400 non-null    float64
 1   1       400 non-null    float64
 2   2       400 non-null    float64
 3   3       400 non-null    float64
 4   4       400 non-null    float64
 5   5       400 non-null    float64
 6   6       400 non-null    float64
 7   7       400 non-null    float64
 8   8       400 non-null    float64
 9   9       400 non-null    float64
 10  10      400 non-null    float64
 11  11      400 non-null    float64
 12  12      400 non-null    float64
 13  13      400 non-null    float64
 14  14      400 non-null    float64
 15  15      400 non-null    float64
 16  16      400 non-null    float64
 17  17      400 non-null    float64
 18  18      400 non-null    float64
 19  19      400 non-null    float64
 20  text    400 non-null    object 
 21  lit     400 non-null    int64  
dtypes:

In [None]:
from sklearn.utils import shuffle

In [None]:
bot_names = ['newlstm', 'bigbalaboba', 'biggpt2', 'bigmGPT']
def get_train_test_datasets(bot_subset=('bigmGPT', 'newlstm'), data_dir='/content/train_test'):
    """Assemble shuffled train and test frames from the saved per-corpus splits.

    The train frame is the ``lit`` train split plus the train splits of every
    bot in ``bot_subset``; the test frame is the ``lit`` test split plus the test
    splits of all bots in ``bot_names`` that are NOT in ``bot_subset``. The
    ``text`` column is dropped from every file.

    Parameters
    ----------
    bot_subset : collection of bot names (entries of ``bot_names``) used for training.
    data_dir : directory holding the ``<name>_diam995_{train,test}.csv`` files.

    Returns
    -------
    tuple of DataFrame
        ``(train, test)``, each shuffled with a fixed seed and re-indexed from 0.
    """

    def read_subdf(file, kind='bot'):
        # `kind` instead of `type`, so the builtin is not shadowed
        frame = pd.read_csv(file).drop(columns=['text'])
        if kind == 'lit':
            # keep at most 900 'lit' rows
            # NOTE(review): presumably chosen to match 2 x 450 bot train rows - confirm
            frame = frame[:900]
        return frame

    lit_train = read_subdf(f'{data_dir}/lit_diam995_train.csv', 'lit')
    lit_test = read_subdf(f'{data_dir}/lit_diam995_test.csv', 'lit')

    bot_train = []
    bot_test = []
    for bot in bot_names:
        # a bot contributes to exactly one side, so only that file is read
        if bot in bot_subset:
            bot_train.append(read_subdf(f'{data_dir}/{bot}_diam995_train.csv'))
        else:
            bot_test.append(read_subdf(f'{data_dir}/{bot}_diam995_test.csv'))

    # Concatenate and shuffle once, after all parts have been collected
    train = pd.concat([lit_train] + bot_train).reset_index(drop=True)
    train = shuffle(train, random_state=13)
    test = pd.concat([lit_test] + bot_test).reset_index(drop=True)
    test = shuffle(test, random_state=25)

    return (
        train.reset_index(drop=True),
        test.reset_index(drop=True)
    )


In [None]:
train_df, test_df = get_train_test_datasets()

newlstm added to train
bigbalaboba added to test
biggpt2 added to test
bigmGPT added to train


In [None]:
def process_subset(bot_subset, method, df=None, cut_columns=None, printout=True):
    '''
    Train and evaluate one classifier type on the split defined by `bot_subset`.

    bot_subset: tuple of bot names used for the training set
    method: classifier type, 'dt' or 'svc'
    df: DataFrame that receives the results (used by 'dt' only, filled in place)
    cut_columns: list of column-name strings to exclude from the features
    printout: 'svc' only - print the metrics if True, otherwise return them

    Returns (bots, (model, metrics)) for method='svc' with printout=False,
    where bots is '_'.join(bot_subset); returns None in every other case.
    Raises ValueError for an unknown `method` or for 'dt' without `df`.
    '''
    # Fail fast, before any file is read: a wrong `method` used to be ignored
    # silently, and a missing `df` only surfaced after the first tree was trained.
    if not isinstance(method, str) or method not in ('dt', 'svc'):
        raise ValueError(f"method must be 'dt' or 'svc', got {method!r}")
    if method == 'dt' and df is None:
        raise ValueError("method='dt' requires `df`, the DataFrame that collects the metrics")

    train_df, test_df = get_train_test_datasets(bot_subset)

    # The last column is treated as the label, everything before it as features
    X_train_full = train_df[train_df.columns[:-1]]
    y_train = train_df[[train_df.columns[-1]]]

    X_test_full = test_df[test_df.columns[:-1]]
    y_test = test_df[[test_df.columns[-1]]]

    if cut_columns:
        X_train_full = X_train_full.drop(columns=cut_columns)
        X_test_full = X_test_full.drop(columns=cut_columns)

    bots = '_'.join(list(bot_subset))

    if method == 'dt':
        decision_tree_simple(X_train_full, y_train, X_test_full, y_test, df=df,
                             bots=bots,
                             max_depth=2)

    elif method == 'svc':
        result = svc_classifier(X_train_full, y_train, X_test_full, y_test)
        if printout:
            print('*' * 15)
            print(bot_subset)
            print(result[1])
        else:
            return bots, result

### Обучение классификаторов на одном признаке

In [None]:
from itertools import combinations

In [None]:
pairs = list(combinations(bot_names, r=2))

In [None]:
all_metrics = pd.DataFrame()

In [None]:
# One decision tree per feature for every training pair of bots.
# `method` is the second positional parameter of process_subset, so the results
# frame has to be passed as `df=`; passing it positionally made it the method.
for pair in pairs:
  process_subset(pair, 'dt', df=all_metrics)

all_metrics

Unnamed: 0,newlstm_bigbalaboba_tr_acc,newlstm_bigbalaboba_tst_acc,newlstm_bigbalaboba_auc_roc,newlstm_bigbalaboba_auc_pr,newlstm_biggpt2_tr_acc,newlstm_biggpt2_tst_acc,newlstm_biggpt2_auc_roc,newlstm_biggpt2_auc_pr,newlstm_bigmGPT_tr_acc,newlstm_bigmGPT_tst_acc,...,bigbalaboba_biggpt2_auc_roc,bigbalaboba_biggpt2_auc_pr,bigbalaboba_bigmGPT_tr_acc,bigbalaboba_bigmGPT_tst_acc,bigbalaboba_bigmGPT_auc_roc,bigbalaboba_bigmGPT_auc_pr,biggpt2_bigmGPT_tr_acc,biggpt2_bigmGPT_tst_acc,biggpt2_bigmGPT_auc_roc,biggpt2_bigmGPT_auc_pr
0,0.768333,0.6475,0.729972,0.759721,0.765,0.7775,0.852494,0.815243,0.779444,0.76125,...,0.731184,0.662564,0.839444,0.67875,0.765097,0.751182,0.861667,0.58125,0.675719,0.624569
1,0.809444,0.6475,0.722797,0.729959,0.772222,0.70875,0.753603,0.724382,0.777222,0.69375,...,0.705356,0.719366,0.757778,0.71125,0.741,0.75202,0.733889,0.62875,0.743362,0.772424
2,0.623333,0.56625,0.544656,0.519807,0.567778,0.59125,0.591984,0.576645,0.592778,0.5725,...,0.525141,0.531158,0.671667,0.52375,0.549378,0.540756,0.665,0.525,0.565631,0.559447
3,0.650556,0.55625,0.56575,0.557008,0.618889,0.4325,0.501775,0.551796,0.632778,0.555,...,0.592675,0.619835,0.676111,0.46625,0.498084,0.522893,0.586667,0.5175,0.580184,0.601803
4,0.78,0.62625,0.695222,0.714852,0.731667,0.7475,0.761359,0.746078,0.747222,0.7075,...,0.713638,0.723815,0.759444,0.74375,0.780584,0.768677,0.761111,0.70875,0.702137,0.735979
5,0.823889,0.8625,0.862581,0.842724,0.823333,0.84,0.8756,0.860834,0.836667,0.82375,...,0.8782,0.872389,0.875,0.79375,0.85625,0.839085,0.889444,0.7025,0.824519,0.797175
6,0.778889,0.6025,0.655672,0.649646,0.765,0.635,0.669269,0.619625,0.728333,0.66,...,0.662725,0.657604,0.700556,0.69,0.706269,0.714667,0.671667,0.72125,0.724319,0.739078
7,0.714444,0.5675,0.589309,0.579587,0.671111,0.63375,0.639028,0.601408,0.655556,0.585,...,0.579031,0.560492,0.662778,0.59,0.634319,0.639326,0.626667,0.65375,0.699397,0.674425
8,0.607778,0.4925,0.510297,0.510825,0.674444,0.445,0.44965,0.476307,0.621667,0.52125,...,0.511725,0.526546,0.595,0.455,0.435928,0.462075,0.529444,0.5625,0.562063,0.550201
9,0.833333,0.70125,0.768028,0.740652,0.814444,0.755,0.79825,0.764032,0.786111,0.6925,...,0.776906,0.764406,0.76,0.76,0.799753,0.80346,0.760556,0.8125,0.816781,0.834581


In [None]:
all_metrics.to_csv('single_feature_classifiers.csv', index=False)

### Обучение классификаторов на полном и отфильтрованном наборе признаков

In [None]:
# SVC on the full feature set, trained on newlstm + bigbalaboba.
# With printout=False process_subset returns (bots, (model, metrics)),
# so a[1][1] is the metrics dict.
a = process_subset(('newlstm', 'bigbalaboba'), 'svc', printout=False)
a[1][1]

{'train_acc': 0.9305555555555556,
 'test_acc': 0.93875,
 'auc_roc_test': np.float64(0.9788500000000001),
 'auc_pr_test': np.float64(0.9697489894441635)}

In [None]:
# Same pair, with the listed feature columns dropped before fitting;
# printout defaults to True, so the metrics are printed instead of returned.
process_subset(('newlstm', 'bigbalaboba'), 'svc',
               cut_columns=['2', '3', '7', '8', '10', '12', '14', '15', '16', '17', '18', '19'])

***************
('newlstm', 'bigbalaboba')
{'train_acc': 0.9177777777777778, 'test_acc': 0.94, 'auc_roc_test': np.float64(0.98378125), 'auc_pr_test': np.float64(0.9667411565248483)}


In [None]:
pairs

[('newlstm', 'bigbalaboba'),
 ('newlstm', 'biggpt2'),
 ('newlstm', 'bigmGPT'),
 ('bigbalaboba', 'biggpt2'),
 ('bigbalaboba', 'bigmGPT'),
 ('biggpt2', 'bigmGPT')]

In [None]:
cut = ['2', '3', '7', '8', '10', '12', '14', '15', '16', '17', '18', '19']
svc_res = pd.DataFrame()

# (column suffix in svc_res, key in the metrics dict returned by svc_classifier)
metric_columns = (
    ('tr_acc', 'train_acc'),
    ('tst_acc', 'test_acc'),
    ('auc_roc', 'auc_roc_test'),
    ('auc_pr', 'auc_pr_test'),
)

for pair in pairs:
  # row 0: full feature set; row 1: feature set without the `cut` columns
  for row, dropped in enumerate((None, cut)):
    outcome = process_subset(pair, 'svc', cut_columns=dropped, printout=False)
    bots = outcome[0]
    metrics = outcome[1][1]
    for suffix, key in metric_columns:
      svc_res.loc[row, f'{bots}_{suffix}'] = metrics[key]

svc_res

Unnamed: 0,newlstm_bigbalaboba_tr_acc,newlstm_bigbalaboba_tst_acc,newlstm_bigbalaboba_auc_roc,newlstm_bigbalaboba_auc_pr,newlstm_biggpt2_tr_acc,newlstm_biggpt2_tst_acc,newlstm_biggpt2_auc_roc,newlstm_biggpt2_auc_pr,newlstm_bigmGPT_tr_acc,newlstm_bigmGPT_tst_acc,...,bigbalaboba_biggpt2_auc_roc,bigbalaboba_biggpt2_auc_pr,bigbalaboba_bigmGPT_tr_acc,bigbalaboba_bigmGPT_tst_acc,bigbalaboba_bigmGPT_auc_roc,bigbalaboba_bigmGPT_auc_pr,biggpt2_bigmGPT_tr_acc,biggpt2_bigmGPT_tst_acc,biggpt2_bigmGPT_auc_roc,biggpt2_bigmGPT_auc_pr
0,0.930556,0.93875,0.97885,0.969749,0.926111,0.92875,0.980019,0.981745,0.921667,0.93125,...,0.946844,0.951312,0.957222,0.7975,0.935037,0.921324,0.961667,0.7725,0.920631,0.933312
1,0.917778,0.94,0.983781,0.966741,0.915,0.9325,0.978525,0.963124,0.909444,0.9425,...,0.960963,0.959718,0.956667,0.85125,0.953662,0.942944,0.96,0.81375,0.943375,0.954941


In [None]:
# Transpose: one row per metric, columns 0 (full feature set) and 1 (cut feature set).
# NOTE: svc_res is rebound to its own transpose, so running this cell twice
# flips the table back.
svc_res = svc_res.T
svc_res.head()

Unnamed: 0,0,1
newlstm_bigbalaboba_tr_acc,0.930556,0.917778
newlstm_bigbalaboba_tst_acc,0.93875,0.94
newlstm_bigbalaboba_auc_roc,0.97885,0.983781
newlstm_bigbalaboba_auc_pr,0.969749,0.966741
newlstm_biggpt2_tr_acc,0.926111,0.915


In [None]:
svc_res.to_csv('svc_compare.csv')

## Extra

In [None]:
def prepare(hole_type):
    """Load the five feature tables for one ``hole_type`` and label them.

    Each table is read from ``/content/RU_<corpus>_<hole_type>_features.csv``.
    The ``lit`` table gets ``lit = 1``; every bot table gets ``lit = 0``.

    Returns
    -------
    tuple
        ``(balaboba, gpt2, mgpt, lstm, lit, datas)`` where ``datas`` maps the
        short names ``'balaboba'``, ``'gpt2'``, ``'mgpt'``, ``'lstm'``, ``'lit'``
        to the same DataFrame objects.
    """
    # short key -> corpus name used in the file name (insertion order = read order)
    corpus_by_key = {
        'balaboba': 'bigbalaboba',
        'gpt2': 'biggpt2',
        'mgpt': 'bigmGPT',
        'lstm': 'newlstm',
        'lit': 'lit',
    }

    datas = {}
    for key, corpus in corpus_by_key.items():
        frame = pd.read_csv(f'/content/RU_{corpus}_{hole_type}_features.csv')
        frame['lit'] = 1 if key == 'lit' else 0
        datas[key] = frame

    return (datas['balaboba'], datas['gpt2'], datas['mgpt'],
            datas['lstm'], datas['lit'], datas)

In [None]:
def create_train_test_full(train_bots=['lstm', 'gpt2'], test_bots=['mgpt', 'balaboba'], random_state=42):
    """Build shuffled train/test feature and label frames from the loaded corpora.

    Reads the notebook-level ``lit`` frame and ``datas`` dict (both produced by
    ``prepare``). 1400 ``lit`` rows are sampled once: the first 1000 go to
    train, the remaining 400 to test. Every bot in ``train_bots`` adds 500
    sampled rows to train, and every bot in ``test_bots`` adds 200 to test.

    Parameters
    ----------
    train_bots, test_bots : sequences of keys of ``datas``; any length is accepted.
    random_state : seed for every ``sample`` call and for the final shuffles.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``; the ``y_*`` values are
        single-column DataFrames.
    """
    lit_sample = lit.sample(n=1400, random_state=random_state)

    train_parts = [lit_sample.iloc[:1000]]
    train_parts += [datas[bot].sample(n=500, random_state=random_state) for bot in train_bots]
    train = pd.concat(train_parts, ignore_index=True)

    test_parts = [lit_sample.iloc[1000:]]
    test_parts += [datas[bot].sample(n=200, random_state=random_state) for bot in test_bots]
    test = pd.concat(test_parts, ignore_index=True)

    # Features = everything except the last two columns; label = last column.
    # NOTE(review): assumes the trailing columns are `text` and `lit`, in that
    # order - confirm for every feature file passed through prepare().
    X_train = train.iloc[:, :-2]
    y_train = train.iloc[:, [-1]]
    X_test = test.iloc[:, :-2]
    y_test = test.iloc[:, [-1]]

    # Seed the shuffles too: the row order feeds the cross-validation folds
    # downstream, so an unseeded shuffle makes the metrics change between runs.
    X_train, y_train = shuffle(X_train, y_train, random_state=random_state)
    X_test, y_test = shuffle(X_test, y_test, random_state=random_state)

    return X_train, y_train, X_test, y_test

In [None]:
balaboba, gpt2, mgpt, lstm, lit, datas = prepare('syn')

In [None]:
X_train, y_train, X_test, y_test = create_train_test_full(train_bots=['lstm', 'gpt2'], test_bots=['mgpt', 'balaboba'], random_state=42)

In [None]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
X_train_mod = X_train.drop(columns=['2','5','8'])

In [None]:
X_test_mod = X_test.drop(columns=['2','5','8'])

In [None]:
# Baseline on all features of the 'syn' data.
# NOTE: svc_res is rebound here to the (model, metrics) tuple returned by
# svc_classifier; it no longer refers to the comparison table built earlier.
svc_res = svc_classifier(X_train, y_train, X_test, y_test)
svc_res

(LinearSVC(C=np.float64(10.0), dual=False),
 {'train_acc': 0.8785,
  'test_acc': 0.76,
  'auc_roc_test': np.float64(0.8390437500000001),
  'auc_pr_test': np.float64(0.8409145314363309)})

In [None]:
svc_res = svc_classifier(X_train_mod, y_train, X_test_mod, y_test)
svc_res

(LinearSVC(C=np.float64(10.0), dual=False),
 {'train_acc': 0.8765,
  'test_acc': 0.755,
  'auc_roc_test': np.float64(0.8429937499999999),
  'auc_pr_test': np.float64(0.8457811529278407)})

In [None]:
# Refit after dropping features '2', '5', '8', '9', '10' from both splits.
X_train_mod2 = X_train.drop(columns=['2','5','8','9','10'])
X_test_mod2 = X_test.drop(columns=['2','5','8','9','10'])
svc_res = svc_classifier(X_train_mod2, y_train, X_test_mod2, y_test)
svc_res

(LinearSVC(C=np.float64(10.0), dual=False),
 {'train_acc': 0.875,
  'test_acc': 0.76125,
  'auc_roc_test': np.float64(0.84770625),
  'auc_pr_test': np.float64(0.8518510047138992)})

In [None]:
# Refit after dropping features '0', '1', '3', '4', '6', '7', '9', '10' from both splits.
X_train_mod3 = X_train.drop(columns=['0','1','3','4','6', '7', '9', '10'])
X_test_mod3 = X_test.drop(columns=['0','1','3','4','6', '7', '9', '10'])
svc_res = svc_classifier(X_train_mod3, y_train, X_test_mod3, y_test)
svc_res

(LinearSVC(C=np.float64(10.0), dual=False),
 {'train_acc': 0.8205,
  'test_acc': 0.65875,
  'auc_roc_test': np.float64(0.74415625),
  'auc_pr_test': np.float64(0.762609631469704)})

## km=10

In [None]:
balaboba, gpt2, mgpt, lstm, lit, datas = prepare('syn10')

In [None]:
X_train, y_train, X_test, y_test = create_train_test_full(train_bots=['lstm', 'gpt2'], test_bots=['mgpt', 'balaboba'], random_state=42)

In [None]:
svc_res = svc_classifier(X_train, y_train, X_test, y_test)
svc_res

(LinearSVC(C=np.float64(10.0), dual=False),
 {'train_acc': 0.8335,
  'test_acc': 0.71625,
  'auc_roc_test': np.float64(0.81213125),
  'auc_pr_test': np.float64(0.8166834460729557)})

In [None]:
# 'syn10' data: refit after dropping features '2', '5', '8' from both splits.
# X_train_mod / X_test_mod are rebound here, replacing the frames of the same
# name built from the 'syn' data.
X_train_mod = X_train.drop(columns=['2','5','8'])
X_test_mod = X_test.drop(columns=['2','5','8'])
svc_res = svc_classifier(X_train_mod, y_train, X_test_mod, y_test)
svc_res

(LinearSVC(C=np.float64(10.0), dual=False),
 {'train_acc': 0.835,
  'test_acc': 0.7175,
  'auc_roc_test': np.float64(0.8154062500000001),
  'auc_pr_test': np.float64(0.82142912869362)})

In [None]:
# 'syn10' data: refit after dropping features '2', '5', '8', '9', '10' from both splits.
X_train_mod2 = X_train.drop(columns=['2','5','8','9','10'])
X_test_mod2 = X_test.drop(columns=['2','5','8','9','10'])
svc_res = svc_classifier(X_train_mod2, y_train, X_test_mod2, y_test)
svc_res

(LinearSVC(C=np.float64(4.832930238571752), dual=False),
 {'train_acc': 0.818,
  'test_acc': 0.67125,
  'auc_roc_test': np.float64(0.7782687500000001),
  'auc_pr_test': np.float64(0.7818687400199609)})