In [1]:
import os
import zipfile


os.environ['KAGGLE_USERNAME'] = ""
os.environ['KAGGLE_KEY'] = ""
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

for filename in os.listdir():
    if filename.endswith(".zip"):
        zip_ref = zipfile.ZipFile(filename)
        zip_ref.extractall()

Downloading train.csv.zip to /content
 65% 17.0M/26.3M [00:00<00:00, 76.0MB/s]
100% 26.3M/26.3M [00:00<00:00, 87.7MB/s]
Downloading test_labels.csv.zip to /content
  0% 0.00/1.46M [00:00<?, ?B/s]
100% 1.46M/1.46M [00:00<00:00, 99.4MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/1.39M [00:00<?, ?B/s]
100% 1.39M/1.39M [00:00<00:00, 204MB/s]
Downloading test.csv.zip to /content
 38% 9.00M/23.4M [00:00<00:00, 92.6MB/s]
100% 23.4M/23.4M [00:00<00:00, 150MB/s] 


Рассмотрим данные соревнования Kaggle [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge), так как задача хорошо нам подходит — это классификация текстов. Будем классифицировать комментарии только по одному критерию — toxic/non-toxic.

In [2]:
import pandas as pd
import numpy as np
# Seed NumPy's global (legacy) random generator once, up front, so that code
# drawing from that global state behaves the same on a fresh top-to-bottom run.
RANDOM_SEED = 0xFFFFFFF
np.random.seed(RANDOM_SEED)

In [3]:
# Read the three extracted competition files from the working directory,
# in the same order as the names they are bound to.
CSV_PATHS = ('./train.csv', './test.csv', './test_labels.csv')
train, test, test_labels = (pd.read_csv(csv_path) for csv_path in CSV_PATHS)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [5]:
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import f1_score

In [6]:
# Build a reduced working set from the labelled training frame.
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
KEPT_COLUMNS = ['id', 'comment_text', 'toxic']

df = train.copy()

# rslt_df: rows where every label is 0; rslt_df2: rows where at least one label is 1.
has_no_label = df[LABEL_COLUMNS].eq(0).all(axis=1)
has_any_label = df[LABEL_COLUMNS].eq(1).any(axis=1)
rslt_df = df[has_no_label]
rslt_df2 = df[has_any_label]

# Author's intent: roughly keep the original class proportion by taking a
# fixed number of leading rows from each group.
# NOTE(review): rslt_df2 is selected on *any* label, but only the 'toxic'
# column is kept as the target, so some of its rows may carry toxic == 0.
new1 = rslt_df[KEPT_COLUMNS].iloc[:23891].copy()
new2 = rslt_df2[KEPT_COLUMNS].iloc[:946].copy()
new = pd.concat([new1, new2], ignore_index=True)

In [7]:
# Toxic comments are rare, so the train/test split is stratified on the label
# to keep the class ratio the same in both parts.
# No random_state is passed, so the shuffle depends on NumPy's global RNG state.
comment_texts = new["comment_text"]
toxic_labels = new['toxic']
X_train, X_test, y_train, y_test = train_test_split(
    comment_texts,
    toxic_labels,
    test_size=0.33,
    stratify=toxic_labels,
)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams. Terms that occur in more than 95% of the
# documents, or in fewer than 5 documents, are dropped from the vocabulary.
tfidf_options = dict(ngram_range=(1, 2), max_df=0.95, min_df=5)
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and IDF weights on the training split only, then reuse
# the fitted vectorizer to encode the held-out split.
X1 = vectorizer.fit_transform(X_train)
X_test1 = vectorizer.transform(X_test)

Для начала сравним ядровой SVM и логистическую регрессию. Сравнение будем проводить по доле верных ответов (accuracy) и метрике F1, так как классы несбалансированы.
Ядровой СВМ побеждает.

In [None]:
# Baseline 1: support vector classifier with a linear kernel and C=1,
# fitted on the TF-IDF training matrix and scored on the held-out split.
clf2 = svm.SVC(kernel='linear', C=1, probability=True)
clf2.fit(X1, y_train)
y_p1 = clf2.predict(X_test1)

accuracy = accuracy_score(y_test, y_p1)
f1_value = f1_score(y_test, y_p1)
print('Accuracy: %f' % accuracy)
print('F1 score: %f' % f1_value)

Accuracy: 0.976577
F1 score: 0.522388


In [None]:
# Baseline 2: logistic regression with library-default settings,
# evaluated on the same held-out split as the SVM baseline.
clf4 = LogisticRegression()
y_p1 = clf4.fit(X1, y_train).predict(X_test1)

accuracy = accuracy_score(y_test, y_p1)
f1_value = f1_score(y_test, y_p1)
print('Accuracy: %f' % accuracy)
print('F1 score: %f' % f1_value)

Accuracy: 0.970233
F1 score: 0.290698


Теперь сравним с более сильным противником — градиентным бустингом.

In [None]:
# Grid search for XGBoost: 8 log-spaced learning rates between 1e-2 and 1,
# crossed with three ensemble sizes. Candidates are ranked by accuracy.
learning_rates = np.logspace(-2., 0., num=8)
param_grid = {
    'learning_rate': learning_rates,
    'n_estimators': [150, 200, 250],
}

clf3 = GridSearchCV(estimator=xgb.XGBClassifier(), param_grid=param_grid, scoring="accuracy")
clf3.fit(X1, y_train)
print(clf3.best_params_)

# Predict on the held-out split with the fitted search object.
y_pred = clf3.predict(X_test1)
xgb_accuracy = accuracy_score(y_test, y_pred)
print("xgboost accuracy:", xgb_accuracy)
xgb_f1 = f1_score(y_test, y_pred)
print("xgboost f1:", xgb_f1)

{'learning_rate': 0.2682695795279725, 'n_estimators': 250}
xgboost accuracy: 0.9766987922410638
xgboost f1: 0.5352798053527981


Так как число базовых моделей получилось максимально возможным, проверим, что будет для бОльших значений. Видим, что лучший результат получился хуже.

In [None]:
# Same XGBoost search as before, but probing larger ensemble sizes (300, 350)
# over the same 8 log-spaced learning rates. Candidates are ranked by accuracy.
learning_rates = np.logspace(-2., 0., num=8)
param_grid = {
    'learning_rate': learning_rates,
    'n_estimators': [300, 350],
}

clf3 = GridSearchCV(estimator=xgb.XGBClassifier(), param_grid=param_grid, scoring="accuracy")
clf3.fit(X1, y_train)
print(clf3.best_params_)

# Predict on the held-out split with the fitted search object.
y_pred = clf3.predict(X_test1)
xgb_accuracy = accuracy_score(y_test, y_pred)
print("xgboost accuracy:", xgb_accuracy)
xgb_f1 = f1_score(y_test, y_pred)
print("xgboost f1:", xgb_f1)

{'learning_rate': 0.2682695795279725, 'n_estimators': 350}
xgboost accuracy: 0.9762108088324997
xgboost f1: 0.5323741007194245


In [None]:
# Grid search over the regularisation strength C (6 log-spaced values from
# 1e-2 to 1e1) for an SVC with a sigmoid kernel. Candidates are ranked by accuracy.
c_values = np.logspace(-2., 1., num=6)
param_grid = {
    'C': c_values,
    'kernel': ['sigmoid'],
    'probability': [True],
}

clf5 = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid, scoring="accuracy")
clf5.fit(X1, y_train)
print(clf5.best_params_)

# Predict on the held-out split with the fitted search object.
y_pred = clf5.predict(X_test1)
svc_accuracy = accuracy_score(y_test, y_pred)
print("SVC accuracy:", svc_accuracy)
svc_f1 = f1_score(y_test, y_pred)
print("SVC f1:", svc_f1)

{'C': 2.5118864315095797, 'kernel': 'sigmoid', 'probability': True}
SVC accuracy: 0.9781627424667562
SVC f1: 0.5941043083900227


{'C': 2.5118864315095797, 'kernel': 'sigmoid', 'probability': True}
SVC accuracy: 0.9781627424667562
SVC f1: 0.5941043083900227

In [None]:
# Grid search over the regularisation strength C (8 log-spaced values from
# 1e-2 to 1e1) for an SVC with a linear kernel. Candidates are ranked by accuracy.
c_values = np.logspace(-2., 1., num=8)
param_grid = {
    'C': c_values,
    'kernel': ['linear'],
    'probability': [True],
}

clf5 = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid, scoring="accuracy")
clf5.fit(X1, y_train)
print(clf5.best_params_)

# Predict on the held-out split with the fitted search object.
y_pred = clf5.predict(X_test1)
svc_accuracy = accuracy_score(y_test, y_pred)
print("SVC accuracy:", svc_accuracy)
svc_f1 = f1_score(y_test, y_pred)
print("SVC f1:", svc_f1)

{'C': 3.727593720314938, 'kernel': 'linear', 'probability': True}
SVC accuracy: 0.977918750762474
SVC f1: 0.6004415011037527


Получается, что линейное и сигмоидное ядро дают результат лучше, чем бустинг. Сравним время на оптимальных параметрах.

В нашем случае получилось, что ядровые методы медленнее, но дают заметно лучшее качество.

In [9]:
import time

# Time fit + predict for the linear-kernel SVC at the C found by the grid search.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted during the run.
start = time.perf_counter()
clf_svm = svm.SVC(C=3.727593720314938, kernel='linear', probability=True)
clf_svm.fit(X1, y_train)
y_pred = clf_svm.predict(X_test1)
elapsed = time.perf_counter() - start  # stop the clock before computing metrics
print("SVC time:", elapsed)
print("SVC accuracy:", accuracy_score(y_test, y_pred))
print("SVC f1:", f1_score(y_test, y_pred))

SVC time: 105.98542904853821
SVC accuracy: 0.977918750762474
SVC f1: 0.6004415011037527


In [11]:
import time

# Time fit + predict for the sigmoid-kernel SVC at the C found by the grid search.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted during the run.
# Note: this rebinds clf_svm, so the name now refers to the sigmoid model.
start = time.perf_counter()
clf_svm = svm.SVC(C=2.5118864315095797, kernel='sigmoid', probability=True)
clf_svm.fit(X1, y_train)
y_pred = clf_svm.predict(X_test1)
elapsed = time.perf_counter() - start  # stop the clock before computing metrics
print("SVC time:", elapsed)
print("SVC accuracy:", accuracy_score(y_test, y_pred))
print("SVC f1:", f1_score(y_test, y_pred))

SVC time: 82.99676513671875
SVC accuracy: 0.9781627424667562
SVC f1: 0.5941043083900227


In [10]:
# Time fit + predict for XGBoost at the hyperparameters found by the grid search.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted during the run.
# Relies on `time` having been imported by an earlier cell.
start = time.perf_counter()
clf_xgb = xgb.XGBClassifier(learning_rate=0.2682695795279725, n_estimators=250)
clf_xgb.fit(X1, y_train)
y_pred = clf_xgb.predict(X_test1)
elapsed = time.perf_counter() - start  # stop the clock before computing metrics
print("XGB time:", elapsed)
print("XGB accuracy:", accuracy_score(y_test, y_pred))
print("XGB f1:", f1_score(y_test, y_pred))

XGB time: 27.400084018707275
XGB accuracy: 0.9766987922410638
XGB f1: 0.5352798053527981
