In [6]:
import zipfile
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
import numpy as np

In [7]:
base_dir = "/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip"

with zipfile.ZipFile(base_dir, 'r') as z:
    z.extractall()

In [8]:
df = pd.read_csv("./train.csv")

In [9]:
tfidf_vec = TfidfVectorizer(max_features=10000, sublinear_tf=True,
                            strip_accents='unicode', analyzer='word',
                            ngram_range=(1, 1), token_pattern=r'\w{1,}',
                            lowercase=True, stop_words='english')

In [10]:
comments_vec_train = tfidf_vec.fit_transform(df['comment_text'])

In [13]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [16]:
svc = LinearSVC(random_state=42)

In [46]:
param = {"C": [0.01, 0.1, 1, 5, 10, 100]}

gs_svc = GridSearchCV(svc,
                      param_grid=param,
                      scoring='f1',
                      n_jobs=4,
                      cv=kf)

__Toxic__

In [47]:
X = comments_vec_train
y = df["toxic"]

gs_svc.fit(X, y)

print(f"Best param: {gs_svc.best_params_}")
print(f"F1-score: {gs_svc.score(X, y)}")

Best param: {'C': 1}
F1-score: 0.825983571119758


__Severe toxic__

In [48]:
X = comments_vec_train
y = df["severe_toxic"]

gs_svc.fit(X, y)

print(f"Best param: {gs_svc.best_params_}")
print(f"F1-score: {gs_svc.score(X, y)}")

Best param: {'C': 5}
F1-score: 0.7299474605954465


__Obscene__

In [49]:
X = comments_vec_train
y = df["obscene"]

gs_svc.fit(X, y)

print(f"Best param: {gs_svc.best_params_}")
print(f"F1-score: {gs_svc.score(X, y)}")

Best param: {'C': 1}
F1-score: 0.8605754030983244


__Threat__

In [50]:
X = comments_vec_train
y = df["threat"]

gs_svc.fit(X, y)

print(f"Best param: {gs_svc.best_params_}")
print(f"F1-score: {gs_svc.score(X, y)}")

Best param: {'C': 5}
F1-score: 0.924892703862661


__Insult__

In [51]:
X = comments_vec_train
y = df["insult"]

gs_svc.fit(X, y)

print(f"Best param: {gs_svc.best_params_}")
print(f"F1-score: {gs_svc.score(X, y)}")

Best param: {'C': 1}
F1-score: 0.764926164064156


__Identity hate__

In [52]:
X = comments_vec_train
y = df["identity_hate"]

gs_svc.fit(X, y)

print(f"Best param: {gs_svc.best_params_}")
print(f"F1-score: {gs_svc.score(X, y)}")

Best param: {'C': 5}
F1-score: 0.8304953560371517
