In [113]:
import zipfile
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np

In [2]:
# The training data is provided as a zip archive; unpack it into the current
# working directory so it can be read from a relative path afterwards.
base_dir = "/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip"

with zipfile.ZipFile(base_dir) as archive:  # mode defaults to 'r'
    archive.extractall()

In [4]:
# Load the extracted training set (comment text plus one column per label).
df = pd.read_csv(filepath_or_buffer="./train.csv")

In [7]:
# Word-level unigram TF-IDF features, capped at the 30k most frequent terms,
# with sublinear term-frequency scaling, accent stripping and English stop words.
# NOTE(review): the vectorizer is fitted on every row of `df`, i.e. before the
# train/test split performed further down, so vocabulary and IDF statistics
# also see the held-out rows. Consider fitting on the training rows only.
tfidf_vec = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=30000,
)
comments_vec_train = tfidf_vec.fit_transform(df['comment_text'])

In [11]:
# Display the sparse TF-IDF matrix summary (shape, dtype, stored elements).
comments_vec_train

<159571x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 3967801 stored elements in Compressed Sparse Row format>

### F1-score at the default decision threshold (0.5)

In [35]:
# Baseline: one independent binary logistic regression per label, scored with
# F1 on a held-out 33% split using the classifier's default predict() cut-off.
dict_f1_scores = dict()
class_ = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# The feature matrix is identical for every label, so bind it once.
X = comments_vec_train

for c in class_:
    y = df.loc[:, c]

    # Fixed seed keeps the split reproducible across labels and re-runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    reg = LogisticRegression(random_state=42).fit(X_train, y_train)

    # f1_score's documented signature is (y_true, y_pred).
    dict_f1_scores[c] = f1_score(y_test, reg.predict(X_test))

In [36]:
# Display the per-label F1 scores collected in the loop above.
dict_f1_scores

{'toxic': 0.7152173913043479,
 'severe_toxic': 0.33608815426997246,
 'obscene': 0.73542600896861,
 'threat': 0.2594594594594594,
 'insult': 0.6094235828749103,
 'identity_hate': 0.24156305506216696}

### Best probability threshold `p` per class

In [101]:
def best_p_for_f1_score(X, y, steps, test_size=0.33, random_state=42):
    """Pick the probability cut-off from ``steps`` that maximises F1 on a held-out split.

    A logistic regression is fitted on the training part of a
    ``train_test_split`` of ``(X, y)``. Each candidate ``p`` in ``steps`` is
    then scored by labelling a held-out row positive when its predicted
    positive-class probability is strictly greater than ``p``.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` / ``LogisticRegression``.
    y : binary target aligned with the rows of ``X``.
    steps : iterable of candidate thresholds, tried in order.
    test_size : fraction of rows held out for scoring (default 0.33).
    random_state : seed used for both the split and the classifier (default 42).

    Returns
    -------
    dict
        ``{"p": best_threshold, "F1-score": best_f1}``. On ties the earliest
        threshold in ``steps`` wins. If ``steps`` is empty the result is
        ``{"p": 0, "F1-score": float('-inf')}``.

    Notes
    -----
    The threshold is selected on the same held-out rows that the returned F1
    is computed on, so the reported score is an optimistic estimate.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    reg = LogisticRegression(random_state=random_state).fit(X_train, y_train)

    # Loop-invariant: score the held-out rows once, not once per threshold.
    pos_proba = reg.predict_proba(X_test)[:, 1]

    max_ = float('-inf')
    res = 0
    for p in steps:
        # f1_score's documented signature is (y_true, y_pred).
        val = f1_score(y_test, pos_proba > p)
        if max_ < val:  # strict comparison keeps the first best threshold
            max_ = val
            res = p
    return {"p": res, "F1-score": max_}

In [102]:
# Tune the cut-off for "toxic": 100 evenly spaced candidates in [0.1, 0.5].
y = df["toxic"]
steps = np.linspace(start=0.1, stop=0.5, num=100)
best_p_for_f1_score(X=comments_vec_train, y=y, steps=steps)

{'p': 0.2777777777777778, 'F1-score': 0.774173294265383}

In [103]:
# Tune the cut-off for "severe_toxic": 100 evenly spaced candidates in [0.01, 0.5].
y = df["severe_toxic"]
steps = np.linspace(start=0.01, stop=0.5, num=100)
best_p_for_f1_score(X=comments_vec_train, y=y, steps=steps)

{'p': 0.09414141414141414, 'F1-score': 0.4788732394366197}

In [106]:
# Tune the cut-off for "obscene": 100 evenly spaced candidates in [0.1, 0.5].
y = df["obscene"]
steps = np.linspace(start=0.1, stop=0.5, num=100)
best_p_for_f1_score(X=comments_vec_train, y=y, steps=steps)

{'p': 0.15252525252525254, 'F1-score': 0.8045335658238885}

In [109]:
# Tune the cut-off for "threat": 200 evenly spaced candidates in [0.01, 0.5].
y = df["threat"]
steps = np.linspace(start=0.01, stop=0.5, num=200)
best_p_for_f1_score(X=comments_vec_train, y=y, steps=steps)

{'p': 0.08633165829145728, 'F1-score': 0.4507042253521127}

In [111]:
# Tune the cut-off for "insult": 200 evenly spaced candidates in [0.1, 0.5].
y = df["insult"]
steps = np.linspace(start=0.1, stop=0.5, num=200)
best_p_for_f1_score(X=comments_vec_train, y=y, steps=steps)

{'p': 0.15427135678391962, 'F1-score': 0.7154241190012397}

In [112]:
# Tune the cut-off for "identity_hate": 300 evenly spaced candidates in [0.01, 0.5].
y = df["identity_hate"]
steps = np.linspace(start=0.01, stop=0.5, num=300)
best_p_for_f1_score(X=comments_vec_train, y=y, steps=steps)

{'p': 0.09521739130434782, 'F1-score': 0.4437927663734115}