In [1]:
import pandas as pd
import numpy as np
import re
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Comments to be scored; the raw strings are also kept as a plain list
# so they can be scored one by one later on.
test_df = pd.read_csv(
    "/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv"
)
texts = test_df.loc[:, "text"].tolist()

In [2]:
# Model 1: TF-IDF plus hand-crafted text features, fed into a linear regression

def clean_text(text):
    """Remove URLs, then drop everything that is not an ASCII letter, digit or whitespace.

    Args:
        text: Input string.

    Returns:
        The cleaned string. Whitespace is left untouched, so removing a URL
        can leave two adjacent spaces behind.
    """
    url_pattern = re.compile(r"http\S+")
    disallowed_chars = re.compile(r"[^a-zA-Z0-9\s]")
    return disallowed_chars.sub("", url_pattern.sub("", text))

# Custom text features
class TextFeatures(BaseEstimator, TransformerMixin):
    """Hand-crafted numeric features computed from each text.

    Produces one row per input text with five columns, in this order:
    character length, whitespace-separated word count, number of the
    characters ``!?.,;:``, uppercase characters divided by ``len(text) + 1``,
    and number of lowercased whitespace-separated tokens found in the swear list.
    """

    def fit(self, X, y=None):
        """Set the swear-word vocabulary; nothing is derived from ``X`` or ``y``.

        Returns:
            self
        """
        self.swear_list = {"fuck", "shit", "bitch", "damn", "ass", "crap", "dick"}
        return self

    def transform(self, X):
        """Compute the feature matrix for ``X``.

        Args:
            X: pandas Series of strings, or any other iterable of strings
                (list, tuple, ndarray).

        Returns:
            Float array of shape ``(n_texts, 5)``.
        """
        # `.apply` only exists on pandas objects, so wrap other iterables in a
        # Series; a Series is used as-is so its index and values are untouched.
        texts = X if isinstance(X, pd.Series) else pd.Series(list(X), dtype=object)

        features = pd.DataFrame(index=texts.index)
        features["text_length"] = texts.apply(len)
        features["word_count"] = texts.apply(lambda x: len(x.split()))
        features["punct_count"] = texts.apply(lambda x: sum(1 for char in x if char in "!?.,;:"))
        # The +1 in the denominator avoids division by zero for empty strings.
        features["uppercase_ratio"] = texts.apply(lambda x: sum(1 for c in x if c.isupper()) / (len(x) + 1))
        features["swear_word_count"] = texts.apply(lambda x: sum(1 for word in x.lower().split() if word in self.swear_list))
        # Explicit float dtype keeps the result numeric even when there are zero rows.
        return features.to_numpy(dtype=float)

# Read the labelled comments and add a cleaned copy of the text
train_df = pd.read_csv("/kaggle/input/train-ranking/train.csv")
train_df["clean_text"] = train_df["comment_text"].apply(clean_text)

# Regression target: weighted sum of the six label columns
label_cols = [
    "toxic", "severe_toxic", "obscene",
    "threat", "insult", "identity_hate",
]
weights = [0.32, 1.82, 0.16, 1.5, 0.64, 1.5]
train_df["severity"] = train_df[label_cols].dot(weights)

# Downsample the zero-severity rows to the number of rows with severity > 0,
# then shuffle the combined frame and renumber its index
toxic_df = train_df.loc[train_df["severity"] > 0]
non_toxic_df = train_df.loc[train_df["severity"] == 0].sample(
    n=len(toxic_df), random_state=42
)
balanced_df = pd.concat([toxic_df, non_toxic_df])
balanced_df = balanced_df.sample(frac=1.0, random_state=42)
balanced_df = balanced_df.reset_index(drop=True)

# Assemble the final model from the previously chosen hyper-parameters
tfidf = TfidfVectorizer(
    ngram_range=(1, 1),
    max_features=5000,
    min_df=3,
    max_df=0.75,
    stop_words="english",
)

# TF-IDF columns and the hand-crafted TextFeatures columns, side by side
combined_features = FeatureUnion(
    transformer_list=[
        ("tfidf", tfidf),
        ("extra_features", TextFeatures()),
    ]
)

pipeline = Pipeline(
    steps=[
        ("features", combined_features),
        ("linear", LinearRegression(fit_intercept=False)),
    ]
)

# NOTE(review): both branches receive the output of clean_text, which removes
# every character outside [a-zA-Z0-9\s]; the punct_count feature of
# TextFeatures is therefore always 0 on this input.
pipeline.fit(balanced_df["clean_text"], balanced_df["severity"])

# Score the competition comments with the fitted pipeline
test_df = pd.read_csv(
    "/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv"
)
test_df["clean_text"] = test_df["text"].apply(clean_text)
linear_scores = pipeline.predict(test_df["clean_text"])


In [3]:
# Model 2: BERT sequence classifier; the score is the weighted sum of its per-label sigmoid probabilities

# Tokenizer and classifier are both loaded from the same local directory
model_path = "/kaggle/input/bert-toxic-epoch1/other/default/1"
tokenizer = AutoTokenizer.from_pretrained(model_path)
bert = AutoModelForSequenceClassification.from_pretrained(model_path)

# Inference only: switch to evaluation mode, then move to the GPU when one is available
bert.eval()
bert.to("cuda" if torch.cuda.is_available() else "cpu")

def get_bert_sum(text):
    """Score one comment as a weighted sum of the classifier's per-label probabilities.

    Uses the module-level ``tokenizer`` and ``bert`` objects.

    Args:
        text: Comment to score; it is tokenized and truncated to at most 256 tokens.

    Returns:
        Scalar score: the dot product of the sigmoid of the logits with the
        fixed per-label weights.
    """
    # A single sequence needs no padding. Padding to max_length would make
    # every forward pass process 256 positions regardless of comment length.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    inputs = {k: v.to(bert.device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = bert(**inputs).logits
    # squeeze(0) drops only the batch dimension, leaving one value per label.
    probs = torch.sigmoid(logits).squeeze(0).cpu().numpy()
    # Same values as the weights used to build the training "severity" target.
    # Assumes the model's outputs are ordered toxic, severe_toxic, obscene,
    # threat, insult, identity_hate -- TODO confirm against the checkpoint.
    weights = [.32, 1.82, .16, 1.5, .64, 1.5]
    return np.dot(probs, weights)

# One forward pass per raw test comment, in the same order as `texts`
bert_scores = list(map(get_bert_sum, texts))

2025-04-20 21:19:23.823346: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745183964.016295      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745183964.072438      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Final score: equal-weight average of the linear-model and BERT estimates.
# The addition is element-wise, row i of one with row i of the other.
test_df["score"] = 0.5 * (linear_scores + bert_scores)

In [5]:
# Write the id/score pairs to submission.csv without the index column
submission = test_df.loc[:, ["comment_id", "score"]]
submission.to_csv("submission.csv", index=False)
print("submission complete!")

submission complete!
