In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import re

In [2]:
def clean_text(text):
    """Strip URLs and every non-alphanumeric, non-whitespace character.

    Args:
        text: The raw comment. ``None`` and float NaN (how pandas represents
            an empty CSV cell) are treated as an empty comment; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string. Letter case and whitespace are kept as-is, so
        removing a URL or a punctuation mark can leave consecutive spaces.
    """
    # `text != text` is only true for NaN; avoids needing numpy/pandas here.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Drop URLs first: once punctuation is stripped, "http://a.b" would
    # otherwise survive as the token "httpab".
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text

In [3]:
class TextFeatures(BaseEstimator, TransformerMixin):
    """Transformer that turns each input text into five surface statistics.

    Output columns, in order:

    * ``text_length``      - number of characters
    * ``word_count``       - number of whitespace-separated tokens
    * ``punct_count``      - number of characters that are one of ``!?.,;:``
    * ``uppercase_ratio``  - upper-case characters divided by ``length + 1``
      (the ``+ 1`` keeps the ratio defined for empty strings)
    * ``swear_word_count`` - lower-cased tokens found in the swear list

    NOTE(review): in this notebook the transformer is fed text that already
    went through ``clean_text``, which removes every character counted by
    ``punct_count``; that column is therefore constant 0 for those inputs.
    """

    FEATURE_NAMES = (
        "text_length",
        "word_count",
        "punct_count",
        "uppercase_ratio",
        "swear_word_count",
    )
    PUNCTUATION = "!?.,;:"
    # Kept at class level so `transform` also works on an unfitted instance.
    DEFAULT_SWEAR_WORDS = frozenset(
        {"fuck", "shit", "bitch", "damn", "ass", "crap", "dick"}
    )

    def fit(self, X, y=None):
        """Nothing is learned from the data; only installs the swear list.

        Args:
            X: Ignored.
            y: Ignored.

        Returns:
            ``self``, so the call can be chained.
        """
        self.swear_list = set(self.DEFAULT_SWEAR_WORDS)
        return self

    def transform(self, X):
        """Compute the feature matrix for ``X``.

        Args:
            X: Any iterable of strings (pandas Series, list, 1-D array, ...).
                Non-string entries raise ``TypeError`` / ``AttributeError``.

        Returns:
            A float array of shape ``(n_texts, 5)`` with the columns listed
            in ``FEATURE_NAMES``.
        """
        swear_words = getattr(self, "swear_list", self.DEFAULT_SWEAR_WORDS)
        rows = [self._featurize(text, swear_words) for text in X]
        features = pd.DataFrame(rows, columns=list(self.FEATURE_NAMES))
        # Explicit dtype keeps the result float even for empty input.
        return features.to_numpy(dtype=float)

    def _featurize(self, text, swear_words):
        """Return the five statistics for a single string as a tuple."""
        return (
            len(text),
            len(text.split()),
            sum(1 for char in text if char in self.PUNCTUATION),
            sum(1 for char in text if char.isupper()) / (len(text) + 1),
            sum(1 for word in text.lower().split() if word in swear_words),
        )

In [4]:
# Label columns and the weight each one contributes to the severity target.
label_cols = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
weights = [0.32, 1.82, 0.16, 1.5, 0.64, 1.5]

# Load the training comments, then append the cleaned text and the
# severity score (weighted sum of the label columns).
train_df = pd.read_csv("/kaggle/input/train-ranking/train.csv").assign(
    clean_text=lambda frame: frame["comment_text"].apply(clean_text),
    severity=lambda frame: frame[label_cols].dot(weights),
)

In [5]:
# Rows with any positive severity are all kept.
toxic_df = train_df[train_df["severity"] > 0]

# Down-sample the zero-severity rows to the same size. `sample` raises
# ValueError when asked for more rows than exist (without replacement),
# so cap n at the pool size in case the zero-severity pool is the smaller one.
non_toxic_pool = train_df[train_df["severity"] == 0]
non_toxic_df = non_toxic_pool.sample(
    n=min(len(toxic_df), len(non_toxic_pool)),
    random_state=42,
)

# Concatenate, shuffle (frac=1.0 returns every row in random order) and
# renumber the index.
balanced_df = (
    pd.concat([toxic_df, non_toxic_df])
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

In [6]:
# Bag-of-words branch: TF-IDF over at most 5000 terms, English stop words removed.
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)

# Both branches receive the same text input; their outputs are joined
# column-wise by the FeatureUnion.
combined_features = FeatureUnion(
    transformer_list=[
        ("tfidf", tfidf),
        ("extra_features", TextFeatures()),
    ]
)

# Feature extraction followed by a random-forest regressor.
pipeline = Pipeline(
    steps=[
        ("features", combined_features),
        (
            "rf",
            RandomForestRegressor(
                random_state=42,
                max_depth=10,
                n_estimators=100,
            ),
        ),
    ]
)

In [7]:
# Load the comments to be scored and clean them the same way as the training text.
test_df = pd.read_csv(
    "/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv"
).assign(clean_text=lambda frame: frame["text"].apply(clean_text))

# Fit on the balanced training set, then score the test comments.
pipeline.fit(balanced_df["clean_text"], balanced_df["severity"])
rf_scores = pipeline.predict(test_df["clean_text"])

In [8]:
# Attach the predictions to the test frame; later cells read this column.
test_df["score"] = rf_scores

# The submission file holds only the comment id and its score.
submission = test_df.loc[:, ["comment_id", "score"]]
submission.to_csv(path_or_buf="submission.csv", index=False)

In [9]:
# Validation pairs: each row has a `less_toxic` and a `more_toxic` comment.
val_df = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv")
val_df["less_toxic_clean"] = val_df["less_toxic"].apply(clean_text)
val_df["more_toxic_clean"] = val_df["more_toxic"].apply(clean_text)

# Cleaned test text -> predicted score. If two test comments clean to the
# same string, the later one wins.
test_map = dict(zip(test_df["clean_text"], test_df["score"]))


def agree(row):
    """Check whether the predicted scores order one validation pair correctly.

    Args:
        row: A row of ``val_df`` with ``less_toxic_clean`` and
            ``more_toxic_clean`` entries.

    Returns:
        ``1`` if the less-toxic comment scored strictly lower than the
        more-toxic one, ``0`` otherwise (ties count as disagreement), or
        ``None`` when either comment has no entry in ``test_map``.
    """
    a = test_map.get(row["less_toxic_clean"], None)
    b = test_map.get(row["more_toxic_clean"], None)
    if a is None or b is None:
        return None
    return int(a < b)


val_df["agreement"] = val_df.apply(agree, axis=1)
# Pairs that could not be looked up are excluded from the average, so the
# accuracy below covers only the pairs whose both comments appear in test_df.
valid_agreement = val_df["agreement"].dropna().mean()

print("Validation Agreement Accuracy:", valid_agreement)

Validation Agreement Accuracy: 0.5481241759558911
