In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/jigsaw-agile-community-rules/sample_submission.csv
/kaggle/input/jigsaw-agile-community-rules/train.csv
/kaggle/input/jigsaw-agile-community-rules/test.csv


In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)

2025-07-30 10:16:20.823573: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753870580.988819      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753870581.039715      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
df = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/train.csv')
df_test = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/test.csv')

df = df[['body', 'rule_violation']].dropna()
df_train, df_val = train_test_split(df, test_size=0.1, stratify=df['rule_violation'], random_state=42)


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

for df in [df_train, df_test]:
    df['body_len'] = df['body'].apply(len)
    df['num_links'] = df['body'].str.count(r"http\S+")
    df['num_exclaims'] = df['body'].str.count('!')
    df['num_upper'] = df['body'].str.count(r'[A-Z]{2,}')

meta_features = ['body_len', 'num_links', 'num_exclaims', 'num_upper']

# Word-level TF-IDF
word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), max_features=20000, stop_words='english')
X_word = word_vectorizer.fit_transform(df_train['body'])
X_test_word = word_vectorizer.transform(df_test['body'])

# Char-level TF-IDF
char_vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(4, 6), max_features=30000)
X_char = char_vectorizer.fit_transform(df_train['body'])
X_test_char = char_vectorizer.transform(df_test['body'])

# Meta matrisi
X_meta = df_train[meta_features].values
X_test_meta = df_test[meta_features].values


In [5]:
from scipy.sparse import hstack, csr_matrix

X = hstack([X_word, X_char, X_meta])
X_test = hstack([X_test_word, X_test_char, X_test_meta])

X = csr_matrix(X)
X_test = csr_matrix(X_test)
y = df_train['rule_violation'].values


In [6]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
import numpy as np

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(df_train))
test_preds = np.zeros(len(df_test))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    model = LGBMClassifier(n_estimators=500, learning_rate=0.05, random_state=fold)
    model.fit(X_tr, y_tr)

    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    test_preds += model.predict_proba(X_test)[:, 1] / skf.n_splits

    score = roc_auc_score(y_val, oof_preds[val_idx])
    print(f"Fold {fold+1} AUC: {score:.4f}")

print("\n✅ Mean OOF AUC:", roc_auc_score(y, oof_preds))


[LightGBM] [Info] Number of positive: 742, number of negative: 718
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037361 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 53912
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 2950
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.508219 -> initscore=0.032880
[LightGBM] [Info] Start training from score 0.032880
Fold 1 AUC: 0.8229
[LightGBM] [Info] Number of positive: 742, number of negative: 719
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036407 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54766
[LightGBM] [Info] Number of data points in the train set: 1461, number of used features: 2987
[Ligh

In [7]:
submission = pd.DataFrame({
    'row_id': df_test['row_id'],
    'rule_violation': test_preds
})
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv başarıyla oluşturuldu.")


✅ submission.csv başarıyla oluşturuldu.
