In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/jigsaw-agile-community-rules/sample_submission.csv
/kaggle/input/jigsaw-agile-community-rules/train.csv
/kaggle/input/jigsaw-agile-community-rules/test.csv


# Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from scipy.sparse import hstack


# DATA

In [3]:
df_train = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/train.csv')
df_test = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/test.csv')


#  TF-IDF

In [4]:
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2), stop_words='english')
X_tfidf = vectorizer.fit_transform(df_train['body'])
X_test_tfidf = vectorizer.transform(df_test['body'])

df_train['body_len'] = df_train['body'].apply(len)
df_test['body_len'] = df_test['body'].apply(len)

X = hstack([X_tfidf, df_train[['body_len']].values])
X_test = hstack([X_test_tfidf, df_test[['body_len']].values])

y = df_train['rule_violation']

# LightGBM ile 5-Fold CV

In [5]:
from scipy.sparse import csr_matrix

X = csr_matrix(X)         
X_test = csr_matrix(X_test)  

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = LGBMClassifier(n_estimators=500, learning_rate=0.1)
    model.fit(X_train, y_train)
    
    y_val_pred = model.predict_proba(X_val)[:,1]
    score = roc_auc_score(y_val, y_val_pred)
    print(f"Fold {fold+1} AUC: {score:.4f}")
    auc_scores.append(score)

print(f"\nMean AUC: {np.mean(auc_scores):.4f}")


[LightGBM] [Info] Number of positive: 825, number of negative: 798
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002187 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3030
[LightGBM] [Info] Number of data points in the train set: 1623, number of used features: 186
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.508318 -> initscore=0.033275
[LightGBM] [Info] Start training from score 0.033275
Fold 1 AUC: 0.7969
[LightGBM] [Info] Number of positive: 825, number of negative: 798
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002129 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3052
[LightGBM] [Info] Number of data points in the train set: 1623, number of used features: 184
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.508318 -> initscore=0.0

In [6]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = LGBMClassifier(n_estimators=500, learning_rate=0.1)
    model.fit(X_train, y_train)
    
    y_val_pred = model.predict_proba(X_val)[:,1]
    score = roc_auc_score(y_val, y_val_pred)
    print(f"Fold {fold+1} AUC: {score:.4f}")
    auc_scores.append(score)

print(f"\nMean AUC: {np.mean(auc_scores):.4f}")


[LightGBM] [Info] Number of positive: 825, number of negative: 798
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3030
[LightGBM] [Info] Number of data points in the train set: 1623, number of used features: 186
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.508318 -> initscore=0.033275
[LightGBM] [Info] Start training from score 0.033275
Fold 1 AUC: 0.7969
[LightGBM] [Info] Number of positive: 825, number of negative: 798
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3052
[LightGBM] [Info] Number of data points in the train set: 1623, number of used features: 184
[LightGBM

In [7]:
final_model = LGBMClassifier(n_estimators=500, learning_rate=0.1)
final_model.fit(X, y)

y_test_pred = final_model.predict_proba(X_test)[:,1]


[LightGBM] [Info] Number of positive: 1031, number of negative: 998
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005290 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4170
[LightGBM] [Info] Number of data points in the train set: 2029, number of used features: 253
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.508132 -> initscore=0.032531
[LightGBM] [Info] Start training from score 0.032531


In [8]:
submission = pd.DataFrame({
    'row_id': df_test['row_id'],
    'rule_violation': y_test_pred
})

submission.to_csv('submission.csv', index=False)
print("✅ submission.csv başarıyla oluşturuldu.")


✅ submission.csv başarıyla oluşturuldu.
