In [None]:
# Turn on Internet in Notebook Settings → Internet → On, then save.

# Install only the extras we truly need:
!pip install beautifulsoup4 joblib pyarrow



In [2]:
import nltk

# Download the two NLTK packages needed for preprocessing, in order:
# the stopword lists first, then punkt.
download_status = [nltk.download(resource) for resource in ('stopwords', 'punkt')]

# Leave the status returned by the final download as the cell's displayed value.
download_status[-1]

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import pandas as pd

# Kaggle mounts attached datasets under /kaggle/input/; change the path
# below if your dataset slug differs.
dataset_path = '/kaggle/input/phish-ai-dataset/dataset.parquet'
df = pd.read_parquet(dataset_path)

# Quick sanity check: overall size and how many rows carry each label.
label_counts = df['label'].value_counts()
print("Shape:", df.shape)
print(label_counts)

# Preview the first rows as the cell's rich output.
df.head()

Shape: (84438, 2)
label
1    43827
0    40611
Name: count, dtype: int64


Unnamed: 0,body,label
0,,0
1,[zzzzteana] RE: Alexander Martin A posted: Tas...,0
2,[zzzzteana] Moscow bomber Man Threatens Explos...,0
3,[IRR] Klez: The Virus That Won't Die Klez: The...,0
4,> making a pizza a deep-pie; I just had to jum...,0


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed pins the split.
texts, labels = df['body'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42
)

# TF-IDF over unigrams and bigrams (English stop words removed, vocabulary
# capped at 50,000 terms) feeding a logistic-regression classifier.
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=50000)
clf = LogisticRegression(C=4, max_iter=300)

# Learn the vocabulary and IDF weights from the training texts only, then train.
Xtr = vect.fit_transform(X_train)
clf.fit(Xtr, y_train)

# Score the held-out texts using the already-fitted vectorizer.
Xte = vect.transform(X_test)
probs = clf.predict_proba(Xte)[:, 1]  # second probability column, fed to ROC-AUC below
preds = clf.predict(Xte)

print(classification_report(y_test, preds))
print("ROC‑AUC:", roc_auc_score(y_test, probs))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      8122
           1       0.99      0.99      0.99      8766

    accuracy                           0.99     16888
   macro avg       0.99      0.99      0.99     16888
weighted avg       0.99      0.99      0.99     16888

ROC‑AUC: 0.9991908980113502


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Wrap vectorizer + classifier in a single pipeline so the TF-IDF step is
# re-fit inside every cross-validation fit rather than once on all of X_train.
pipe = Pipeline([
    ("vect", TfidfVectorizer(stop_words="english")),
    # scikit-learn documents `random_state` as used by the liblinear solver
    # when shuffling data; pin it so repeated runs of the search are seeded.
    ("clf",  LogisticRegression(solver="liblinear", max_iter=500, random_state=42)),
])

# Search space: vocabulary size, unigrams vs. unigrams+bigrams, and five
# values of C spaced logarithmically between 0.1 and 10.
param_dist = {
    "vect__max_features": [20000, 50000],
    "vect__ngram_range": [(1,1), (1,2)],
    "clf__C": np.logspace(-1, 1, 5),
}

# Sample 6 parameter combinations, each scored by 3-fold CV ROC-AUC.
search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=6,
    cv=3,
    scoring="roc_auc",
    n_jobs=2,
    random_state=42,
    verbose=2
)
search.fit(X_train, y_train)
print("Best params:", search.best_params_)
# Report the score that goes with those parameters; without it the search
# result cannot be compared against any other model.
print("Best CV ROC-AUC:", search.best_score_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best params: {'vect__ngram_range': (1, 2), 'vect__max_features': 20000, 'clf__C': 10.0}


In [6]:
import joblib

# Persist the fitted vectorizer and classifier as two separate joblib files
# in the current working directory.
# NOTE(review): `vect` and `clf` are the objects fitted in the baseline cell,
# not the tuned pipeline held by `search` — confirm that is the intended export.
vectorizer_files = joblib.dump(vect, 'vect.joblib')
classifier_files = joblib.dump(clf, 'clf.joblib')

# Display the list of filenames written by the last dump.
classifier_files

['clf.joblib']