<a href="https://colab.research.google.com/github/deniss97/Baron/blob/master/%D0%9E%D1%82%D0%B1%D0%BE%D1%802.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import optuna
import wandb
from functools import partial

train_df = pd.read_csv('train_spam.csv')
test_df = pd.read_csv('test_spam.csv')

# RandomForest_improved2

In [5]:
from sklearn.calibration import CalibratedClassifierCV
from scipy.sparse import hstack
from textblob import TextBlob
from sklearn.preprocessing import StandardScaler

def extract_features(data):
    data['text_length'] = data['text'].apply(len)
    data['unique_words'] = data['text'].apply(lambda x: len(set(x.split())))
    data['sentiment'] = data['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
    return data

wandb.init(project='spam_detection_improved', name='RandomForest_improved2')

train_df = extract_features(train_df)
test_df = extract_features(test_df)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X_text = vectorizer.fit_transform(train_df['text'])
X_test_text = vectorizer.transform(test_df['text'])

scaler = StandardScaler()
additional_features = scaler.fit_transform(train_df[['text_length', 'unique_words', 'sentiment']])
additional_features_test = scaler.transform(test_df[['text_length', 'unique_words', 'sentiment']])

X = hstack([X_text, additional_features])
X_test = hstack([X_test_text, additional_features_test])

y = train_df['text_type'].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 100),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'class_weight': 'balanced'
    }

    model = RandomForestClassifier(**params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')
    roc_auc = scores.mean()
    wandb.log({'roc_auc': roc_auc})
    return roc_auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

best_params = study.best_params
best_model = RandomForestClassifier(**best_params)

calibrated_model = CalibratedClassifierCV(best_model, method='sigmoid', cv=5)
calibrated_model.fit(X_train, y_train)

test_pred = calibrated_model.predict_proba(X_test)[:, 1]
test_df['score'] = test_pred
test_df[['text', 'score']].to_csv('spam_predictions_improved.csv', index=False)

wandb.finish()


[I 2024-05-02 13:29:42,299] A new study created in memory with name: no-name-5de31933-8292-4013-a63c-cf1d47e51cb7
[I 2024-05-02 13:31:25,710] Trial 0 finished with value: 0.9716259468024064 and parameters: {'n_estimators': 934, 'max_depth': 67, 'min_samples_split': 9, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'criterion': 'entropy'}. Best is trial 0 with value: 0.9716259468024064.
[I 2024-05-02 13:31:52,987] Trial 1 finished with value: 0.9776818881964531 and parameters: {'n_estimators': 717, 'max_depth': 53, 'min_samples_split': 17, 'min_samples_leaf': 10, 'max_features': 'log2', 'criterion': 'entropy'}. Best is trial 1 with value: 0.9776818881964531.
[I 2024-05-02 13:32:33,092] Trial 2 finished with value: 0.975245515393533 and parameters: {'n_estimators': 250, 'max_depth': 49, 'min_samples_split': 14, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'criterion': 'gini'}. Best is trial 1 with value: 0.9776818881964531.
[I 2024-05-02 13:32:39,956] Trial 3 finished with value: 0.960

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
roc_auc,▅▇▆▃▁▆▆▅▇█▇▇▇██▇█▇▇▆

0,1
roc_auc,0.97706


# RandomForest_improved3

In [3]:
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.calibration import CalibratedClassifierCV
import re
from textblob import TextBlob
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import wandb
import optuna

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
wandb.init(project="spam_detection_improved", name="RandomForest_improved3")

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and word.isalpha()]
    return ' '.join(tokens)

def extract_features(data):
    data['text'] = data['text'].apply(preprocess_text)
    data['text_length'] = data['text'].apply(len)
    data['unique_words'] = data['text'].apply(lambda x: len(set(x.split())))
    data['sentiment'] = data['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
    data['links_count'] = data['text'].apply(lambda x: len(re.findall(r'http[s]?://\S+', x)))
    data['email_count'] = data['text'].apply(lambda x: len(re.findall(r'\S+@\S+', x)))
    return data

train_df = extract_features(train_df)
test_df = extract_features(test_df)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X_text = vectorizer.fit_transform(train_df['text'])
X_test_text = vectorizer.transform(test_df['text'])

scaler = StandardScaler()
additional_features = scaler.fit_transform(train_df[['text_length', 'unique_words', 'sentiment', 'links_count', 'email_count']])
additional_features_test = scaler.transform(test_df[['text_length', 'unique_words', 'sentiment', 'links_count', 'email_count']])

X_combined = hstack([X_text, additional_features])
X_test_combined = hstack([X_test_text, additional_features_test])

ipca = IncrementalPCA(n_components=100)
X_train_pca = ipca.fit_transform(X_combined.toarray())
X_test_pca = ipca.transform(X_test_combined.toarray())

y = train_df['text_type'].values
X_train, X_val, y_train, y_val = train_test_split(X_train_pca, y, test_size=0.2, stratify=y, random_state=42)

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 100),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'class_weight': 'balanced'
    }
    model = RandomForestClassifier(**params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')
    roc_auc = scores.mean()
    wandb.log({'roc_auc': roc_auc})
    return roc_auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

best_params = study.best_params
best_model = RandomForestClassifier(**best_params)
calibrated_model = CalibratedClassifierCV(best_model, method='sigmoid', cv=5)
calibrated_model.fit(X_train, y_train)

test_pred = calibrated_model.predict_proba(X_test_pca)[:, 1]
test_df['score'] = test_pred
test_df.to_csv('spam_predictions_improved.csv', index=False)

wandb.finish()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[I 2024-05-02 11:26:27,851] A new study created in memory with name: no-name-e98f0c7e-00d0-4215-978d-5b26bec3c609
[I 2024-05-02 11:36:50,058] Trial 0 finished with value: 0.9745904743007175 and parameters: {'n_estimators': 891, 'max_depth': 46, 'min_samples_split': 14, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'criterion': 'entropy'}. Best is trial 0 with value: 0.9745904743007175.
[I 2024-05-02 11:47:29,046] Trial 1 finished with value: 0.972774607682209 and parameters: {'n_estimators': 992, 'max_depth': 11, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'criterion': 'entropy'}. Best is trial 0 with value: 0.9745904743007175.
  warn(
  warn(
  warn(
  warn(
  warn(
[I 2024-05-02 11:57:31,954] Trial 2 finished with value: 0.9722494735327425 and parameters: {'n_estimators': 955, 'max_depth': 40, 'min_samples_split': 19, 'min_samples_leaf': 10, 'max_features': 'auto', 'criterion': 'entropy'}. Best is trial 0 with value: 0.9745904743007175.
  warn(
  warn(
  w

VBox(children=(Label(value='0.001 MB of 0.022 MB uploaded\r'), FloatProgress(value=0.04723963775687914, max=1.…

0,1
roc_auc,█▆▅▆▄█▄▅▂▁▆████▇▇▇▇█

0,1
roc_auc,0.97442
