In [None]:
%pip install --upgrade pip
%pip install kagglehub pandas matplotlib seaborn tensorflow scikit-learn imblearn setuptools
%pip install --upgrade tensorflow

## Download

In [46]:
import kagglehub
import pandas as pd
import os


path = kagglehub.dataset_download('uciml/sms-spam-collection-dataset')

df = pd.read_csv(f'{path}/{os.listdir(path)[0]}', encoding='ISO-8859-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'text']

## Preprocessing

In [47]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

df['label'] = label_encoder.fit_transform(df['label'])

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df['text'].values.reshape(-1, 1), df['label'])

balanced_df = pd.DataFrame({'text': X_resampled.flatten(), 'label': y_resampled})

balanced_df['label'].value_counts()



label
0    4825
1    4825
Name: count, dtype: int64

## Learning

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(
    balanced_df['text'], balanced_df['label'], test_size=0.2, random_state=42
)

tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


model = LogisticRegression()
model.fit(X_train_tfidf, y_train)


y_pred = model.predict(X_test_tfidf)
y_prob = model.predict_proba(X_test_tfidf)[:, 1]

print(classification_report(y_test, y_pred))
print('ROC-AUC:', roc_auc_score(y_test, y_prob))


              precision    recall  f1-score   support

           0       0.98      0.99      0.99       985
           1       0.99      0.98      0.99       945

    accuracy                           0.99      1930
   macro avg       0.99      0.99      0.99      1930
weighted avg       0.99      0.99      0.99      1930

ROC-AUC: 0.9991426959954879


## Testing

In [55]:
text = [
    'Free coupons due to weekend. Hurry up!',
    'New arrival of out paste name product. 99% discount on old items',
    'Where you bought that hoodie?',
    'Where you bought that hoodie? Summer sale on asos.com. 50% discount on every item for two days!',
    'Where you bought that hoodie? Summer sale on asos.com. Upgrade you wardrobe with us!',
    'Summer sale on asos.com. 50% discount on every item for two days!',
    'Summer sale on asos.com. Upgrade you wardrobe with us!'
]

y_manual = model.predict(tfidf.transform(text))
predicted = pd.DataFrame({'text': text, 'label': label_encoder.inverse_transform(y_manual)})
predicted

Unnamed: 0,text,label
0,Free coupons due to weekend. Hurry up!,spam
1,New arrival of out paste name product. 99% discount on old items,spam
2,Where you bought that hoodie?,ham
3,Where you bought that hoodie? Summer sale on asos.com. 50% discount on every item for two days!,spam
4,Where you bought that hoodie? Summer sale on asos.com. Upgrade you wardrobe with us!,ham
5,Summer sale on asos.com. 50% discount on every item for two days!,spam
6,Summer sale on asos.com. Upgrade you wardrobe with us!,spam
