In [5]:
import pandas as pd
import torch
from torchtext.legacy import data
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline



In [9]:

# --- Configuration and reproducibility -------------------------------------
# The spaCy model is loaded here but is not referenced by the TF-IDF pipeline
# in the cells visible in this notebook.
nlp = spacy.load('en_core_web_sm')
SEED = 4444
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# --- Load data -------------------------------------------------------------
train_data = pd.read_csv('train_spam.csv')
test_data = pd.read_csv('test_spam.csv')

# Show the label distribution before modelling.
print(train_data['text_type'].value_counts())

# --- Hold-out split --------------------------------------------------------
# A single 80/20 split seeded with SEED (the original cell computed this
# identical split twice and re-imported train_test_split in between).
# NOTE(review): the printed counts show the labels are imbalanced; consider
# passing stratify=train_data['text_type'] -- this would change the split and
# therefore the reported score.
X_train, X_val, y_train, y_val = train_test_split(
    train_data['text'],
    train_data['text_type'],
    test_size=0.2,
    random_state=SEED,
)

# --- Model: TF-IDF features -> multinomial Naive Bayes ---------------------
tfidf_vect = TfidfVectorizer(stop_words='english', max_features=1000)
model = Pipeline([
    ('tfidf', tfidf_vect),
    ('clf', MultinomialNB()),
])
model.fit(X_train, y_train)

# --- Validation ------------------------------------------------------------
# Select the 'spam' probability column by label instead of a hard-coded
# position, so the score cannot silently flip if the class order changes.
# Raises ValueError if the fitted model has no 'spam' class.
spam_col = list(model.classes_).index('spam')
y_pred = model.predict_proba(X_val)[:, spam_col]
roc_auc = roc_auc_score(y_val, y_pred)
print(f'ROC-AUC score: {roc_auc}')


text_type
ham     11469
spam     4809
Name: count, dtype: int64
ROC-AUC score: 0.9544561907555054


In [11]:

# Score the test set with the fitted pipeline and export text/score pairs.
test_texts = test_data['text'].tolist()

# Look up the 'spam' column by label rather than assuming it sits at index 1;
# raises ValueError if the fitted model has no 'spam' class.
spam_col = list(model.classes_).index('spam')
test_probs = model.predict_proba(test_texts)[:, spam_col]

results = pd.DataFrame({'text': test_texts, 'score': test_probs})

# Persist predictions next to the notebook, then echo them to the cell output.
results.to_csv('test_predictions.csv', index=False)
print(results)


                                                   text     score
0     j jim whitehead ejw cse ucsc edu writes j you ...  0.047463
1     original message from bitbitch magnesium net p...  0.069593
2     java for managers vince durasoft who just taug...  0.005510
3                  there is a youtuber name saiman says  0.072553
4     underpriced issue with high return on equity t...  0.173590
...                                                 ...       ...
4065  husband to wifetum meri zindagi hoorwifeor kya...  0.247623
4066  baylor enron case study cindy yes i shall co a...  0.000059
4067                           boring as compared to tp  0.295270
4068  hellogorgeous hows u my fone was on charge lst...  0.143612
4069  energy conference mark we are really swamped a...  0.000412

[4070 rows x 2 columns]
