In [2]:
import pandas as pd
import re
from pathlib import Path

data_path = Path("../data/raw/spam.csv")

# Read the raw CSV (latin-1 encoded), keep only its first two columns and
# give them descriptive names, all in one chained expression.
df = (
    pd.read_csv(data_path, encoding='latin-1')
    .iloc[:, :2]
    .set_axis(['label', 'message'], axis=1)
)

print(f"Данные загружены. Размер: {df.shape}")
print("\nПервые 3 строки:")
df.head(3)

Данные загружены. Размер: (5572, 2)

Первые 3 строки:


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [3]:
# Count how many messages carry each label.
class_counts = df['label'].value_counts()
print("Распределение классов:", class_counts, sep="\n")

# Share of all rows that are labelled as spam.
n_messages = len(df)
spam_ratio = class_counts['spam'] / n_messages
print(f"\nДоля спама: {spam_ratio:.2%}")

Распределение классов:
label
ham     4825
spam     747
Name: count, dtype: int64

Доля спама: 13.41%


In [4]:
# Show the first message of each class; a single .loc lookup selects the
# rows by label and the 'message' column at once.
print("Пример сообщения-спама:")
first_spam = df.loc[df['label'] == 'spam', 'message'].iloc[0]
print(first_spam)

print("\nПример сообщения, не являющегося спамом:")
first_ham = df.loc[df['label'] == 'ham', 'message'].iloc[0]
print(first_ham)

Пример сообщения-спама:
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

Пример сообщения, не являющегося спамом:
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


In [5]:
import nltk

# Fetch every NLTK resource the preprocessing below relies on, so the
# notebook also runs on a machine with an empty nltk_data directory.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
# pos_tag() needs the perceptron tagger model. Older NLTK releases look it up
# as 'averaged_perceptron_tagger', newer ones as 'averaged_perceptron_tagger_eng',
# so both ids are requested.
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('wordnet', quiet=True)

True

In [6]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import TreebankWordTokenizer

def get_wordnet_pos(treebank_tag):
    """Translate a POS tag returned by ``pos_tag`` into a WordNet POS code.

    Parameters
    ----------
    treebank_tag : str
        Tag string; only its leading letter is inspected.

    Returns
    -------
    str
        ``'a'`` for tags starting with ``J``, ``'v'`` for ``V``, ``'n'`` for
        ``N`` and ``'r'`` for ``R``. Any other tag falls back to ``'n'``.
    """
    prefix_to_pos = (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r'))
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unknown tag families are lemmatized with the same code as 'N' tags.
    return 'n'

# NLTK helpers shared by every clean_text() call; filled on first use.
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Build the tokenizer, stop-word set and lemmatizer once and reuse them."""
    if not _CLEAN_TEXT_RESOURCES:
        # All three objects are created before update() runs, so a failure
        # (e.g. a missing corpus) never leaves the cache half-filled.
        _CLEAN_TEXT_RESOURCES.update(
            tokenizer=TreebankWordTokenizer(),
            stop_words=frozenset(stopwords.words('english')),
            lemmatizer=WordNetLemmatizer(),
        )
    return _CLEAN_TEXT_RESOURCES


def clean_text(text):
    """
    Preprocess a message and lemmatize its words.

    Steps: remove every character that is not an ASCII letter or whitespace,
    lowercase, tokenize, drop English stop words and tokens shorter than
    three characters, then lemmatize each remaining token using its POS tag.

    Parameters
    ----------
    text : str
        Raw message. Non-string values (for example ``None`` or a NaN coming
        from a missing cell) are treated as an empty message.

    Returns
    -------
    str
        Lemmatized tokens joined by single spaces; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        return ''

    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    if not text.strip():
        # Nothing to tokenize, so the NLTK resources are not needed at all.
        return ''

    resources = _get_clean_text_resources()
    stop_words = resources['stop_words']
    tokens = [
        w for w in resources['tokenizer'].tokenize(text)
        if w not in stop_words and len(w) >= 3
    ]

    # Tag the already-filtered tokens; each tag selects the WordNet POS
    # handed to the lemmatizer.
    lemmatizer = resources['lemmatizer']
    tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in pos_tag(tokens)
    ]

    return ' '.join(tokens)

In [7]:
# Run the preprocessing on the very first message to inspect the result.
sample_message = df['message'].iloc[0]
cleaned = clean_text(sample_message)

print("Исходный текст:", repr(sample_message), sep="\n")
print("\nОчищенный текст:", repr(cleaned), sep="\n")
print("\nСлова после обработки:", cleaned.split(), sep="\n")

Исходный текст:
'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

Очищенный текст:
'jurong point crazy available bugis great world buffet cine get amore wat'

Слова после обработки:
['jurong', 'point', 'crazy', 'available', 'bugis', 'great', 'world', 'buffet', 'cine', 'get', 'amore', 'wat']


In [8]:
# Preprocess every message and store the result next to the original text.
df['cleaned'] = df['message'].map(clean_text)

print("Обработка завершена. Пример:")
preview = df[['message', 'cleaned']].head(2)
print(preview)

Обработка завершена. Пример:
                                             message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   

                                             cleaned  
0  jurong point crazy available bugis great world...  
1                                 lar joking wif oni  


In [9]:
from collections import defaultdict

# word -> how many times it occurs in messages of each class
vocab = defaultdict(lambda: {'spam': 0, 'ham': 0})

# Walk the label and cleaned-text columns in parallel.
for label, cleaned_text in zip(df['label'], df['cleaned']):
    words = cleaned_text.split()
    for word in words:
        vocab[word][label] += 1

print(f"Словарь построен. Размер: {len(vocab)} слов")

Словарь построен. Размер: 7246 слов


In [10]:
# Minimum number of occurrences (spam + ham combined) a word needs to stay
# in the vocabulary.
MIN_FREQ = 2

filtered_vocab = {
    word: counts
    for word, counts in vocab.items()
    if counts['spam'] + counts['ham'] >= MIN_FREQ
}

# Report the vocabulary size after filtering next to the size before it.
print(f"После фильтрации: {len(filtered_vocab)} слов (было {len(vocab)})")

После фильтрации: 3231, до неё слов (было 7246)


In [11]:
# Class priors: the share of messages carrying each label.
labels = df['label']
total = len(df)
total_spam = labels.eq('spam').sum()
total_ham = labels.eq('ham').sum()

p_spam, p_ham = total_spam / total, total_ham / total

print(f"P(spam) = {p_spam:.4f}")
print(f"P(ham)  = {p_ham:.4f}")

P(spam) = 0.1341
P(ham)  = 0.8659


In [12]:
# Vocabulary size: the number of distinct words kept in filtered_vocab.
V = len(filtered_vocab)

def laplace_prob(count, total, V, alpha=1):
    """Additively smoothed estimate of a word's probability within a class.

    Computes ``(count + alpha) / (total + alpha * V)``. With the default
    ``alpha=1`` this is add-one (Laplace) smoothing, so a word with a zero
    count still gets a non-zero probability.

    Parameters
    ----------
    count : int
        Occurrences of the word in the class.
    total : int
        Total number of word occurrences in the class.
    V : int
        Vocabulary size.
    alpha : float, optional
        Smoothing strength added to every count. Defaults to 1.

    Returns
    -------
    float
        The smoothed probability.

    Raises
    ------
    ZeroDivisionError
        If ``total + alpha * V`` equals zero.
    """
    return (count + alpha) / (total + alpha * V)

# Total number of word occurrences per class, counted over the filtered vocabulary.
total_words_spam = 0
total_words_ham = 0
for per_class_counts in filtered_vocab.values():
    total_words_spam += per_class_counts['spam']
    total_words_ham += per_class_counts['ham']

# Worked example: smoothed likelihood of one word under each class.
word = "free"
word_counts = filtered_vocab.get(word)
if word_counts is None:
    print(f"Слово '{word}' отфильтровано или не встречается.")
else:
    p_free_given_spam = laplace_prob(word_counts['spam'], total_words_spam, V)
    p_free_given_ham = laplace_prob(word_counts['ham'], total_words_ham, V)
    print(f"P('{word}' | spam) = {p_free_given_spam:.6f}")
    print(f"P('{word}' | ham)  = {p_free_given_ham:.6f}")

P('free' | spam) = 0.017923
P('free' | ham)  = 0.001744


In [13]:
import os

# Make sure the output folder exists before writing into it.
processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)

# Store the full frame, including the 'cleaned' column, without the index.
cleaned_csv = "../data/processed/spam_cleaned.csv"
df.to_csv(cleaned_csv, index=False)
print("Полный обработанный датасет сохранён: data/processed/spam_cleaned.csv")

Полный обработанный датасет сохранён: data/processed/spam_cleaned.csv


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing. The split is stratified on the
# label and uses a fixed random_state so it is repeatable.
split_options = dict(test_size=0.2, stratify=df['label'], random_state=42)
df_train, df_test = train_test_split(df, **split_options)

print(f"Train: {len(df_train)} сообщений ({df_train['label'].value_counts().to_dict()})")
print(f"Test:  {len(df_test)} сообщений ({df_test['label'].value_counts().to_dict()})")

Train: 4457 сообщений ({'ham': 3859, 'spam': 598})
Test:  1115 сообщений ({'ham': 966, 'spam': 149})


In [16]:
# Write both parts of the split to CSV without the index column.
for split_frame, split_path in (
    (df_train, "../data/processed/train.csv"),
    (df_test, "../data/processed/test.csv"),
):
    split_frame.to_csv(split_path, index=False)

print(
    "Train и test сохранены:",
    "- data/processed/train.csv",
    "- data/processed/test.csv",
    sep="\n",
)

Train и test сохранены:
- data/processed/train.csv
- data/processed/test.csv
