In [13]:
# Location of the raw dataset file, relative to the notebook's working directory
file_path = './SMSSpamCollection'

# Pull every line into memory. Iterating over a text-mode file object yields
# the same newline-terminated strings that readlines() returns.
with open(file_path, encoding='utf-8') as file:
    data = list(file)


In [14]:
import pandas as pd

# Convert the raw lines into a two-column DataFrame (label, message).
# - Blank lines are skipped: they would otherwise become a row with an empty
#   label and a missing message.
# - The split is limited to the FIRST tab, so a message that itself contains
#   a tab stays in one field instead of producing extra columns (which makes
#   the DataFrame constructor raise on a column-count mismatch).
stripped_lines = (line.strip() for line in data)
rows = [text.split('\t', 1) for text in stripped_lines if text]
df = pd.DataFrame(rows, columns=['label', 'message'])

# Preview the DataFrame (pandas truncates the display of long frames)
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Encode the labels as integers: ham -> 0, spam -> 1.
# Series.map with a plain dict turns every value that is missing from the dict
# into NaN, so re-running this cell on the already-encoded column would wipe
# all labels. Falling back to the current value keeps the cell idempotent.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value))

# Separate the features (message text) from the target (encoded label)
X = df['message']
y = df['label']

# Hold out 30% of the rows for testing; stratify on y so both splits keep the
# same ham/spam proportions, and fix the seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Bag-of-words vectorization: the vocabulary is learned from the training
# split only and then reused, unchanged, to transform the test split
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes classifier on the token counts
model = MultinomialNB()
model.fit(X_train_vec, y_train)

# Predict labels for the held-out messages
y_pred = model.predict(X_test_vec)

# Evaluate the model: per-class precision/recall/F1 report plus overall accuracy
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

report, accuracy

('              precision    recall  f1-score   support\n\n           0       0.99      1.00      0.99      1449\n           1       0.97      0.91      0.94       224\n\n    accuracy                           0.98      1673\n   macro avg       0.98      0.95      0.96      1673\nweighted avg       0.98      0.98      0.98      1673\n',
 0.9838613269575612)

In [25]:
def classify_message(message, model, vectorizer):
    """
    Label a single text message as 'spam' or 'ham'.

    The message is wrapped in a one-element list and passed to
    ``vectorizer.transform``; the result is fed to ``model.predict`` and the
    first (only) prediction decides the label.

    :param message: The text of the message to classify.
    :param model: Fitted classifier exposing ``predict``.
    :param vectorizer: Fitted vectorizer exposing ``transform``.
    :return: 'spam' when the predicted class equals 1, otherwise 'ham'.
    """
    # transform() takes a collection of documents, hence the single-item list
    features = vectorizer.transform([message])
    predicted_labels = model.predict(features)
    if predicted_labels[0] == 1:
        return 'spam'
    return 'ham'

# Example usage: classify one sample message with the fitted model and vectorizer
example_message = "6 chances to win деньги"
result = classify_message(
    message=example_message,
    model=model,
    vectorizer=vectorizer,
)
print("The message is classified as:", result)

The message is classified as: spam
