In [27]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk import word_tokenize, wordpunct_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, train_test_split, learning_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 


path = 'C:/Users/S451/Desktop/nlp/06.03_ml_hw_pr/smsspamcollection/SMSSpamCollection'
messages = pd.read_csv(path, sep='\t', names=["label", "message"])

# 1 задание

In [2]:
messages.groupby('label').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,message
label,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,count,4825
ham,unique,4516
ham,top,"Sorry, I'll call later"
ham,freq,30
spam,count,747
spam,unique,653
spam,top,Please call our customer service representativ...
spam,freq,4


Выборка несбалансирована: 4825 сообщений ham и 747 сообщений spam
DummyCassifier в этом случае будет помечать как ham каждое сообщение, что, конечно, дает хорошую точность, но с задачей определения спама не справляется

In [3]:
from sklearn.utils import shuffle

training_set_spam = messages[messages['label'] == 'spam'][:700]
training_set_ham = messages[messages['label'] == 'ham'][:700]
training_set = pd.concat([training_set_spam, training_set_ham])
training_set = shuffle(training_set)

In [4]:
def model(parameter_name, parameter, training_set):
    if parameter_name == '0':
        bow = CountVectorizer()
    elif parameter_name == 't':
        bow = CountVectorizer(tokenizer=parameter)
    elif parameter_name == 'sw':
        bow = CountVectorizer(stop_words='english')
    elif parameter_name == 'min_df':
        bow = CountVectorizer(min_df=parameter)
    elif parameter_name == 'max_df':
        bow = CountVectorizer(max_df=parameter)
    elif parameter_name == 'vectorizer':
        bow = TfidfVectorizer()
    
    bow.fit_transform(training_set['message'])

    bowed_messages = bow.transform(training_set['message'])

    naive_model = MultinomialNB()
    naive_model.fit(bowed_messages, training_set['label'])

    label_pred = naive_model.predict(bowed_messages)
    cv_results = cross_val_score(naive_model, bowed_messages, training_set['label'], cv=10, scoring='accuracy')
    return [round(cv_results.mean(), 3), round(cv_results.std(), 3)]

In [5]:
print(model('0', 0, training_set))

[0.95899999999999996, 0.021999999999999999]


1) разная токенизация: в одном случае знаки препинания удалять, в другом — считать их токенами

In [6]:
print(model('t', word_tokenize, training_set))

[0.95799999999999996, 0.02]


In [7]:
print(model('t', wordpunct_tokenize, training_set))

[0.97299999999999998, 0.016]


При подсчете знаков препинания результат улучшился

2) лемматизация (отсутствие лемматизации, стемминг, лемматизация)

In [8]:
stemmer = PorterStemmer()

def stemming(text):
    text = wordpunct_tokenize(text.lower())
    return [stemmer.stem(i) for i in text]

In [9]:
lem = WordNetLemmatizer()

def lemming(text):
    text = wordpunct_tokenize(text.lower())
    return [lem.lemmatize(i) for i in text]

In [10]:
print(model('t', stemming, training_set))

[0.97199999999999998, 0.017000000000000001]


In [11]:
print(model('t', lemming, training_set))

[0.97399999999999998, 0.016]


Стемминг и лемминг дают примерно одинаковый результат результат

3) удаление стоп-слов, а также пороги минимальной и максимальной document frequency

In [12]:
print(model('sw', 0, training_set))

[0.95099999999999996, 0.021000000000000001]


In [13]:
print(model('min_df', 3, training_set))

[0.95699999999999996, 0.016]


In [14]:
print(model('max_df', 0.3, training_set))

[0.95799999999999996, 0.021000000000000001]


все это плохо сказывается на результате (хуже, чем было)

4) векторизация документов (CountVectorizer vs. TfIdfVectorizer)

In [15]:
print(model('vectorizer', 0, training_set))

[0.95999999999999996, 0.021999999999999999]


тоже не так хорошо

# 2 задание

In [29]:
bow = CountVectorizer(tokenizer=wordpunct_tokenize)
bow.fit_transform(training_set['message'])
bowed_messages = bow.transform(training_set['message'])

clf = DecisionTreeClassifier()
clf.fit(bowed_messages, training_set['label'])
pred = cross_val_predict(clf, bowed_messages, training_set['label'], cv=10)
print(classification_report(training_set['label'], pred, target_names=['Ham', 'Spam']))

             precision    recall  f1-score   support

        Ham       0.94      0.94      0.94       700
       Spam       0.94      0.94      0.94       700

avg / total       0.94      0.94      0.94      1400



In [30]:
rfcl = RandomForestClassifier()
rfcl.fit(bowed_messages, training_set['label'])
pred = cross_val_predict(rfcl, bowed_messages, training_set['label'], cv=10)
print(classification_report(training_set['label'], pred, target_names=['Ham', 'Spam']))

             precision    recall  f1-score   support

        Ham       0.92      0.99      0.95       700
       Spam       0.99      0.91      0.95       700

avg / total       0.96      0.95      0.95      1400



Деревья и random forest хуже, чем Байес. А random forest лучше, чем деревья