## Практическое занятие 3. Наивный байесовский классификатор
<br><br>

<br>

In [1]:
import re, string

import pymorphy2
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, f1_score

import warnings
warnings.filterwarnings('ignore')

##### Чтение и загрузка

In [14]:
# Load the reviews from CSV, then rename the 'label' column to 'target'.
df_init = pd.read_csv("./movie.csv")
df_init = df_init.rename(columns={'label': 'target'})

##### Предобработка

In [31]:
#preprocess
def preprocess(text):
    """Normalize a raw text string for vectorization.

    Steps, in order: lowercase, replace punctuation with spaces, delete
    digit runs, collapse whitespace runs to a single space, and trim the
    ends.

    Args:
        text: The input string.

    Returns:
        The cleaned string; words are separated by exactly one space.
    """
    # lowercase
    text = text.lower()

    # Replace every character that is neither a word character nor
    # whitespace. "_" counts as a word character for \w, so it is listed
    # explicitly to keep treating it as punctuation.
    text = re.sub(r'[^\w\s]|_', ' ', text)

    # Delete digit runs (they are removed, not replaced with a space).
    text = re.sub(r'\d+', '', text)

    # Collapse whitespace last: removing a standalone number leaves two
    # adjacent spaces behind, which must be merged here.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
 
def lematize(text):
    """Replace every token of *text* with its normal form.

    The text is split with ``word_tokenize``; each token is mapped to the
    ``normal_form`` of the first parse returned by the analyzer, and the
    results are joined back with single spaces.

    Args:
        text: The input string.

    Returns:
        A string of space-separated normal forms.
    """
    # Reuse one analyzer across calls instead of building a new one each
    # time; this function is meant to be applied row by row.
    # NOTE(review): lang='uk' selects Ukrainian dictionaries, while the
    # stop-word step in this file uses English - confirm the intended language.
    morph = getattr(lematize, "_morph", None)
    if morph is None:
        morph = pymorphy2.MorphAnalyzer(lang='uk')
        lematize._morph = morph

    lemmas = [morph.parse(word)[0].normal_form for word in word_tokenize(text)]

    return ' '.join(lemmas)

def stopword(text):
    """Drop English stop words from *text*.

    The text is split with ``word_tokenize``; tokens found in NLTK's
    English stop-word list are discarded and the rest are joined back
    with single spaces.

    Args:
        text: The input string.

    Returns:
        A string of the remaining tokens separated by single spaces.
    """
    # Load the stop-word list once and keep it as a frozenset: the list
    # never changes between calls, and set membership avoids scanning the
    # whole list for every token.
    stop_set = getattr(stopword, "_stop_set", None)
    if stop_set is None:
        stop_set = frozenset(stopwords.words('english'))
        stopword._stop_set = stop_set

    kept = [word for word in word_tokenize(text) if word not in stop_set]

    return ' '.join(kept)

def process_data(data, *, lemmatize=False):
    """Run the text-cleaning pipeline over the 'text' column.

    Args:
        data: DataFrame with a 'text' column of strings.
        lemmatize: When true, also apply ``lematize`` between cleaning and
            stop-word removal. Off by default.

    Returns:
        A new DataFrame with the processed 'text' column; the input frame
        is left untouched.
    """
    # Work on a copy so the caller's frame keeps the raw text and the
    # pipeline can be re-run on it safely.
    data = data.copy()

    data['text'] = data['text'].apply(preprocess)
    if lemmatize:
        data['text'] = data['text'].apply(lematize)
    data['text'] = data['text'].apply(stopword)

    return data

# Run the text-cleaning pipeline on the loaded data frame.
df = process_data(df_init)

##### Разделение выборки на обучающую и тестовую с использованием стратификации

In [48]:
# Hold out 25% of the rows for testing; stratify on the target so both
# parts keep the same class proportions.
x, y = df['text'], df['target']

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [49]:
# # Count Vectorizer - alternative feature extraction, left disabled
# # (the TF-IDF cell below is the one actually used).
# # NOTE: get_feature_names() was removed in scikit-learn 1.2; switch to
# # get_feature_names_out() before re-enabling this cell on a newer release.
# vectorizer = CountVectorizer(max_features = 2000)

# x_train = vectorizer.fit_transform(x_train)
# x_test = vectorizer.transform(x_test)

# x_train = pd.DataFrame.sparse.from_spmatrix(x_train, columns = vectorizer.get_feature_names())
# x_test  = pd.DataFrame.sparse.from_spmatrix(x_test, columns = vectorizer.get_feature_names())

In [50]:
# TF-IDF vectorizer - performed better than Count Vectorizer
vectorizer = TfidfVectorizer(use_idf=True, max_features=2000)

# Fit the vocabulary and IDF weights on the training texts only, then
# reuse them to transform the test texts.
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its
# replacement and fall back to the old method only on releases that
# predate get_feature_names_out().
_get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
feature_names = _get_names()

# Wrap the sparse matrices in sparse DataFrames with named columns.
x_train = pd.DataFrame.sparse.from_spmatrix(x_train, columns=feature_names)
x_test = pd.DataFrame.sparse.from_spmatrix(x_test, columns=feature_names)

##### Обучение модели

In [54]:
#fit train
# Train a multinomial Naive Bayes classifier with default parameters on the
# vectorized training features.
clf = MultinomialNB()
clf.fit(x_train,y_train)

MultinomialNB()

##### Метрики

In [55]:
# Accuracy, estimated with shuffled 5-fold stratified cross-validation on
# the training split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_validate(
    clf,
    x_train,
    y_train,
    scoring='accuracy',
    cv=cv,
    return_train_score=True,
)

# Average the per-fold scores: the train folds first, then the held-out folds.
score_train = scores['train_score'].mean()
score_test = scores['test_score'].mean()

for fold_average in (score_train, score_test):
    display(fold_average)

0.8579749999999999

0.8501

In [9]:
# F1-score under the same shuffled 5-fold stratified cross-validation.
# NOTE(review): the 'f1' scorer presumes a binary target - confirm for this dataset.
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

scores = cross_validate(
    clf, x_train, y_train,
    cv=cv, scoring='f1', return_train_score=True,
)

# Mean over folds for the train part and for the held-out part.
score_train, score_test = (
    scores[key].mean() for key in ('train_score', 'test_score')
)

display(score_train)
display(score_test)

0.8596716792015264

0.8520866313886521