# Phase 3.1 - Model training: Naive Bayes

In [None]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
import pickle
import os
import ast
from sklearn.feature_extraction import DictVectorizer

## 5-star classification

In [None]:
def train_native_bayes_on_bow_5_stars(train_texts_path: str, train_labels_path: str, model_path: str) -> None:
    """Train a multinomial Naive Bayes model on bag-of-words rows and pickle it.

    The labels are used exactly as they are stored in the labels file.

    Args:
        train_texts_path: CSV file whose first column is the row index and
            whose next column holds one Python dict literal per row
            (parsed with ``ast.literal_eval``; presumably token -> count).
        train_labels_path: CSV file whose first column is the row index and
            whose remaining column holds the label of each row.
        model_path: Destination of the pickle file. The pickle contains a
            dict with the fitted ``'vectorizer'`` and ``'model'``. Missing
            parent directories are created.

    Returns:
        None. The result is written to ``model_path``.
    """
    # load data; the first CSV column is used as the row index
    X_train_raw = pd.read_csv(train_texts_path, index_col=0)
    # squeeze only the column axis: a bare squeeze() would turn a
    # single-row labels file into a scalar and break the alignment below
    y_train = pd.read_csv(train_labels_path, index_col=0).squeeze("columns")

    # align indexes: keep only the rows present in both files
    X_train_raw, y_train = X_train_raw.align(y_train, join='inner', axis=0)

    # convert strings to dictionaries
    X_train_dicts = X_train_raw.iloc[:, 0].apply(ast.literal_eval)

    # convert to matrix
    vectorizer = DictVectorizer(sparse=True)
    X_train = vectorizer.fit_transform(X_train_dicts)

    # train Naive Bayes
    model = MultinomialNB()
    model.fit(X_train, y_train)

    # pack model and vectorizer
    model_package = {
        'vectorizer': vectorizer,
        'model': model
    }

    # make sure the output directory exists so the training run is not
    # lost to a FileNotFoundError at the very last step
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # save to one file
    with open(model_path, 'wb') as file:
        pickle.dump(model_package, file)

In [None]:
# Train and save the 5-star model for the 70/30 version of the data.

train_native_bayes_on_bow_5_stars(
    train_texts_path='../data/70_30/train_texts_bow.csv',
    train_labels_path='../data/70_30/train_labels.csv',
    model_path='../models/BoW_70_30.pkl',
)

In [None]:
# Train and save the 5-star model for the 80/20 version of the data.

train_native_bayes_on_bow_5_stars(
    train_texts_path='../data/80_20/train_texts_bow.csv',
    train_labels_path='../data/80_20/train_labels.csv',
    model_path='../models/BoW_80_20.pkl',
)

## Positive/Neutral/Negative classification

In [29]:
def train_native_bayes_on_bow_pnn(train_texts_path: str, train_labels_path: str, model_path: str) -> None:
    """Train a Positive/Neutral/Negative Naive Bayes model on bag-of-words rows.

    Star ratings from the labels file are mapped to sentiment classes before
    training: 4 and 5 become ``'Positive'``, 3 becomes ``'Neutral'`` and
    every other rating becomes ``'Negative'``.

    Args:
        train_texts_path: CSV file whose first column is the row index and
            whose next column holds one Python dict literal per row
            (parsed with ``ast.literal_eval``; presumably token -> count).
        train_labels_path: CSV file whose first column is the row index and
            whose remaining column holds the star rating of each row.
        model_path: Destination of the pickle file. The pickle contains a
            dict with the fitted ``'vectorizer'`` and ``'model'``. Missing
            parent directories are created.

    Returns:
        None. The result is written to ``model_path``.

    Raises:
        ValueError: If a training row has a missing (NaN) star rating.
    """
    # load data; the first CSV column is used as the row index
    X_train_raw = pd.read_csv(train_texts_path, index_col=0)
    # squeeze only the column axis: a bare squeeze() would turn a
    # single-row labels file into a scalar and break the steps below
    y_train = pd.read_csv(train_labels_path, index_col=0).squeeze("columns")

    # align indexes: keep only the rows present in both files
    X_train_raw, y_train = X_train_raw.align(y_train, join='inner', axis=0)

    # a missing rating would otherwise fall into the catch-all branch of
    # map_sentiment and be trained silently as 'Negative'
    missing_count = int(y_train.isna().sum())
    if missing_count:
        raise ValueError(
            f"{missing_count} training label(s) are missing in "
            f"{train_labels_path!r}; cannot map them to a sentiment"
        )

    # map stars to sentiments
    def map_sentiment(star_rating):
        if star_rating in [4, 5]:
            return 'Positive'
        elif star_rating == 3:
            return 'Neutral'
        else:
            return 'Negative'

    y_train = y_train.map(map_sentiment)

    # convert strings to dictionaries
    X_train_dicts = X_train_raw.iloc[:, 0].apply(ast.literal_eval)

    # convert to matrix
    vectorizer = DictVectorizer(sparse=True)
    X_train = vectorizer.fit_transform(X_train_dicts)

    # train Naive Bayes
    model = MultinomialNB()
    model.fit(X_train, y_train)

    # pack model and vectorizer
    model_package = {
        'vectorizer': vectorizer,
        'model': model
    }

    # make sure the output directory exists so the training run is not
    # lost to a FileNotFoundError at the very last step
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # save to one file
    with open(model_path, 'wb') as file:
        pickle.dump(model_package, file)

In [30]:
# Train and save the Positive/Neutral/Negative model for the 70/30 version of the data.

train_native_bayes_on_bow_pnn(
    train_texts_path='../data/70_30/train_texts_bow.csv',
    train_labels_path='../data/70_30/train_labels.csv',
    model_path='../models/BoW_70_30_PNN.pkl',
)

In [None]:
# Train and save the Positive/Neutral/Negative model for the 80/20 version of the data.

train_native_bayes_on_bow_pnn(
    train_texts_path='../data/80_20/train_texts_bow.csv',
    train_labels_path='../data/80_20/train_labels.csv',
    model_path='../models/BoW_80_20_PNN.pkl',
)