Note to devs :")
* link to overleaf document: https://www.overleaf.com/7882162453ngtnygcgqysw
* take a look at TFIDF (term frequency inverse document frequency), may be useful if there is a distinction between good and bad reviews
* bag of words (performance of this is usually lower) or write it in the paper bc it is lower
* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
* https://github.com/shekhargulati/sentiment-analysis-python/tree/master/opinion-lexicon-English
* N-gram list: http://phrasesinenglish.org/explorengrams.html#filterdiv (we should cite it, downloaded top 100k)
* https://towardsdatascience.com/stemming-lemmatization-what-ba782b7c0bd8 (being tested rn)
* https://docs.ray.io/en/latest/tune/index.html


# Task 1:  Acquire and preprocess the data
See `pre_process.ipynb` for majority of the work. It is used to create the csv where we can load here and train

For imdb dataset:
* negative review will be mapped to 0
* positive review will be mapped to 1


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import itertools
from random import randrange
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.utils import shuffle
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from datetime import datetime
from zoneinfo import ZoneInfo

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize


from statistics import mean
import logging

logging.basicConfig(level=logging.INFO)

# dark theme compability
plt.rcParams.update({"figure.facecolor": (1.0, 1.0, 1.0, 1)})


def evaluate_acc(test, pred):
    return np.sum(pred == test) / test.shape[0]

def print_acc_err(res):
    acc = res[0]
    err = res[1]
    logging.info("ACC: avg: {}, {}".format(mean(acc), acc))
    logging.info("ERR: avg: {}, {}".format(mean(err), err))

In [None]:
class BatchTraining:
    """
    Batch training allows user to save memory by recycling the same variables.
    Testing NB is memory intensive due ot the sparse matrix
    """

    def __init__(self, X_test: np.array, y_test: np.array, vectorizer, batch_size=1000):
        """
        :param X_test: dataset to test
        :param y_test: true labels
        :param vectorizer: vectorizer (SKL TFIDF or CounvtVect)
        :param batch_size: size of training every time
        """
        self.X = X_test
        self.y = y_test
        self.batch_size = batch_size
        self.vectorizer = vectorizer

    def __len__(self):
        """
        :return: number of batches to train
        """
        return (np.ceil(len(self.X) / float(self.batch_size))).astype(int)

    def __iter__(self):
        """
        Example:
        >>> bt = BatchTraining(X,y,vec)
        ... for X, y in bt:
        :return: (X_test_batch, y_test_batch)
        """
        for idx in range(len(self)):
            batch_x = self.vectorizer.transform(
                self.X[idx * self.batch_size : (idx + 1) * self.batch_size]
            ).toarray()
            batch_y = self.y[idx * self.batch_size : (idx + 1) * self.batch_size]
            yield batch_x, batch_y

# Task 2: Implement Naive Bayes and k-fold cross validation

In [None]:

class BernoulliBayes:
    _alpha = 1.
    _num_classes = 0
    _fit = None

    def __init__(self, alpha=1):
        self._alpha = alpha

    def fit(self, train_x, train_y):

        self._num_classes = np.amax(train_y) + 1

        # intialization of list containing count of occurrences of each class
        class_count =  self._num_classes*[0]

        #count occurences of each class
        for i in train_y:
            class_count[i] = 1 + class_count[i]

        #initialization of matrix
        fit = np.zeros((self._num_classes, train_x.shape[1] + 1))


        # fills matrix with # of feature occurrences per class then divides by # of class occurrences
        for i in range(self._num_classes):
            for n, element in enumerate(train_y):
                if element == i:
                    fit[i, :-1] = train_x[n] + fit[i, :-1]
            likelihood = (fit[i, :-1] + self._alpha)/(float(class_count[i]) + 2. * self._alpha)
            fit[i, :-1] = likelihood
            prior = class_count[i]/train_x.shape[0]
            fit[i, -1] = prior

        self._fit = fit

    def predict(self,val_x, val_y):

        res = np.zeros((self._num_classes, val_x.shape[0]), dtype=np.float32)

        # adding class prior probability
        for C in range(self._num_classes):
            log_neg = 1 - self._fit[C, -1]
            prior = self._fit[C, -1]

            res[C] += np.log(prior/log_neg)

        likelihood = self._fit[:, :-1]
        res += np.log(likelihood) @ val_x.T
        res += (np.log(1 - likelihood).sum(axis=1).reshape((-1, 1))) - (np.log(1 - likelihood) @ val_x.T)


        return res.T

        # print(res.T)
        # print("what is this", self._fit[:, :-1])

        # predictions = []
        # for example in res.T:
        #     predictions.append(np.argmax(example))

        # return predictions
        # print("predictions", predictions)

        # print("accuracy: " + str(np.sum(predictions == val_y)/len(predictions)))


In [None]:
class MultiNomialBayes:
    _alpha = 1.
    _num_classes = 0
    _fit = None

    def __init__(self, alpha=1):
        self._alpha = alpha

    def fit(self, train_x, train_y):

        #_num_classes is C in TA's code
        self._num_classes = np.amax(train_y) + 1


        # generates list containing a count of each class occurrence
        #occurences is Nc in TA's code
        class_count = self._num_classes*[0] 

        for i in train_y:
            class_count[i] = 1 + class_count[i]

        fit = np.zeros((self._num_classes, train_x.shape[1]+1))

        # print(class_count)

        # fills matrix with # of feature occurrences per class then divides by # of class occurrences
        for i in range(self._num_classes):
            for n, element in enumerate(train_y):
                if element == i:
                    fit[i, :-1] = train_x[n] + fit[i, :-1]

            #filling likelihoods for each entry
            likelihood = ((fit[i, :-1]) + self._alpha)/(float(class_count[i]) + train_x.shape[1]*self._alpha)
            fit[i, :-1] = likelihood
            #inserting prior in the last column of the array
            prior = class_count[i]/train_x.shape[0]
            fit[i, -1] = prior

        
        self._fit = fit


    def predict(self, val_x, val_y):
        #initializing matrix D*C
        res = np.zeros((self._num_classes, val_x.shape[0]), dtype=np.float32)


        for C in range(self._num_classes):
            prior = self._fit[C, -1]
            # prior_neg = 1 - prior
            prior = np.log(prior)
            res[C] += prior
        likelihood = self._fit[:, :-1]
        likelihood = np.log(likelihood) @ val_x.T
        res += likelihood 


        return res.T

        # predictions = []
        # for example in res.T:
        #     predictions.append(np.argmax(example))

        # print("predictions", predictions)

        # print("accuracy: " + str(np.sum(predictions == val_y)/len(predictions)))


In [3]:
class CrossVal:
    def __init__(
        self,
        X: pd.Series,
        y: pd.Series,
        n_fold=5,
        loss_fnc=lambda y, yh: np.mean((y - yh) ** 2),
    ):
        self.X = X.rename("X")
        self.y = y.rename("y")
        self.n_fold = n_fold
        self.loss_fnc = loss_fnc

    def __len__(self):
        return (np.ceil(self.X.shape[0] / float(self.n_fold))).astype(int)

    def __cross_validation_split(self):
        for idx in range(self.n_fold):
            s = idx * len(self)
            e = (idx + 1) * len(self)
            logging.info(
                "{} Starting CV {}/{} Test_Set[{}:{}]".format(
                    datetime.now(tz=ZoneInfo(key="America/Toronto")).strftime(
                        "%Y-%m-%d %H:%M:%S"
                    ),
                    idx,
                    self.n_fold,
                    s,
                    e,
                )
            )

            #  recall that drop does not affect the original dataframe unless you put inplate=True
            x_train = self.X.drop(self.X.index[s:e])
            y_train = self.y.drop(self.y.index[s:e])

            x_test = self.X[s:e]
            y_test = self.y[s:e]
            yield x_train.to_numpy(), x_test.to_numpy(), y_train.to_numpy(), y_test.to_numpy()

    def kfoldCV_custom_size(self, model, vectorizer, train_size):
        """
        May not be able to perform the entire dataset CV.
        But it will perform K times with a random state
        There might be repeated datapoints in a train/test set.
        On average it should produce the same result
        """

        if not 0 < train_size < 1:
            raise ValueError("Train size needs to be within ]0,1[")

        combined = pd.concat([self.X, self.y], axis=1)
        kfold_acc = []
        kfold_err = []
        for fold in range(self.n_fold):
            logging.info(
                "{} Starting CV {}/{} Train={}, Test={}".format(
                    datetime.now(tz=ZoneInfo(key="America/Toronto")).strftime(
                        "%Y-%m-%d %H:%M:%S"
                    ),
                    fold,
                    self.n_fold,
                    train_size,
                    (1 - train_size),
                )
            )

            test_set = combined.sample(frac=(1 - train_size))
            train_set = combined[~combined.isin(test_set)].dropna()

            x_train = train_set["X"].to_numpy()
            x_test = test_set["X"].to_numpy()

            y_train = train_set["y"].to_numpy()
            y_test = test_set["y"].to_numpy()

            model.fit(vectorizer.fit_transform(x_train), y_train)
            y_predict = model.predict(vectorizer.transform(x_test))
            acc = evaluate_acc(y_test, y_predict)
            err = self.loss_fnc(y_test, y_predict)
            kfold_acc.append(acc)
            kfold_err.append(err)
        return kfold_acc, kfold_err

    def kfoldCV(self, model, vectorizer, **kwargs):
        """
        model: NB, LR. your model needs to have fit and predict as functions at least
        vectorizer: CV, TFIDF
        """
        kfold_acc = []
        kfold_err = []

        for x_train, x_test, y_train, y_test in self.__cross_validation_split():
            # todo: might need to use batch trainer
            model.fit(vectorizer.fit_transform(x_train), y_train)
            y_predict = model.predict(vectorizer.transform(x_test))
            acc = evaluate_acc(y_test, y_predict)
            err = self.loss_fnc(y_test, y_predict)
            kfold_acc.append(acc)
            kfold_err.append(err)
        return kfold_acc, kfold_err

    def repeat(self, **kwargs):
        """
        *args, **kwargs

        pseudo: sklearn gridsearchcv

        self === this in java
        """
        # done in project2-LR
        pass


# Task 3: Run experiments

## Load datasets

In [None]:
imdb_df = pd.read_csv("dataset/imdb_row_array_bigram.csv")

# keep random state so we can have a reproducable result
imdb_df = shuffle(imdb_df, random_state=1)

imdb_df["sentence"] = imdb_df["sentence"].apply(lambda x: " ".join(eval(x)))
imdb_df.loc[imdb_df["review_type"] == "pos", "review_type"] = 1
imdb_df.loc[imdb_df["review_type"] == "neg", "review_type"] = 0


# int32 is more memory efficient and enough for our needs
imdb_df = imdb_df.astype(
    {"review_id": "int32", "review_type": "int32", "review_number": "int32"}
)

imdb_df_X = imdb_df["sentence"]
imdb_df_y = imdb_df["review_type"]

imdb_df_CV = CrossVal(imdb_df_X, imdb_df_y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(imdb_df_X, imdb_df_y, test_size=0.33, random_state=42)



bb = BernoulliBayes()
cv =  TfidfVectorizer()
bb.fit(cv.fit_transform(X_train),y_train)
y_pred = bb.predict(cv.transform(X_test),y_test)



In [None]:
res = imdb_df_CV.kfoldCV(BernoulliBayes(),TfidfVectorizer())
print_acc_err(res)

INFO:root:2021-02-27 09:44:47 Starting CV 0/5 Test_Set[0:10000]
  return np.sum(pred == test) / test.shape[0]


ValueError: operands could not be broadcast together with shapes (10000,) (10000,2) 

In [6]:
twenty_news_df = pd.read_csv("dataset/twenty_news_row_array_bigram.csv")

twenty_news_df = shuffle(twenty_news_df, random_state=1)
twenty_news_df["sentence"] = twenty_news_df["sentence"].apply(
    lambda x: " ".join(eval(x))
)

twenty_news_df_X = twenty_news_df["sentence"]
twenty_news_df_y = twenty_news_df["target"]

twenty_CV = CrossVal(twenty_news_df_X, twenty_news_df_y)

# print(len(twenty_news_df_X))
# print(len(twenty_news_df_y))

## Naive Bayes

### IMDB

#### Sk-learn metrics

In [None]:
#  __name__ get name of the class

In [None]:
res = imdb_df.kfoldCV(BernoulliBayes(), TfidfVectorizer())
print_acc_err(res)

AttributeError: 'DataFrame' object has no attribute 'kfoldCV'

In [None]:
res = imdb_df_CV.kfoldCV(MultiNomialBayes(), TfidfVectorizer())
print_acc_err(res)

INFO:root:2021-02-27 09:28:55 Starting CV 0/5 Test_Set[0:10000]
[19939, 20061]


TypeError: predict() missing 1 required positional argument: 'validationLabels'

In [None]:
for i in [0.2, 0.4, 0.6, 0.8]:
    res = imdb_df_CV.kfoldCV_custom_size(MultinomialNB(), CountVectorizer(), i)
    print_acc_err(res)

INFO:root:2021-02-22 04:53:37 Starting CV 0/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 04:53:48 Starting CV 1/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 04:53:59 Starting CV 2/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 04:54:10 Starting CV 3/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 04:54:21 Starting CV 4/5 Train=0.2, Test=0.8
INFO:root:ACC: avg: 0.858985, [0.857125, 0.861, 0.858425, 0.856925, 0.86145]
INFO:root:ERR: avg: 0.141015, [0.142875, 0.139, 0.141575, 0.143075, 0.13855]
INFO:root:2021-02-22 04:54:33 Starting CV 0/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 04:54:43 Starting CV 1/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 04:54:54 Starting CV 2/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 04:55:05 Starting CV 3/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 04:55:16 Starting CV 4/5 Train=0.4, Test=0.6


KeyboardInterrupt: 

In [None]:
res = imdb_df_CV.kfoldCV(MultinomialNB(), TfidfVectorizer())
print_acc_err(res)

INFO:root:2021-02-22 00:17:56 Starting CV 0/5 Test_Set[0:10000]
INFO:root:2021-02-22 00:18:07 Starting CV 1/5 Test_Set[10000:20000]
INFO:root:2021-02-22 00:18:19 Starting CV 2/5 Test_Set[20000:30000]
INFO:root:2021-02-22 00:18:30 Starting CV 3/5 Test_Set[30000:40000]
INFO:root:2021-02-22 00:18:42 Starting CV 4/5 Test_Set[40000:50000]
INFO:root:ACC: avg: 0.8781599999999999, [0.8753, 0.8783, 0.8801, 0.8792, 0.8779]
INFO:root:ERR: avg: 0.12184, [0.1247, 0.1217, 0.1199, 0.1208, 0.1221]


In [None]:
for i in [0.2, 0.4, 0.6, 0.8]:
    res = imdb_df_CV.kfoldCV_custom_size(MultinomialNB(), TfidfVectorizer(), i)
    print_acc_err(res)

INFO:root:2021-02-22 00:18:53 Starting CV 0/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:19:05 Starting CV 1/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:19:17 Starting CV 2/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:19:29 Starting CV 3/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:19:41 Starting CV 4/5 Train=0.2, Test=0.8
INFO:root:ACC: avg: 0.86913, [0.8675, 0.865075, 0.8729, 0.8704, 0.869775]
INFO:root:ERR: avg: 0.13087, [0.1325, 0.134925, 0.1271, 0.1296, 0.130225]
INFO:root:2021-02-22 00:19:53 Starting CV 0/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:20:04 Starting CV 1/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:20:16 Starting CV 2/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:20:28 Starting CV 3/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:20:39 Starting CV 4/5 Train=0.4, Test=0.6
INFO:root:ACC: avg: 0.8738933333333333, [0.8753333333333333, 0.8750666666666667, 0.8735333333333334, 0.8688666666666667, 0.8766666666666667]
INFO:root:ERR: avg: 0.12610666666666667, [0.1246666

#### Self implemented

In [None]:
# todo

### Twenty News Group

#### Sk-learn metrics

In [None]:
res = twenty_CV.kfoldCV(MultinomialNB(), CountVectorizer())
print_acc_err(res)

INFO:root:2021-02-22 00:22:47 Starting CV 0/5 Test_Set[0:3770]
INFO:root:2021-02-22 00:22:51 Starting CV 1/5 Test_Set[3770:7540]
INFO:root:2021-02-22 00:22:55 Starting CV 2/5 Test_Set[7540:11310]
INFO:root:2021-02-22 00:22:59 Starting CV 3/5 Test_Set[11310:15080]
INFO:root:2021-02-22 00:23:03 Starting CV 4/5 Test_Set[15080:18850]
INFO:root:ACC: avg: 0.6563745701804925, [0.6445623342175066, 0.6604774535809018, 0.6517241379310345, 0.659946949602122, 0.6651619755708975]
INFO:root:ERR: avg: 18.061050499302006, [18.049071618037136, 18.614854111405837, 18.572944297082227, 17.367904509283818, 17.700477960701008]


In [None]:
for i in [0.2, 0.4, 0.6, 0.8]:
    res = twenty_CV.kfoldCV_custom_size(MultinomialNB(), CountVectorizer(), i)
    print_acc_err(res)

INFO:root:2021-02-22 00:23:07 Starting CV 0/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:23:10 Starting CV 1/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:23:14 Starting CV 2/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:23:17 Starting CV 3/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:23:21 Starting CV 4/5 Train=0.2, Test=0.8
INFO:root:ACC: avg: 0.5381839888571998, [0.5194004112223918, 0.5582675598593885, 0.5383033760031837, 0.5340584997015322, 0.5408900974995026]
INFO:root:ERR: avg: 23.39137759501227, [24.579624593752072, 22.45532930954434, 24.0358824699874, 23.033428400875504, 22.852623200902038]
INFO:root:2021-02-22 00:23:24 Starting CV 0/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:23:28 Starting CV 1/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:23:32 Starting CV 2/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:23:36 Starting CV 3/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:23:40 Starting CV 4/5 Train=0.4, Test=0.6
INFO:root:ACC: avg: 0.6083480721613017, [0.626989741775734, 0.

In [None]:
res = twenty_CV.kfoldCV(MultinomialNB(), TfidfVectorizer())
print_acc_err(res)

INFO:root:2021-02-22 00:24:24 Starting CV 0/5 Test_Set[0:3770]
INFO:root:2021-02-22 00:24:29 Starting CV 1/5 Test_Set[3770:7540]
INFO:root:2021-02-22 00:24:33 Starting CV 2/5 Test_Set[7540:11310]
INFO:root:2021-02-22 00:24:37 Starting CV 3/5 Test_Set[11310:15080]
INFO:root:2021-02-22 00:24:42 Starting CV 4/5 Test_Set[15080:18850]
INFO:root:ACC: avg: 0.6809959838904845, [0.6859416445623342, 0.6710875331564987, 0.6625994694960212, 0.6917771883289124, 0.6935740839086564]
INFO:root:ERR: avg: 16.601933705315325, [14.774801061007958, 19.115649867374007, 18.912997347480108, 15.093633952254642, 15.112586298459904]


In [None]:
for i in [0.2, 0.4, 0.6, 0.8]:
    res = twenty_CV.kfoldCV_custom_size(MultinomialNB(), TfidfVectorizer(), i)
    print_acc_err(res)

INFO:root:2021-02-22 00:24:46 Starting CV 0/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:24:50 Starting CV 1/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:24:53 Starting CV 2/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:24:57 Starting CV 3/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:25:01 Starting CV 4/5 Train=0.2, Test=0.8
INFO:root:ACC: avg: 0.5966969556277774, [0.6041652848709955, 0.6003846919148371, 0.5824102938250315, 0.5936857465012934, 0.6028387610267295]
INFO:root:ERR: avg: 19.7643032433508, [21.16495324003449, 18.58539497247463, 20.756980831730452, 18.70703720899383, 19.607149963520595]
INFO:root:2021-02-22 00:25:04 Starting CV 0/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:25:08 Starting CV 1/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:25:12 Starting CV 2/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:25:16 Starting CV 3/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:25:20 Starting CV 4/5 Train=0.4, Test=0.6
INFO:root:ACC: avg: 0.6386982667138309, [0.6391050583657587, 0.

#### Self implemented

In [None]:
# todo

## Logistic Regiossion with Sk-Learn

TODOs:
* test with different parameters
* small **tol** takes forever to converge
* docs can be found [here](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
* https://stackoverflow.com/questions/20894671/speeding-up-sklearn-logistic-regression

### IMDB

In [None]:
# see docs for args for LR

# tol @ 1e-4 doesnt converge well
res = imdb_df_CV.kfoldCV(
    LogisticRegression(solver="newton-cg", max_iter=1000, n_jobs=4, tol=0.01),
    CountVectorizer(),
)
print_acc_err(res)

INFO:root:2021-02-22 00:26:04 Starting CV 0/5 Test_Set[0:10000]
INFO:root:2021-02-22 00:26:38 Starting CV 1/5 Test_Set[10000:20000]
INFO:root:2021-02-22 00:27:12 Starting CV 2/5 Test_Set[20000:30000]
INFO:root:2021-02-22 00:27:45 Starting CV 3/5 Test_Set[30000:40000]
INFO:root:2021-02-22 00:28:14 Starting CV 4/5 Test_Set[40000:50000]
INFO:root:ACC: avg: 0.9011, [0.9001, 0.9048, 0.9023, 0.8956, 0.9027]
INFO:root:ERR: avg: 0.0989, [0.0999, 0.0952, 0.0977, 0.1044, 0.0973]


In [None]:
for i in [0.2, 0.4, 0.6, 0.8]:
    res = imdb_df_CV.kfoldCV_custom_size(
        LogisticRegression(solver="newton-cg", max_iter=1000, n_jobs=4, tol=0.01),
        CountVectorizer(), i,
    )
    print_acc_err(res)

INFO:root:2021-02-22 00:28:46 Starting CV 0/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:28:59 Starting CV 1/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:29:13 Starting CV 2/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:29:26 Starting CV 3/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:29:40 Starting CV 4/5 Train=0.2, Test=0.8
INFO:root:ACC: avg: 0.878375, [0.87785, 0.877775, 0.8798, 0.876775, 0.879675]
INFO:root:ERR: avg: 0.121625, [0.12215, 0.122225, 0.1202, 0.123225, 0.120325]
INFO:root:2021-02-22 00:29:54 Starting CV 0/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:30:13 Starting CV 1/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:30:34 Starting CV 2/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:30:52 Starting CV 3/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:31:10 Starting CV 4/5 Train=0.4, Test=0.6
INFO:root:ACC: avg: 0.8894733333333333, [0.8911666666666667, 0.8880333333333333, 0.8864, 0.8912333333333333, 0.8905333333333333]
INFO:root:ERR: avg: 0.11052666666666668, [0.10883333333

In [None]:
# at max_itr = 1000, it does not converge
res = imdb_df_CV.kfoldCV(
    LogisticRegression(solver="newton-cg", max_iter=1000, n_jobs=4, tol=0.01),
    TfidfVectorizer(),
)
print_acc_err(res)

INFO:root:2021-02-22 00:36:26 Starting CV 0/5 Test_Set[0:10000]
INFO:root:2021-02-22 00:36:42 Starting CV 1/5 Test_Set[10000:20000]
INFO:root:2021-02-22 00:36:58 Starting CV 2/5 Test_Set[20000:30000]
INFO:root:2021-02-22 00:37:14 Starting CV 3/5 Test_Set[30000:40000]
INFO:root:2021-02-22 00:37:30 Starting CV 4/5 Test_Set[40000:50000]
INFO:root:ACC: avg: 0.90436, [0.8995, 0.9079, 0.9047, 0.9029, 0.9068]
INFO:root:ERR: avg: 0.09564, [0.1005, 0.0921, 0.0953, 0.0971, 0.0932]


In [None]:
for i in [0.2, 0.4, 0.6, 0.8]:
    res = imdb_df_CV.kfoldCV_custom_size(
        LogisticRegression(solver="newton-cg", max_iter=1000, n_jobs=4, tol=0.01),
        TfidfVectorizer(),
        i,
    )
    print_acc_err(res)

INFO:root:2021-02-22 00:37:46 Starting CV 0/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:37:58 Starting CV 1/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:38:10 Starting CV 2/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:38:22 Starting CV 3/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:38:34 Starting CV 4/5 Train=0.2, Test=0.8
INFO:root:ACC: avg: 0.882495, [0.8838, 0.88295, 0.882025, 0.8811, 0.8826]
INFO:root:ERR: avg: 0.117505, [0.1162, 0.11705, 0.117975, 0.1189, 0.1174]
INFO:root:2021-02-22 00:38:46 Starting CV 0/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:39:00 Starting CV 1/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:39:13 Starting CV 2/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:39:26 Starting CV 3/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:39:39 Starting CV 4/5 Train=0.4, Test=0.6
INFO:root:ACC: avg: 0.89484, [0.8954333333333333, 0.8939333333333334, 0.8958333333333334, 0.8944, 0.8946]
INFO:root:ERR: avg: 0.10516, [0.10456666666666667, 0.10606666666666667, 0.104166666666

### Twenty News Group

In [None]:
# tol=1e-4 takes forever to converge soooo don't do it :)
# takes forever to train, uncomment when ready

# res = twenty_CV.kfoldCV(
#     LogisticRegression(solver="newton-cg", n_jobs=4, tol=0.1), CountVectorizer()
# )
# print_acc_err(res)

In [None]:
res = twenty_CV.kfoldCV(
    LogisticRegression(solver="newton-cg", max_iter=1000, n_jobs=4), TfidfVectorizer()
)
print_acc_err(res)

INFO:root:2021-02-22 00:42:32 Starting CV 0/5 Test_Set[0:3770]
INFO:root:2021-02-22 00:43:46 Starting CV 1/5 Test_Set[3770:7540]
INFO:root:2021-02-22 00:44:46 Starting CV 2/5 Test_Set[7540:11310]
INFO:root:2021-02-22 00:46:06 Starting CV 3/5 Test_Set[11310:15080]
INFO:root:2021-02-22 00:47:11 Starting CV 4/5 Test_Set[15080:18850]
INFO:root:ACC: avg: 0.7331556253002222, [0.7381962864721485, 0.7244031830238726, 0.7281167108753316, 0.7291777188328913, 0.7458842272968667]
INFO:root:ERR: avg: 14.847590855497534, [14.191511936339522, 15.988063660477454, 15.856233421750662, 14.430238726790451, 13.77190653212958]


In [None]:
for i in [0.2, 0.4, 0.6, 0.8]:
    res = twenty_CV.kfoldCV_custom_size(
        LogisticRegression(solver="newton-cg", n_jobs=4, tol=0.1), TfidfVectorizer(), i
    )
    print_acc_err(res)

INFO:root:2021-02-22 00:48:11 Starting CV 0/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:48:21 Starting CV 1/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:48:31 Starting CV 2/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:48:40 Starting CV 3/5 Train=0.2, Test=0.8
INFO:root:2021-02-22 00:48:50 Starting CV 4/5 Train=0.2, Test=0.8
INFO:root:ACC: avg: 0.6568946076805731, [0.6535782980699079, 0.6455528288120979, 0.6590170458313989, 0.6580884791404126, 0.6682363865490483]
INFO:root:ERR: avg: 17.288028122305498, [19.29879949592094, 16.615772368508324, 16.646945678848578, 16.59958877760828, 17.279034290641373]
INFO:root:2021-02-22 00:48:58 Starting CV 0/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:49:14 Starting CV 1/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:49:31 Starting CV 2/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:49:47 Starting CV 3/5 Train=0.4, Test=0.6
INFO:root:2021-02-22 00:50:06 Starting CV 4/5 Train=0.4, Test=0.6
INFO:root:ACC: avg: 0.7007251503360453, [0.6995932083480721,

# Code archives 😀

In [None]:
# batches = []

# for X, y in BatchTraining(imdb_test_X, imdb_test_y, vect):
#     y_predicted = nb.predict(X)
#     batches.append(evaluate_acc(y, y_predicted))

# sum(batches) / len(batches)

In [None]:
# imdb_train_X, imdb_test_X, imdb_train_y, imdb_test_y = train_test_split_df(imdb_df)

# vect = TfidfVectorizer(min_df=5)
# imdb_tfidf_vect = vect.fit_transform(imdb_train_X)


# NB_model = NaiveBayes()
# # Cross Validation Phase
# #crossed_dataset_split_x, crossed_dataset_split_y = CrossValidation.cross_validation_split(imdb_tfidf_vect, imdb_train_y)

# err_valid = CrossValidation.kfoldCV(imdb_tfidf_vect, imdb_train_y, NB_model)
# print(err_valid)

In [None]:
# loss = lambda y, yh: np.mean((y - yh) ** 2)


# class CrossValidation:
#     def __init__(self):
#         return

#     # That 'probably' more efficient method
#     def cross_validation_split(dataset_x, n_folds=3):
#         n = len(dataset_x)
#         n_val = n // n_folds
#         inds = np.random.permutation(n)
#         inds = []
#         for f in range(n_folds):
#             tr_inds = []
#             # get the validation indexes
#             val_inds = list(range(f * n_val, (f + 1) * n_val))
#             # get the train indexes
#             if f > 0:
#                 tr_inds = list(range(f * n_val))
#             if f < n_folds - 1:
#                 tr_inds = tr_inds + list(range((f + 1) * n_val, n))
#             # The yield statement suspends function’s execution and sends a value back to the caller
#             # but retains enough state information to enable function to resume where it is left off
#             yield tr_inds, val_inds

#     # Self-implemented
#     def kfoldCV(dataset_x, dataset_y, model, folds=3):
#         err_valid = np.zeros(folds)
#         for f, (tr, val) in enumerate(cross_validate(dataset_x, folds)):
#             model = model.fit(np.array(dataset_x[tr]), np.array(dataset_y[tr]))
#             err_valid[f] = loss(
#                 np.array(dataset_y[val]), model.predict(np.array(dataset_x[val]))
#             )

#     def repeat(self):
#         pass

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f5e36ec1-5982-458d-86cb-de064c0212ca' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>