In [1]:
%matplotlib inline


# Semi-supervised Classification on a Text Dataset

In this example, semi-supervised classifiers are trained on the 20 newsgroups
dataset (which will be automatically downloaded).

You can adjust the number of categories by giving their names to the dataset
loader or setting them to `None` to get all 20 of them.


In [2]:
import os

import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [32]:
# Fetch the training split of 20 newsgroups; categories=None keeps all groups.
data = fetch_20newsgroups(categories=None, subset='train')

# Report the size of the corpus, followed by an empty separator line.
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

11314 documents
20 categories



In [4]:
# Parameters
# Keyword arguments for SGDClassifier (the variable keeps its original
# spelling "sdg" because other cells refer to it by that name).
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' --
# confirm against the installed version.
sdg_params = {
    'alpha': 1e-5,
    'penalty': 'l2',
    'loss': 'log',
}
# Keyword arguments for CountVectorizer.
vectorizer_params = {
    'ngram_range': (1, 2),
    'min_df': 5,
    'max_df': 0.8,
}

In [168]:
# Three pipelines that share the same (currently disabled) text featurisation
# steps and differ only in the final classifier.
# NOTE(review): with the 'vect'/'tfidf' steps commented out, the pipelines
# expect X to already be a numeric feature matrix, not raw documents.

# Supervised Pipeline
pipeline = Pipeline([
    #('vect', CountVectorizer(**vectorizer_params)),
    #('tfidf', TfidfTransformer()),
    #('clf', SGDClassifier(**sdg_params)),
    ('clf', MultinomialNB()),

])
# SelfTraining Pipeline
st_pipeline = Pipeline([
    #('vect', CountVectorizer(**vectorizer_params)),
    #('tfidf', TfidfTransformer()),
    #('clf', SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
    ('clf', SelfTrainingClassifier(MultinomialNB(), verbose=True)),
])
# LabelSpreading Pipeline
ls_pipeline = Pipeline([
    #('vect', CountVectorizer(**vectorizer_params)),
    #('tfidf', TfidfTransformer()),
    # LabelSpreading does not support sparse matrices, so the features are
    # densified first. toarray() returns a plain ndarray, whereas todense()
    # returns the legacy np.matrix type.
    ('todense', FunctionTransformer(lambda x: x.toarray())),
    ('clf', LabelSpreading()),
])

In [195]:
def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training data and print its accuracy on the test data.

    Before fitting, the number of training samples and the number of
    training labels equal to -1 (the "unlabeled" marker) are printed.
    Nothing is returned; ``clf`` is fitted in place.
    """
    n_unlabeled = len([label for label in y_train if label == -1])
    print("Number of training samples:", len(X_train))
    print("Unlabeled samples in training set:", n_unlabeled)

    clf.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, clf.predict(X_test))
    print("Accuracy score on test set: %0.3f" % test_accuracy)

    # Visual separator between consecutive reports.
    print("-" * 10)
    print()

In [110]:
# Documents and their integer class labels from the loaded dataset.
X, y = data.data, data.target
# A fixed random_state makes the train/test partition (and every score
# derived from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [7]:
if __name__ == "__main__":
    # NOTE(review): the printed headings mention SGDClassifier, but the
    # pipelines defined above currently use MultinomialNB -- confirm which
    # one is intended.
    print("Supervised SGDClassifier on 100% of the data:")
    eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)

    # select a mask of 20% of the train dataset
    y_mask = np.random.rand(len(y_train)) < 0.2

    # X_20 and y_20 are the subset of the train dataset indicated by the mask
    X_20, y_20 = map(list, zip(*((x, y)
                     for x, y, m in zip(X_train, y_train, y_mask) if m)))
    print("Supervised SGDClassifier on 20% of the training data:")
    eval_and_print_metrics(pipeline, X_20, y_20, X_test, y_test)

    # set the non-masked subset to be unlabeled -- on a copy, so the true
    # labels in y_train survive and this cell can be re-run safely
    y_train_semi = np.array(y_train, copy=True)
    y_train_semi[~y_mask] = -1
    print("SelfTrainingClassifier on 20% of the training data (rest "
          "is unlabeled):")
    eval_and_print_metrics(st_pipeline, X_train, y_train_semi, X_test, y_test)

    if 'CI' not in os.environ:
        # LabelSpreading takes too long to run in the online documentation
        print("LabelSpreading on 20% of the data (rest is unlabeled):")
        eval_and_print_metrics(ls_pipeline, X_train, y_train_semi,
                               X_test, y_test)

Supervised SGDClassifier on 100% of the data:
Number of training samples: 8485
Unlabeled samples in training set: 0
Micro-averaged F1 score on test set: 0.909
----------

Supervised SGDClassifier on 20% of the training data:
Number of training samples: 1682
Unlabeled samples in training set: 0
Micro-averaged F1 score on test set: 0.792
----------

SelfTrainingClassifier on 20% of the training data (rest is unlabeled):
Number of training samples: 8485
Unlabeled samples in training set: 6803
End of iteration 1, added 2844 new labels.
End of iteration 2, added 692 new labels.
End of iteration 3, added 216 new labels.
End of iteration 4, added 73 new labels.
End of iteration 5, added 38 new labels.
End of iteration 6, added 16 new labels.
End of iteration 7, added 8 new labels.
End of iteration 8, added 5 new labels.
End of iteration 9, added 6 new labels.
End of iteration 10, added 8 new labels.
Micro-averaged F1 score on test set: 0.836
----------

LabelSpreading on 20% of the data (rest

In [147]:
# type(y_test[1])

numpy.ndarray

list

In [230]:
import pandas as pd
import numpy as np

# NOTE(review): user-specific absolute path; this cell only runs on the
# original author's machine. Consider a configurable data directory.
os.chdir("/Users/cornelius/Desktop/SCRIPTS/GitHub/scripts-issue-agendas")

# Load press release data (training and test)
# The first CSV column is used as the index; the remaining columns are the
# feature matrix.
X_train = pd.read_csv("semi-files/X_train.csv", index_col = 0).values
# The label files hold a single column. squeeze(axis=1) turns it into a 1-D
# vector (and raises if there is more than one column) instead of calling
# int() on one-element arrays, which NumPy has deprecated.
y_train = pd.read_csv("semi-files/y_train.csv",
                      index_col = 0).values.squeeze(axis=1).astype(int)

X_test = pd.read_csv("semi-files/X_test.csv", index_col = 0).values
y_test = pd.read_csv("semi-files/y_test.csv",
                     index_col = 0).values.squeeze(axis=1).astype(int)

In [231]:
# Number of training labels that are not the -1 "unlabeled" marker.
int(np.count_nonzero(y_train != -1))

2232

In [232]:
# Number of feature rows whose label is not the -1 "unlabeled" marker.
X_train[y_train != -1].shape[0]

2232

In [273]:
# Supervised baseline: multinomial naive Bayes fitted on the first 1000
# labelled training rows (rows whose label is -1 are filtered out first).
NBmod = MultinomialNB()
NBmod.fit(
    X_train[y_train != -1][:1000],
    y_train[y_train != -1][:1000],
)

In [276]:
# Replace the labels of the labelled rows that NBmod was NOT fitted on with
# NBmod's predictions.
# The original `y_train_new[mask][1001:] = ...` assigned into a temporary
# copy (boolean indexing returns a copy), so nothing was written, and
# `y_train_new = y_train` only aliased the array. Indexing with explicit
# integer positions on a real copy makes the assignment take effect.
labeled_idx = np.flatnonzero(y_train != -1)
# NBmod was fitted on labelled rows [:1000]; the remainder starts at 1000
# (the original slice [1001:] silently skipped one row).
holdout_idx = labeled_idx[1000:]
y_train_new = y_train.copy()
if holdout_idx.size:
    y_train_new[holdout_idx] = NBmod.predict(X_train[holdout_idx])

In [273]:
# Second model, fitted on every row that carries a label in y_train_new.
# Both the row mask and the targets now come from y_train_new; the original
# took the targets from y_train, mixing two label vectors.
NBmod2 = MultinomialNB()
NBmod2.fit(X_train[(y_train_new != -1)], y_train_new[(y_train_new != -1)])

# NOTE(review): this displays the full feature rows of the held-out labelled
# samples -- probably a leftover inspection; consider `.shape` instead.
X_train[(y_train != -1)][1001:]

MultinomialNB()

In [274]:
# Test accuracy of the supervised baseline (arguments in y_true, y_pred order).
accuracy_score(y_test, NBmod.predict(X_test))

0.5374531835205992

In [263]:
import numpy as np
from sklearn import datasets
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import ComplementNB


In [243]:
# Self-training around an SVC; probability=True is set so the base
# estimator exposes class probabilities.
svc = SVC(gamma="auto", probability=True)

self_training_model = SelfTrainingClassifier(svc, verbose=True)

# The fitted estimator is the cell's displayed value.
self_training_model.fit(X_train, y_train)

SelfTrainingClassifier(base_estimator=SVC(gamma='auto', probability=True))

In [245]:
# Test accuracy of the SVC-based self-training model (y_true, y_pred order).
accuracy_score(y_test, self_training_model.predict(X_test))

In [None]:
# Display the test labels for inspection.
y_test

In [264]:
# Self-training around ComplementNB. The names `svc` and `..._gnb` are
# historical: the base estimator here is neither an SVC nor GaussianNB.
svc = ComplementNB()

self_training_model_gnb = SelfTrainingClassifier(svc, verbose=True)

# The fitted estimator is the cell's displayed value.
self_training_model_gnb.fit(X_train, y_train)

End of iteration 1, added 817 new labels.
End of iteration 2, added 116 new labels.
End of iteration 3, added 14 new labels.
End of iteration 4, added 2 new labels.


SelfTrainingClassifier(base_estimator=ComplementNB(), verbose=True)

In [265]:
# Test accuracy of the ComplementNB self-training model (y_true, y_pred order).
accuracy_score(y_test, self_training_model_gnb.predict(X_test))

0.6235955056179775

In [271]:
# Self-training around MultinomialNB (the variable is still called `svc`).
svc = MultinomialNB()

self_training_model_mnb = SelfTrainingClassifier(svc, verbose=True)

# NOTE(review): only the first 1000 labelled rows are passed in (rows with
# label -1 are filtered out), so there is nothing left to self-label --
# confirm this is meant as a supervised comparison.
self_training_model_mnb.fit(
    X_train[y_train != -1][:1000],
    y_train[y_train != -1][:1000],
)






SelfTrainingClassifier(base_estimator=MultinomialNB(), verbose=True)

In [256]:
# Self-training around a logistic-loss SGDClassifier (same settings as
# sdg_params; the variable is still called `svc`).
svc = SGDClassifier(loss='log', penalty='l2', alpha=1e-5)

self_training_model_sdg = SelfTrainingClassifier(svc, verbose=True)

# The fitted estimator is the cell's displayed value.
self_training_model_sdg.fit(X_train, y_train)


End of iteration 1, added 1369 new labels.
End of iteration 2, added 85 new labels.
End of iteration 3, added 17 new labels.
End of iteration 4, added 3 new labels.
End of iteration 5, added 4 new labels.
End of iteration 6, added 2 new labels.
End of iteration 7, added 1 new labels.
End of iteration 8, added 3 new labels.
End of iteration 9, added 3 new labels.
End of iteration 10, added 2 new labels.


SelfTrainingClassifier(base_estimator=SGDClassifier(alpha=1e-05, loss='log'),
                       verbose=True)

In [257]:
# Test accuracy of the SGD-based self-training model (y_true, y_pred order).
accuracy_score(y_test, self_training_model_sdg.predict(X_test))

0.5299625468164794

In [237]:
# Number of training labels that are not the -1 "unlabeled" marker.
int(np.count_nonzero(y_train != -1))

2232

In [None]:
if __name__ == "__main__":
    # X, y = data.data, data.target
    # X_train, X_test, y_train, y_test = train_test_split(X, y)

    # NOTE(review): the printed headings mention SGDClassifier, but the
    # pipelines defined above currently use MultinomialNB -- confirm which
    # one is intended.
    print("Supervised SGDClassifier on 100% of the data:")
    eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)

    # select a mask of 20% of the train dataset
    y_mask = np.random.rand(len(y_train)) < 0.2

    # X_20 and y_20 are the subset of the train dataset indicated by the mask
    X_20, y_20 = map(list, zip(*((x, y)
                     for x, y, m in zip(X_train, y_train, y_mask) if m)))
    print("Supervised SGDClassifier on 20% of the training data:")
    eval_and_print_metrics(pipeline, X_20, y_20, X_test, y_test)

    # set the non-masked subset to be unlabeled -- on a copy, so the labels
    # in y_train survive and this cell can be re-run safely
    y_train_semi = np.array(y_train, copy=True)
    y_train_semi[~y_mask] = -1
    print("SelfTrainingClassifier on 20% of the training data (rest "
          "is unlabeled):")
    eval_and_print_metrics(st_pipeline, X_train, y_train_semi, X_test, y_test)

    if 'CI' not in os.environ:
        # LabelSpreading takes too long to run in the online documentation
        print("LabelSpreading on 20% of the data (rest is unlabeled):")
        eval_and_print_metrics(ls_pipeline, X_train, y_train_semi,
                               X_test, y_test)

In [198]:
# NOTE(review): MultinomialNBSS is not defined or imported in any visible
# cell of this notebook -- confirm where it comes from before re-running.
NBmodSS = MultinomialNBSS()
# Fit on the full training set (labels equal to -1 are included as-is).
NBmodSS.fit(X_train,y_train)

Step 1: jll = -44409836.144086
Step 2: jll = -44592354.743390
Step 3: jll = -44601743.852176
Step 4: jll = -44606733.320179
Step 5: jll = -44611595.432071
Step 6: jll = -44611439.174188
Optimization converged after 6 iterations.


MultinomialNBSS()

In [199]:
# Test accuracy of the MultinomialNBSS model (y_true, y_pred order).
accuracy_score(y_test, NBmodSS.predict(X_test))

In [202]:
# Display the predicted test labels for side-by-side comparison with y_test.
NBmodSS.predict(X_test)

array([ 10,  12,  16,   5,   5,  15,  10,  10,   7,  99,   2,   2,   9,
        15,  15,   6,   7,   7,   1,  10,  99,  15,   5,   7, 191, 191,
         7,  12,   7,  12,   6,   4,  12,   7,   1,  15,   6,  12,   7,
        15,   7,   5,   5,  12,   7,  99,  10, 191,   6,   6,   7,  12,
        12,  12,   7,   5,   5,   4,  15,  12,  10,  15,   1,   4,   6,
        99,  15,   6,  10,   7,   7,  15,   7,   1, 191, 191,   6,   7,
       192,   5,  15, 191,   7,  10,   5,  12,   5, 191,   1,   2,   6,
         5,   4,   5,  12, 191, 191,  15,   4, 191,  15,   7, 191,   5,
       191,   7,   2,  10,   7, 191,   4,  12,  12,   5,   4,  15,  12,
        99,   5,  99,   4, 191,   7,  15,   5, 191,   7,  10,  12,  12,
        15, 191,   6,   5,  12,  15,   7,   7,   1,  16, 191,  15,   1,
         2, 191, 191,  12, 191,   7,   7,  10,   7,   7,   5,   7, 191,
         7,  15,   7,  10,   7,  12,  15,  16,   5,  12,   5,  12,   7,
         7,  12,   6,  10,  10,  12,   5,   4,  10,   4,  12,  1

In [201]:
# Display the true test labels for comparison with the predictions above.
y_test

array([  3,  12,  16,  10,   5, 192,   6,   6,   4,  99,   2,   2,   9,
        15, 192,   6,  10,   7,   1,  10,   1,   1,   5,  10, 191, 191,
         1,  12,   7,  17,   6,   4,  12, 191,   1,  10,  15,   9, 191,
        15,   4,   1,   1,  20,   7,   2,   2,  16,   6,   3,   4,   9,
        12,  12,   7,   1,  10,   4,  10,  12,   9,   1,   1,   4,   6,
        12,  15,  12,   9,  15,   4,   1,  10,   1, 191, 191,  99,   7,
         7,   1,   3, 191,  10,  10,  10,  12,   2, 191,   1, 191,  17,
         5,   3,  10,  12,  12, 191,  15,   7, 191, 192,   6, 192,  15,
       191,  17,  12,   3,  20,  16,   4,  20,   9,   5,   7,   1,   9,
       192,   5,  12,   4, 192,   7, 191,   2,  20,   4,  10,  12,  12,
         1, 191,   6,   3,  12, 192,   7,  10,  10,  16, 191, 192,   1,
        12, 191, 191, 192, 191, 191,  15,   6,  15,  15,  10,   3, 191,
        15,  12,  15,   6,  15,  12,  20, 191,   3,  20,   5,  12,   7,
         7,  17,  10,   6,  10,   9,   5,   4,   3,   3,   2,  1

In [None]:
from itertools import compress

# Remove unlabeled
# Keep only the rows whose label differs from -1. X_train becomes a plain
# list of rows; y_train stays an array. Both names are rebound, so this
# cell is not idempotent.
X_train = [row for row, is_labeled in zip(X_train, y_train != -1)
           if is_labeled]
y_train = y_train[y_train != -1]

In [None]:
# Flatten the 2-D feature containers into flat Python lists.
# np.asarray(...) accepts both an ndarray and a list of rows (which is what
# the "remove unlabeled" cell leaves behind); the original
# np.ndarray.tolist(X_train) raised a TypeError for a list.
X_train = np.asarray(X_train).ravel().tolist()
X_test = np.asarray(X_test).ravel().tolist()

In [None]:
from sklearn.naive_bayes import MultinomialNB # 1
from sklearn.metrics import accuracy_score # Load sklearn tools

# Load press release data (training and test)
# The first CSV column is used as the index; the remaining columns are the
# feature matrix.
X_train = pd.read_csv("semi-files/X_train.csv", index_col = 0).values
# The label files hold a single column. squeeze(axis=1) turns it into a 1-D
# vector (and raises if there is more than one column) instead of calling
# int() on one-element arrays, which NumPy has deprecated.
y_train = pd.read_csv("semi-files/y_train.csv",
                      index_col = 0).values.squeeze(axis=1).astype(int)

X_test = pd.read_csv("semi-files/X_test.csv", index_col = 0).values
y_test = pd.read_csv("semi-files/y_test.csv",
                     index_col = 0).values.squeeze(axis=1).astype(int)

In [268]:
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 19 04:51:16 2019

@author: kennedy
"""

from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import re
import codecs
import os
import numpy as np
from mimetypes import guess_type
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import itertools
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

class docparser(object):
    """
    Loader and splitter for the datasets used by the semi-supervised
    naive Bayes experiment: a plain-text file with one "<label> <text>"
    record per line, or a three-column CSV (label, rating, review).
    """
    def __init__(self):
        pass

    def transform_label_to_numeric(self, y):
        """Return 1 if the raw label token contains the character '1', else 0."""
        if '1' in y:
            return 1
        return 0

    def parse_line(self, row):
        """Split one "<label> <text>" line into ``(cleaned_text, label)``.

        The first space-separated token is the label; in the rest, every
        run of non-word characters is replaced by a single space.
        """
        row = row.split(' ')
        text = (' '.join(row[1:]))
        label = self.transform_label_to_numeric(row[0])
        return (re.sub(r'\W+', ' ', text), label)

    def get_data(self, file_path, Text = True):
        """Read a dataset from ``file_path``.

        :params
          --file_path: path of the file to read
          --Text: True for the line-oriented text format, False for the CSV
        :Returntype:
           Text=True:  (list of documents, numpy array of labels)
           Text=False: (list of reviews, numpy array of labels,
                        one-hot DataFrame of the rating column)
        """
        if Text:
            data = []
            labels = []
            # The context manager closes the handle even if parsing fails
            # (the original never closed it).
            with codecs.open(file_path, 'r', encoding = "utf8",
                             errors = 'ignore') as f:
                for line in f:
                    doc, label = self.parse_line(line)
                    data.append(doc)
                    labels.append(label)
            return data, np.array(labels)
        else:
            import pandas as pd
            df = pd.read_csv(file_path)
            df.columns = ['Label', 'Rating', 'Review']
            rating = pd.get_dummies(df.loc[:, 'Rating'])
            text = []
            labels = []
            for review, label in zip(df.loc[:, 'Review'].values,
                                     df.loc[:, 'Label'].values):
                # Reviews consisting of a single space are dropped.
                if review == ' ':
                    continue
                text.append(review)
                labels.append(label)
            return text, np.array(labels), rating

    def shuffle_dataset(self, X, y, seed=None):
        """Return X and y (numpy arrays) shuffled with the same permutation.

        When ``seed`` is not None the global numpy RNG is seeded first.
        ``seed=0`` counts as a seed (the original ``if seed:`` ignored it).
        """
        if seed is not None:
            np.random.seed(seed)
        idx = [ii for ii in range(X.shape[0])]
        np.random.shuffle(idx)
        return X[idx], y[idx]

    def split(self, data, labels, test_size = 0.3,shuff = True, seed = None):
        '''
        Split the dataset into a leading "supervised" part and a trailing
        "unsupervised" (held-out) part.

        :params
          --data: sequence of samples
          --labels: sequence of labels, aligned with data
          --test_size: fraction of samples that goes to the held-out part
          --shuff: shuffle samples and labels together before splitting
          --seed: optional seed for the shuffle
        :Returntype:
           X_supervised, X_unsupervised, y_supervised, y_unsupervised
        '''
        X = np.array(data)
        y = np.array(labels)
        if shuff:
            X, y = self.shuffle_dataset(X, y, seed)
        # The split is performed for both shuffled and unshuffled data; the
        # original only split in the non-shuffled branch and returned None
        # otherwise.
        # NOTE(review): the floor division on floats can be off by one for
        # fractions such as 0.3 -- kept as in the original.
        split = len(y) - int(len(y) // (1/test_size))
        X_supvsd, X_unsupvsd = X[:split], X[split:]
        # The original bound both halves to the same name and therefore
        # returned the held-out labels twice.
        y_supvsd, y_unsupvsd = y[:split], y[split:]
        return X_supvsd, X_unsupvsd, y_supvsd, y_unsupvsd
      
 #%% Semisupervised NB Classifier
 
class NaiveBayesSemiSupervised(object):
    """
    This class implements a modification of the Naive Bayes classifier
    in order to deal with unlabelled data. We use an Expectation-maximization
    algorithm (EM).
    This work is based on the paper
    'Semi-Supervised Text Classification Using EM' by
    Kamal Nigam Andrew McCallum Tom Mitchell
    available here:
    https://www.cs.cmu.edu/~tom/pubs/NigamEtAl-bookChapter.pdf

    Class labels must be the integers 0 .. n_labels-1: the model iterates
    over ``range(n_labels)`` and ``predict`` returns column indices.
    All probabilities are handled in log space so that long documents do
    not underflow to zero.
    """
    def __init__(self, max_features=None, max_rounds=50, tolerance=1e-6):
        """
        constructor for NaiveBayesSemiSupervised object
        keyword arguments:
            -- max_features: maximum number of features for documents vectorization
            -- max_rounds: maximum number of iterations for EM algorithm
            -- tolerance: threshold on the relative change of the total
               log-likelihood below which EM stops
        """
        self.max_features = max_features
        self.n_labels = 0
        self.max_rounds = max_rounds
        self.tolerance = tolerance

    def train(self, X_supervised, X_unsupervised, y_supervised, y_unsupervised):
        """
        train the modified Naive bayes classifier using both labelled and
        unlabelled data. We use the CountVectorizer vectorizaton method from scikit-learn
        positional arguments:
            -- X_supervised: list of documents (string objects). these documents have labels
                example: ["all parrots are interesting", "some parrots are green", "some parrots can talk"]
            -- X_unsupervised: list of documents (string objects) as X_supervised, but without labels
            -- y_supervised: labels of the X_supervised documents. list or numpy array of integers.
                example: [2, 0, 1, 0, 1, ..., 0, 2]
            -- y_unsupervised: true labels of X_unsupervised, used ONLY to
                report accuracy (never for fitting); may be None, in which
                case no accuracy is computed
        returns the last non-zero accuracy measured on X_unsupervised
        (0 if none was measured).
        """
        count_vec = CountVectorizer(max_features = self.max_features)
        count_vec.fit(X_supervised)
        y_supervised = np.asarray(y_supervised)
        self.n_labels = len(set(y_supervised))
        if self.max_features is None:
            self.max_features = len(count_vec.vocabulary_)
        X_supervised = count_vec.transform(X_supervised).toarray()
        X_unsupervised = count_vec.transform(X_unsupervised).toarray()
        true_unlabelled = None
        if y_unsupervised is not None:
            true_unlabelled = np.asarray(y_unsupervised)

        # Initial model from the labelled documents only.
        self.train_naive_bayes(X_supervised, y_supervised)
        X_all = np.vstack([X_supervised, X_unsupervised])
        old_likelihood = 1
        final_accuracy = 0
        # Count down on a local variable: the original decremented
        # self.max_rounds itself, so a second call to train() ran no rounds.
        for rounds_left in range(self.max_rounds - 1, -1, -1):
            # E-step: label the unlabelled documents with the current model.
            predi = self.predict(X_unsupervised)
            # M-step: re-estimate from labelled AND pseudo-labelled
            # documents (the original refitted on the pseudo-labelled ones
            # only, discarding the real labels after the first round).
            self.train_naive_bayes(X_all, np.concatenate([y_supervised, predi]))
            predi = self.predict(X_unsupervised)
            if true_unlabelled is not None:
                # Fraction of held-out documents predicted correctly (the
                # original counted how many predictions were equal to 1).
                correct_percent = float(np.mean(predi == true_unlabelled))
                if correct_percent > 0:
                    final_accuracy = correct_percent
                print(str(correct_percent) + "%")
            total_likelihood = self.get_log_likelihood( X_supervised, X_unsupervised, y_supervised)
            print("total likelihood: {}".format(total_likelihood))
            if self._stopping_time(old_likelihood, total_likelihood):
                print('Log likelihood not improved..Stopping EM at %s'%rounds_left)
                break
            old_likelihood = total_likelihood
        return final_accuracy

    def _stopping_time(self, old_likelihood, new_likelihood):
        """
        returns True if there is no significant improvement in log-likelihood and false else
        positional arguments:
            -- old_likelihood: log-likelihood for previous iteration
            -- new_likelihood: new log-likelihood
        """
        relative_change = np.absolute((new_likelihood-old_likelihood)/old_likelihood)
        if (relative_change < self.tolerance):
            print("stopping time")
            return True
        else:
            return False

    def _joint_log_likelihood(self, X):
        """
        returns an array of shape (n_documents, n_labels) holding
        log P(c) + sum_w count(w) * log P(w | c) for every document/label pair
        positional arguments:
             -- X: data matrix, 2-dimensional numpy ndarray of word counts
        """
        X = np.asarray(X, dtype=float)
        return np.dot(X, np.log(self.word_proba_array)) + np.log(self.labels_proba_array)

    def get_log_likelihood(self, X_supervised, X_unsupervised, y_supervised):
        """
        returns the total log-likelihood of the model, taking into account unsupervised data
        positional arguments:
            -- X_supervised: count matrix of the labelled documents
            -- X_unsupervised: count matrix of the unlabelled documents
            -- y_supervised: labels of the X_supervised documents. list or numpy array of integers.
                example: [2, 0, 1, 0, 1, ..., 0, 2]
        """
        # Unlabelled documents: log of the probability summed over labels.
        jll_unsupervised = self._joint_log_likelihood(X_unsupervised)
        unsupervised_term = np.sum(np.logaddexp.reduce(jll_unsupervised, axis=1))
        # Labelled documents: log joint probability of each document with
        # ITS OWN label. The original np.take indexed the flattened array
        # and therefore read unrelated entries.
        jll_supervised = self._joint_log_likelihood(X_supervised)
        own_label = np.asarray(y_supervised, dtype=int)
        rows = np.arange(jll_supervised.shape[0])
        supervised_term = np.sum(jll_supervised[rows, own_label])
        total_likelihood = supervised_term + unsupervised_term
        return total_likelihood

    def word_proba(self, X, y, c):
        """
        returns a numpy array with one entry per feature containing the
        Laplace-smoothed conditional probability of each word given the label c
        positional arguments:
            -- X: data matrix, 2-dimensional numpy ndarray
            -- y: numpy array of labels, example: np.array([2, 0, 1, 0, 1, ..., 0, 2])
            -- c: integer, the class upon which we condition
        """
        in_class = np.equal( y, c )
        numerator = 1 + np.sum( X[in_class], axis=0)
        # The smoothing term is the actual number of features in X, so the
        # model stays consistent when the vocabulary is smaller than
        # max_features.
        denominator = X.shape[1] + np.sum( X[in_class])
        return np.squeeze(numerator)/denominator

    def class_proba(self, X, y, c):
        """
        returns the Laplace-smoothed prior probability of label c
        positional arguments:
            -- X: data matrix, 2-dimensional numpy ndarray
            -- y: numpy array of labels, example: np.array([2, 0, 1, 0, 1, ..., 0, 2])
            -- c: integer, the class whose prior is computed
        """
        numerator = 1 + np.sum( np.equal( y, c) , axis=0)
        denominator = X.shape[0] + self.n_labels
        return numerator/denominator

    def train_naive_bayes(self, X, y):
        """
        train a regular Naive Bayes classifier
        positional arguments:
             -- X: data matrix, 2-dimensional numpy ndarray
             -- y: numpy array of labels, example: np.array([2, 0, 1, 0, 1, ..., 0, 2])
        stores word_proba_array (n_features, n_labels) and
        labels_proba_array (n_labels,) on the instance.
        """
        word_proba_array = np.zeros((X.shape[1], self.n_labels))
        labels_proba_array = np.zeros(self.n_labels)
        for c in range(self.n_labels):
            word_proba_array[:,c] = self.word_proba( X, y, c)
            labels_proba_array[c] = self.class_proba( X, y, c)
        self.word_proba_array = word_proba_array
        self.labels_proba_array = labels_proba_array

    def _predict_proba_unormalized(self, X_test):
        """
        returns unormalized predicted probabilities P(c) * prod_w P(w|c)^count
        positional arguments:
             -- X: data matrix, 2-dimensional numpy ndarray
        NOTE: the values can underflow to 0 for long documents; internal
        code uses _joint_log_likelihood instead.
        """
        return np.exp(self._joint_log_likelihood(X_test))

    def predict_proba(self, X):
        """
        returns model predictions (probability), each row summing to 1
        positional arguments:
             -- X: data matrix, 2-dimensional numpy ndarray
        """
        jll = self._joint_log_likelihood(X)
        # Subtract the row maximum before exponentiating so the largest
        # term is exp(0) = 1 and the normaliser can never be zero.
        shifted = np.exp(jll - np.max(jll, axis=1)[:, np.newaxis])
        return np.true_divide(shifted, np.sum(shifted, axis=1)[:, np.newaxis])

    def predict(self, X):
        """
        returns model predictions (class labels)
        positional arguments:
             -- X: data matrix, 2-dimensional numpy ndarray
        """
        return np.argmax(self._joint_log_likelihood(X), axis=1)
      
if __name__ == '__main__':
    # Driver: n_sets-fold evaluation of NaiveBayesSemiSupervised. In every
    # fold one slice of the data is treated as unlabelled and the rest as
    # labelled.
    import os
    import random
    random.seed(23)
    from sklearn.model_selection import train_test_split
    #import naive_bayes
    #import em
    import nltk
    #---------Global parameters-----------------
    Text = False
    max_features=None
    NSPLIT = 4
    n_sets = 10
    set_size = 1.0 / n_sets
    cumulative_percent = 0
    #set project directory
    # NOTE(review): machine-specific absolute path; the captured output of
    # this cell shows it failing with FileNotFoundError. Consider making the
    # directory configurable.
    os.chdir('D:\\FREELANCER\\SEMI_NB_TEXT_CLASSIFICATION')
    file_path = os.path.join('DATASET','stas_train.text')
    #----------NaiveBayes with crossval
#    labeled_reviews = naive_bayes.get_labeled_reviews("D:\\FREELANCER\\SEMI_NB_TEXT_CLASSIFICATION\\DATASET\\mytracks_NaiveBayes_Filter.csv")
    if Text:
        # Load the plain-text dataset. The original branch was `pass`, which
        # left `data`/`labels` undefined and crashed further down with a
        # NameError whenever Text was switched on.
        data, labels = docparser().get_data(file_path, Text)
    else:
        data, labels, rating = docparser().get_data(os.path.join('DATASET','mytracks_NaiveBayes_Filter.csv'), Text)
        #----uncomment the line below to run both your NB and the semi-NB together
#      labeled_reviews = naive_bayes.get_labeled_reviews(os.path.join('DATASET','mytracks_NaiveBayes_Filter.csv'))
#      all_words = {}
#      for (r, label) in labeled_reviews:
#          for word in r.split(" "):
#              if len(word) > 1:
#                  all_words[word] = 0
#    #featureset for NB
#    featuresets = [(naive_bayes.review_features(r, all_words), label) for (r, label) in labeled_reviews]
#    print('Start Naive Bayes Classification')
#    naive_bayes.cross_validation(featuresets, n_sets)
    print('End of Naive Bayes Classiation\n')
    print('*'*40)
    print('Begin Semi NaiveBayes Classification')
    #validation
    _cum_acc = []
    for i in range(0, n_sets):
        # Fold i covers data[split_start:split_end]; everything before and
        # after it forms the labelled set.
        n_training = int(set_size * len(data))
        split_start = i * n_training
        split_end = (i + 1) * n_training
        train_data_before = data[:split_start]
        train_data_after = data[split_end:]
        X_supervised = train_data_before + train_data_after
        X_unsupervised = data[split_start:split_end]
        #for labels
        train_labels_before = labels[:split_start]
        train_labels_after = labels[split_end:]
        y_supervised = list(train_labels_before) + list(train_labels_after)
        y_unsupervised = labels[split_start:split_end]
        #Max features should be left as it is 5738
        clf = NaiveBayesSemiSupervised(max_features)
        #train and evaluate accuracy
        rst = clf.train(X_supervised, X_unsupervised, np.array(y_supervised), y_unsupervised)
        _cum_acc.append(rst)
    print('*'*40)
    print('End of Semi-NaiveBayes')
    print('Final Accuracy after {}fold Cross-validation is: {}'.format(n_sets, np.mean(np.array(_cum_acc))))




      
      

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\FREELANCER\\SEMI_NB_TEXT_CLASSIFICATION'

In [267]:
import csv
import random
import nltk


def get_labeled_reviews(path_to_csv):
    """Read a review CSV and return a list of ``(review_text, label)`` tuples.

    The first row is treated as a header and skipped. In every remaining
    row, column 0 is converted to an integer label and column 2 is taken as
    the review text.
    """
    reviews = []
    with open(path_to_csv, newline='', encoding='utf-8') as handle:
        rows = csv.reader(handle, delimiter=',', quotechar='"')
        next(rows, None)  # drop the header line, if there is one
        for fields in rows:
            label, text = int(fields[0]), fields[2]
            reviews.append((text, label))
    return reviews


def review_features(review, all_words):
    """Build a word-count feature dict for one review.

    Starts from a copy of ``all_words`` (which is left untouched) and adds
    one to the count of every space-separated token of ``review`` that is
    longer than one character. Tokens not yet present start at zero.
    """
    counts = all_words.copy()
    for token in str.split(review, " "):
        if len(token) <= 1:
            # Single characters and the empty strings produced by repeated
            # spaces carry no signal.
            continue
        counts[token] = counts.get(token, 0) + 1
    return counts

def cross_validation(all_data, n_sets):
    """Run n_sets-fold cross-validation of an NLTK naive Bayes classifier.

    all_data is a list of ``(feature_dict, label)`` pairs. A shuffled copy
    is cut into ``n_sets`` consecutive folds of ``int(len(all_data) / n_sets)``
    samples; each fold is used once as the test set while the remaining
    samples train the classifier. Per-fold results and their average are
    printed; nothing is returned.

    Raises ValueError when there are fewer samples than folds.
    """
    set_size = 1.0 / n_sets
    shuffled_data = all_data.copy()
    random.shuffle(shuffled_data)
    cumulative_percent = 0
    fold_size = int(set_size * len(all_data))
    if fold_size == 0:
        raise ValueError("need at least n_sets samples to cross-validate")
    # Visit every fold: the original looped over range(0, 2) yet divided the
    # accumulated score by n_sets, so the printed average was wrong for any
    # n_sets other than 2.
    for fold in range(n_sets):
        split_start = fold * fold_size
        split_end = (fold + 1) * fold_size
        print("train split_start: " + str(split_start) + " - split_end: " + str(split_end))
        train_data = shuffled_data[:split_start] + shuffled_data[split_end:]
        test_data = shuffled_data[split_start:split_end]
        # Report the sizes instead of dumping every feature dict, which the
        # original did three times per fold as a debugging aid.
        print("train size: " + str(len(train_data)) + " - test size: " + str(len(test_data)))
        classifier = nltk.NaiveBayesClassifier.train(train_data, nltk.LaplaceProbDist)
        correct = sum(1 for features, label in test_data
                      if classifier.classify(features) == label)
        print(str(correct) + "/" + str(len(test_data)))
        correct_percent = correct/len(test_data)
        cumulative_percent += correct_percent
        print(str(correct_percent) + "%")
    print("Average result: " + str(cumulative_percent / n_sets) + "%")






