In [22]:
import pandas as pd
import numpy as np
import time

from scipy.linalg.blas import ddot ,  dscal , daxpy
from scipy.stats import chi2_contingency
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator

In [23]:
## For local directory
import os
os.chdir('/Users/andreasnilsson/Desktop/Master DS/5. Applied Machine Learning/A 4')

In [24]:
# This function reads the corpus, returns a list of documents, and a list
# of their corresponding polarity labels. 
def read_data(corpus_file):
    X = []
    Y = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            _, y, _, x = line.split(maxsplit=3)
            X.append(x.strip())
            Y.append(y)
    return X, Y

# Read all the documents.
X, Y = read_data('all_sentiment_shuffled.txt')

# Split into training and test parts.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2,
                                                random_state=0)

In [25]:


class LinearClassifier(BaseEstimator):
    """
    General class for binary linear classifiers. Implements the predict
    function, which is the same for all binary linear classifiers. There are
    also two utility functions.
    """

    def decision_function(self, X):
        """
        Computes the decision function for the inputs X. The inputs are assumed to be
        stored in a matrix, where each row contains the features for one
        instance.
        """
        return X.dot(self.w)

    def predict(self, X):
        """
        Predicts the outputs for the inputs X. The inputs are assumed to be
        stored in a matrix, where each row contains the features for one
        instance.
        """

        # First compute the output scores
        scores = self.decision_function(X)

        # Select the positive or negative class label, depending on whether
        # the score was positive or negative.
        out = np.select([scores >= 0.0, scores < 0.0],
                        [self.positive_class,
                         self.negative_class])
        return out

    def find_classes(self, Y):
        """
        Finds the set of output classes in the output part Y of the training set.
        If there are exactly two classes, one of them is associated to positive
        classifier scores, the other one to negative scores. If the number of
        classes is not 2, an error is raised.
        """
        classes = sorted(set(Y))
        if len(classes) != 2:
            raise Exception("this does not seem to be a 2-class problem")
        self.positive_class = classes[1]
        self.negative_class = classes[0]

    def encode_outputs(self, Y):
        """
        A helper function that converts all outputs to +1 or -1.
        """
        return np.array([1 if y == self.positive_class else -1 for y in Y])

    
class LG(LinearClassifier):
    """
    A straightforward implementation of the Logistic Regression learning algorithm.
    """

    def __init__(self, n_iter=20):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        self.w = np.zeros(n_features)
        
        lambda_ = 1 / len(Y)
        t = 1

        # Perceptron algorithm:
        for i in range(self.n_iter):
            
            lf = 0
            for x, y in zip(X, Ye):
                
                t = t + 1
                eta = 1 / ( lambda_ * t ) 

                # Compute the output score for this instance.
                score = x.dot(self.w)
                lf += max(0,1-y*score)

                self.w = ( 1 -  eta *  lambda_) *  self.w  +  (y / (1 + np.exp(y * (score)) )) * x
                
            store = lf / X.shape[0] + lambda_*(self.w.dot(self.w))/2
            print('Value of Objective Function: ' + str(store))
                
        return X.dot(self.w) 


class SVC(LinearClassifier):
    """
    A straightforward implementation of the Support Vector Classifier learning algorithm.
    """

    def __init__(self, n_iter=20):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        self.w = np.zeros(n_features)
        
        lambda_ = 1 / len(Y)
        
        t = 1

        # Perceptron algorithm:
        for i in range(self.n_iter):
            
            for x, y in zip(X, Ye):
                
                t = t + 1
                
                eta = 1 / ( lambda_ * t ) 

                # Compute the output score for this instance.
                score = x.dot(self.w)

                # If there was an error, update the weights.
                if y*score <= 1:
                    self.w = ( 1 -  eta *  lambda_) *  self.w       + ( eta * y ) *x           
                else:
                    self.w = ( 1 -  eta *  lambda_) *  self.w 
        return X.dot(self.w)

## Logistic Regression (inc optional task)

In [26]:
if __name__ == '__main__':

    # Set up the preprocessing steps and the classifier.
    pipeline = make_pipeline(
        TfidfVectorizer(),
        SelectKBest(k=1000),
        Normalizer(), 
        LG()
    )

    # Train the classifier.
    t0 = time.time()
    pipeline.fit(Xtrain, Ytrain)
    t1 = time.time()
    print('Training time: {:.2f} sec.'.format(t1-t0))

    # Evaluate on the test set.
    Yguess = pipeline.predict(Xtest)
    print('Accuracy: {:.4f}.'.format(accuracy_score(Ytest, Yguess)))

Value of Objective Function: 0.5229719253968691
Value of Objective Function: 0.45875975661451723
Value of Objective Function: 0.46529796395257167
Value of Objective Function: 0.4795624927168698
Value of Objective Function: 0.49582357804931276
Value of Objective Function: 0.512744459129346
Value of Objective Function: 0.5297684396024034
Value of Objective Function: 0.5466850808421864
Value of Objective Function: 0.5633763284643513
Value of Objective Function: 0.579660469248121
Value of Objective Function: 0.5955811278243239
Value of Objective Function: 0.6111457583059363
Value of Objective Function: 0.6263727151412634
Value of Objective Function: 0.6412650904304197
Value of Objective Function: 0.6558346985205822
Value of Objective Function: 0.6700833963635533
Value of Objective Function: 0.6840206913796444
Value of Objective Function: 0.6976488807155251
Value of Objective Function: 0.7109750323400353
Value of Objective Function: 0.7240085871975569
Training time: 3.00 sec.
Accuracy: 0.82

## Support Vector Classifier

In [27]:
if __name__ == '__main__':

    # Set up the preprocessing steps and the classifier.
    pipeline = make_pipeline(
        TfidfVectorizer(),
        SelectKBest(k=1000),
        Normalizer(), 
        SVC()
    )

    # Train the classifier.
    t0 = time.time()
    pipeline.fit(Xtrain, Ytrain)
    t1 = time.time()
    print('Training time: {:.2f} sec.'.format(t1-t0))

    # Evaluate on the test set.
    Yguess = pipeline.predict(Xtest)
    print('Accuracy: {:.4f}.'.format(accuracy_score(Ytest, Yguess)))

Training time: 1.86 sec.
Accuracy: 0.8326.


# Bonus task 1. Making your code more efficient

## Support Vector Classifier Faster (a part) 

In [30]:
class SVC_faster(LinearClassifier):
    """
    A straightforward implementation of the perceptron learning algorithm.
    """

    def __init__(self, n_iter=20):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        self.w = np.zeros(n_features)
        
        lambda_ = 1 / len(Y)
        t = 1
        # Perceptron algorithm:
        for i in range(self.n_iter):
            
            for x, y in zip(X, Ye):
                
                t +=  1
                
                eta = 1 / ( lambda_ * t ) 

                # Compute the output score for this instance.
                score =  ddot(x, self.w)

                # If there was an error, update the weights.
                if y*score < 1:
                    self.w = dscal(( 1 -  eta *  lambda_), self.w)  
                    daxpy(x, self.w , a = (eta * y))
                    
                else:
                    self.w = dscal(( 1 -  eta *  lambda_), self.w)
        return X.dot(self.w)


In [31]:
if __name__ == '__main__':

    # Set up the preprocessing steps and the classifier.
    pipeline = make_pipeline(
        TfidfVectorizer(),
        SelectKBest(k=1000),
        Normalizer(),

        # NB that this is our Perceptron, not sklearn.linear_model.Perceptron
        #Perceptron()  
        SVC_faster()
    )

    # Train the classifier.
    t0 = time.time()
    pipeline.fit(Xtrain, Ytrain)
    t1 = time.time()
    print('Training time: {:.2f} sec.'.format(t1-t0))

    # Evaluate on the test set.
    Yguess = pipeline.predict(Xtest)
    print('Accuracy: {:.4f}.'.format(accuracy_score(Ytest, Yguess)))

Training time: 1.43 sec.
Accuracy: 0.8326.


The BLAS operation imporves the speed of the code as we see in the print

## B & C questions on Optional

In [32]:
##### The following part is for the optional task.

### Sparse and dense vectors don't collaborate very well in NumPy/SciPy.
### Here are two utility functions that help us carry out some vector
### operations that we'll need.

def add_sparse_to_dense(x, w, factor):
    """
    Adds a sparse vector x, scaled by some factor, to a dense vector.
    This can be seen as the equivalent of w += factor * x when x is a dense
    vector.
    """
    w[x.indices] += factor * x.data

def sparse_dense_dot(x, w):
    """
    Computes the dot product between a sparse vector x and a dense vector w.
    """
    return np.dot(w[x.indices], x.data)


class SparseSVC(LinearClassifier):
    """
    A straightforward implementation of the perceptron learning algorithm,
    assuming that the input feature matrix X is sparse.
    """

    def __init__(self, n_iter=20):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.

        Note that this will only work if X is a sparse matrix, such as the
        output of a scikit-learn vectorizer.
        """
        self.find_classes(Y)

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        Ye = self.encode_outputs(Y)

        # Initialize the weight vector to all zeros.
        self.w = np.zeros(X.shape[1])

        # Iteration through sparse matrices can be a bit slow, so we first
        # prepare this list to speed up iteration.
        XY = list(zip(X, Ye))
        
                
        lambda_ = 1 / len(Y)
        t = 1

        for i in range(self.n_iter):
            a = 1

            for x, y in XY:
                
                t+=1
                eta = 1 / (lambda_*t)
                
                # Compute the output score for this instance.
                # (This corresponds to score = x.dot(self.w) above.)
                score = sparse_dense_dot(x, self.w) * a

                # If there was an error, update the weights.
                if y*score < 1:
                    
                    a = (1 - eta * lambda_)* a 
                    add_sparse_to_dense(x, self.w, eta * y / a)
                else:
                    a = (1 - eta * lambda_)* a
                    
            self.w = a * self.w
        
                    

In [33]:
if __name__ == '__main__':

    # Set up the preprocessing steps and the classifier.
    pipeline = make_pipeline(
        TfidfVectorizer(),
        #SelectKBest(k=1000),
        Normalizer(),

        # NB that this is our Perceptron, not sklearn.linear_model.Perceptron
        SparseSVC()
    )

    # Train the classifier.
    t0 = time.time()
    pipeline.fit(Xtrain, Ytrain)
    t1 = time.time()
    print('Training time: {:.2f} sec.'.format(t1-t0))

    # Evaluate on the test set.
    Yguess = pipeline.predict(Xtest)
    print('Accuracy: {:.4f}.'.format(accuracy_score(Ytest, Yguess)))

Training time: 2.67 sec.
Accuracy: 0.8410.


If we run SparseSVC without SelectKBest we see the result abouve

In [34]:
if __name__ == '__main__':

    # Set up the preprocessing steps and the classifier.
    pipeline = make_pipeline(
        TfidfVectorizer(),
        #SelectKBest(k=1000),
        Normalizer(),

        # NB that this is our Perceptron, not sklearn.linear_model.Perceptron
        SVC()
    )

    # Train the classifier.
    t0 = time.time()
    pipeline.fit(Xtrain, Ytrain)
    t1 = time.time()
    print('Training time: {:.2f} sec.'.format(t1-t0))

    # Evaluate on the test set.
    Yguess = pipeline.predict(Xtest)
    print('Accuracy: {:.4f}.'.format(accuracy_score(Ytest, Yguess)))

Training time: 11.87 sec.
Accuracy: 0.8410.


With the initall SVC without SelectKBest the runing time is significantly longer