# Import modules

In [None]:
# Standard libraries
import os
import pickle
import time

# Third-party libraries
import numpy as np
import pandas as pd

# ML libraries
from sklearn.decomposition import NMF
from sklearn.utils.extmath import randomized_svd

# Show floats with three decimal places whenever pandas displays them
def _three_decimal_places(value):
    return '%.3f' % value


pd.set_option('display.float_format', _three_decimal_places)

# Define metrics

In [None]:
# To compare both models we need a metric to unify the algorithms. We use RMSE since this approach provides a nice
# meaning in the metric. i.e avg difference between what rating we get vs what we predicted

def calc_rmse(actual, pred, observed_only=False):
    """
    Root-mean-squared error between the observed and the predicted values.

    Inputs: actual, a numpy array of our observed data
            pred, a numpy array of our predicted data, for NMF it would be W*H
            observed_only, a boolean value (default False). When False the error is
                averaged over every entry, which is what this function has always
                returned. When True only the entries where `actual` is non-zero
                contribute to the average.
    Output: rmse; nan when observed_only is True and `actual` has no non-zero entry
    """
    squared_error = (pred - actual) ** 2

    if not observed_only:
        return np.sqrt(squared_error.mean())

    # Restrict the average to the entries that were actually observed
    observed = np.asarray(actual) != 0
    if not observed.any():
        return np.nan
    return np.sqrt(np.asarray(squared_error)[observed].mean())

In [None]:
# prediction function
# there are two ways to do prediction: 
# 1) look at entries of WH on a specific row 
# 2) np.dot(W[i,:],H[i,:])

# point 2 is for comparing other contracts and recommend clauses from them via nearest neighbours

def predict(X_hat, actual, ref, filter_actual, idx, n_rec):
    """
    Show the highest-scoring clauses of the reconstructed matrix for one contract.

    Inputs: X_hat : numpy array, in NMF this is WH, in SVD this is USV
            actual: a numpy array, the original X matrix
            ref: a path to a pickled pd.DataFrame (read with pd.read_pickle), or the
                 pd.DataFrame itself. Only its column labels are used; they become
                 the index (clause names) of the returned table, so there must be
                 one column per column of X_hat
            filter_actual: a boolean value, when True the actuals are subtracted from
                 X_hat and the result is clipped to [0, 1] so only predictions remain
            idx: an integer, contract index (row of X_hat)
            n_rec: an integer, shows top n_rec rows

    Output: pandas DataFrame object with a single column 0 holding the scores of
            row idx, indexed by clause, sorted descending and cut to n_rec rows

    Notes: `ref` is presumably the DataFrame version of the ratings matrix
    (contracts as rows, clauses as columns) - verify against the pickled file.
    """
    # Only the column labels of the reference frame are needed (the clauses)
    if isinstance(ref, pd.DataFrame):
        clause_frame = ref
    else:
        clause_frame = pd.read_pickle(ref)
    clauses = clause_frame.columns.values

    # Condition to remove actuals and leave preds
    if filter_actual:
        diff = np.clip(X_hat - actual, 0, 1)
    else:
        diff = X_hat

    print("For contract", idx, "the top", n_rec, "recommended clauses are:")

    # Subset on the contract row, sort descending and keep the first n_rec clauses
    scores = pd.DataFrame(data=diff[idx, :], index=clauses)
    return scores.sort_values(by=0, axis=0, ascending=False).head(n_rec)

# Loading the Ratings Matrix

In [None]:
# Load the pickled ratings matrix objects (presumably the DataFrame version and
# the raw matrix - verify against whatever wrote these files).
# NOTE: unpickling can execute arbitrary code, only load files from a trusted source.
# The context managers make sure both file handles are closed after loading.
with open("ratings_matrix_df.pickle", "rb") as file1:
    X_df = pickle.load(file1)

with open("ratings_matrix.pickle", "rb") as file2:
    X = pickle.load(file2)

# NMF

Here we employ NMF: Non-negative matrix factorisation.

Our goal in NMF is to approximate this matrix by the dot product of two arrays $W$ and $H$. 

Dimensions of the arrays are defined by the dimensions of $X$ and the number of components we give the algorithm. If $X$ has $n$ contracts/rows and $m$ clauses/columns and we want to decompose it into $k$ components, then $W$ has $n$ contracts/rows and $k$ components/columns, and $H$ has $k$ components/rows and $m$ clauses/columns.

$X$ is our contract-clauses matrix of dimension $n \times m$ i.e contracts = rows, clauses = cols

$W$ is interpreted as follows: if a contract has clause $y$, what additional weight is assigned to a group or, in our case, to a set of "similar-clauses".

$H$: the higher the weight value, the more strongly the clause belongs to a group of "similar-clauses".

Both $W$ and $H$ are initialised to some values - similar to how the weights and biases of a neural network are initialised.

Good example and interpretation: https://medium.com/logicai/non-negative-matrix-factorization-for-recommendation-systems-985ca8d5c16c

In [None]:
# Recall that NMF seeks to break down a matrix X into W and H
# Such that X ≈ W*H

def train_val_NMF_model(data, components, alph, method, verbose):
    """
    Fit one NMF model per (components, alph) combination and report the combination
    with the lowest reconstruction error.

    Inputs: data: numpy array object, for fit_transform() method
            components: a list object, range of component parameters
            alph: a list object, range of regularisation parameters
            method: a string object, defines what initialisation is needed for NMF training
            verbose: boolean value, turns on verbose on or off

    Outputs: errors: a list of frobenius norm of residual matrix between data and the representation(W,H)
             config: a list of configurations used to get the errors
             Ws: a list of W components for each configuration of [components, alph] s.t X ≈ W*H
             Hs: a list of H components for each configuration of [components, alph] s.t X ≈ W*H

    Raises: ValueError if 'verbose' is not a bool, if 'method' is not a string, or if
            'components' or 'alph' is empty (there would be no model to fit)
    """
    if not isinstance(verbose, bool):
        raise ValueError("'verbose' variable is not boolean-type, use 'True' or 'False' to control verbose")

    if not isinstance(method, str):
        raise ValueError(" 'method' variable is not string, see init parameters https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html")

    # Materialise the search grid so emptiness can be checked before any work is done;
    # otherwise the best-configuration lookup below fails on an empty error list.
    components = list(components)
    alph = list(alph)
    if not components or not alph:
        raise ValueError("'components' and 'alph' must each contain at least one value to search over")

    start = time.perf_counter()
    errors = []
    config = []
    Ws = []
    Hs = []

    print("Initialisation:", method)
    print("Training and validating...")
    for comp in components:
        for alphas in alph:

            # NOTE(review): newer scikit-learn releases are understood to replace
            # 'alpha' with 'alpha_W' / 'alpha_H' (with a different scaling) - confirm
            # against the installed version before upgrading.
            NMF_model = NMF(
                        verbose = verbose,
                        n_components = comp,
                        init = method,
                        solver = 'mu',
                        beta_loss = 'frobenius', # also called Euclidean Norm
                        tol = 1e-4,
                        random_state = 0,
                        alpha = alphas,
                        max_iter = 10000
                       )

            W = NMF_model.fit_transform(data)
            H = NMF_model.components_

            errors.append(NMF_model.reconstruction_err_)
            config.append([comp, alphas])
            Ws.append(W)
            Hs.append(H)

    # Position of the lowest error; the same position indexes config, Ws and Hs
    argmin_error = np.argmin(errors)
    best_config = config[argmin_error]
    best_error = errors[argmin_error]

    print("Training and validating complete")
    print("Time elapsed in minutes: ", (time.perf_counter() - start)/60)
    print("")
    print("Best configuration:", best_config, "with error:", best_error)
    print("Subset W, H at index:", argmin_error)
    print("---------------------------------------")
    return errors, config, Ws, Hs

In [None]:
# Run the NMF search on X with 'random' initialisation, one regularisation value
# (0.01) and verbose switched off.
# NOTE(review): the list of component counts below is empty, so no model is fitted
# and there is no best configuration to pick - fill in the candidate values
# (e.g. [5, 10, 20]) before running this cell.
r_error, r_configs, r_Ws, r_Hs = train_val_NMF_model(X, 
                                                     [], 
                                                     [0.01], 
                                                     'random', 
                                                     False)

## NMF predictions

In [None]:
# Rebuild the ratings matrix from the factors of the lowest-error configuration;
# this is the index train_val_NMF_model prints as "Subset W, H at index".
best_nmf_idx = int(np.argmin(r_error))
NMF_Xhat = r_Ws[best_nmf_idx] @ r_Hs[best_nmf_idx]
pd.DataFrame(NMF_Xhat).head()

In [None]:
# predict() takes six arguments; the third (`ref`) is the pickled DataFrame whose
# column labels name the clauses. NOTE(review): assumes "ratings_matrix_df.pickle"
# (loaded earlier as X_df) is that frame - confirm.
predict(NMF_Xhat, X, "ratings_matrix_df.pickle", True, 14, 10)

# SVD

Core idea of SVD is similar to NMF where we want to express our contract-clauses matrix as a product of matrices in a smaller dimension.

The only difference is the training process and the components we obtain. In NMF we obtain two components $W,H$ and in SVD we obtain three components $U,S,V^T$. SVD components are obtained via linear algebra techniques.

But there is very little interpretability - hard to explain to non-technical people what is going on.

In addition to looking at the entries for predictions, the SVD approach allows us to project a specific contract into a smaller space and thus compare contracts (via some distance metric) and get recommendations from similar contracts.

In [None]:
# Here we use SVD approach, idea is we want to decompose X = UΣV^(T)

def train_val_SVD_model(data, components):
    """
    Run a truncated (randomized) SVD of `data` for every number of components and
    report which one reconstructs `data` with the lowest RMSE.

    Inputs: data: numpy array object, the matrix to decompose (passed to randomized_svd)
            components: a list object, range of component parameters

    Outputs: config: a list of configurations ([n_components]) that were tried
             U_list: a list with the U matrix of each configuration
             S_list: a list with the singular values of each configuration, laid out
                     as a square diagonal matrix so that data ≈ U@S@V_t
             V_t_list: a list with the V^T matrix of each configuration

    Raises: ValueError if 'components' is empty (there would be nothing to decompose)
    """
    # Materialise the search range so emptiness can be checked before any work is done;
    # otherwise the best-configuration lookup below fails on an empty RMSE list.
    components = list(components)
    if not components:
        raise ValueError("'components' must contain at least one value to search over")

    start = time.perf_counter()
    config = []
    U_list = []
    S_list = []
    V_t_list = []
    rmse_list = []

    print("Performing tSVD...")
    for comp in components:
        print("Number of components:", comp)

        U, singular_values, V_t = randomized_svd(
                    M = data,
                    n_components = comp,
                    random_state = 0
                    )
        # sklearn returns the singular values as a 1-D array, but they belong on the
        # diagonal of a matrix. np.diag zero-fills the off-diagonal entries; an
        # uninitialised np.ndarray would leave arbitrary memory contents there.
        S = np.diag(singular_values)

        # Reconstruction of the data
        data_pred = U@S@V_t

        rmse = calc_rmse(data, data_pred)

        print("RMSE:", rmse)
        print("")

        rmse_list.append(rmse)
        config.append([comp])
        U_list.append(U)
        S_list.append(S)
        V_t_list.append(V_t)

    # Position of the lowest RMSE; the same position indexes the returned lists
    lowest_rmse_idx = np.argmin(rmse_list)

    print("tSVD complete")
    print("Best # of components to choose is", config[lowest_rmse_idx],",","Subset on index:",lowest_rmse_idx)
    print("Time elapsed in minutes: ", (time.perf_counter() - start)/60)

    return config, U_list, S_list, V_t_list

In [None]:
# Run the truncated SVD search on X.
# NOTE(review): the list of component counts below is empty, so nothing is
# decomposed and there is no best configuration to pick - fill in the candidate
# values (e.g. [5, 10, 20]) before running this cell.
conf, U, S, V = train_val_SVD_model(X, 
                                    [])

## SVD predictions

In [None]:
# train_val_SVD_model does not return its RMSE values, so they are recomputed here
# to pick the configuration with the lowest reconstruction error (the index the
# function prints as "Subset on index") and rebuild the ratings matrix from it.
svd_rmse = [calc_rmse(X, u @ s @ v_t) for u, s, v_t in zip(U, S, V)]
best_svd_idx = int(np.argmin(svd_rmse))
SVD_Xhat = U[best_svd_idx] @ S[best_svd_idx] @ V[best_svd_idx]
pd.DataFrame(SVD_Xhat).head()

In [None]:
# predict() takes six arguments; the third (`ref`) is the pickled DataFrame whose
# column labels name the clauses. NOTE(review): assumes "ratings_matrix_df.pickle"
# (loaded earlier as X_df) is that frame - confirm.
predict(SVD_Xhat, X, "ratings_matrix_df.pickle", True, 21, 10)