In [1]:
import matplotlib.pyplot as plt
from scipy import sparse
import numpy as np
import pickle
from sklearn.decomposition import PCA, TruncatedSVD

In [2]:
def similar(i, Xhat):
    """
    Finds the documents most and least similar to document i, measured
    by cosine similarity between the rows of Xhat.

    Document i itself is never a candidate for either result (unless it
    is the only document, in which case its own index is returned twice).
    A row whose norm is zero has no defined direction; its similarity to
    every other row is taken to be 0 instead of NaN.

    Parameters:
    i index of the query document (row of Xhat)
    Xhat decomposed data, 2-D array-like with one row per document

    Returns:
    index_most: index of the document most similar to document i
    index_least: index of the document least similar to document i
    """
    Xhat = np.asarray(Xhat, dtype=float)
    xi = Xhat[i]

    # Cosine similarity of every row against row i, computed in one shot.
    norms = np.linalg.norm(Xhat, axis=1)
    denom = norms * norms[i]
    sims = np.zeros(len(Xhat))
    # Entries with a zero denominator are skipped and keep their initial 0.0,
    # so an all-zero row cannot inject NaN into argmax / argmin.
    np.divide(Xhat @ xi, denom, out=sims, where=denom > 0)

    # Exclude the query document explicitly. Overwriting its score with the
    # mean is not enough: the mean includes the self-similarity of 1.0 and
    # can exceed every other score, making the document its own best match.
    most_candidates = sims.copy()
    most_candidates[i] = -np.inf
    least_candidates = sims.copy()
    least_candidates[i] = np.inf

    return np.argmax(most_candidates), np.argmin(least_candidates)

In [3]:
def find_similiar(i, X, l=7, random_state=None):
    """
    Uses LSI (a truncated SVD of the document-term matrix X) to find the
    documents most and least similar to document i.

    Parameters:
        i (int): Row index in X of the query document
        X: Document-term matrix with one row per document; it is passed
            unchanged to TruncatedSVD.fit_transform
        l (int): Number of SVD components to keep
        random_state: Forwarded to TruncatedSVD. Pass an int to make the
            decomposition (and therefore the returned indices) repeatable
            across runs; the default None keeps the previous unseeded
            behaviour

    Returns:
        tuple of int: (index of the most similar document,
                       index of the least similar document)
    """
    # Project every document onto the first l singular directions.
    # This is a truncated SVD, not PCA.
    Xhat = TruncatedSVD(n_components=l, random_state=random_state).fit_transform(X)

    # Rank the other documents against document i in the reduced space.
    return similar(i, Xhat)

In [4]:
# Load the saved sparse recipe matrix (file name suggests TF-IDF weights — TODO confirm).
X = sparse.load_npz("data-cleaned/recipes_tfidf.npz")
# Row index of the query recipe.
i = 1
# Reduce X to 40 SVD components, then get the rows most / least similar to row i.
sim, not_sim = find_similiar(i, X, l=40)
# NOTE(review): recipe_comp and recipe_info are not defined in any cell shown in
# this notebook; the recorded output of this cell is
# "NameError: name 'recipe_comp' is not defined". Define or import them in an
# earlier cell so the notebook survives Restart Kernel -> Run All.
recipe_comp(i, sim, X)
recipe_info(i, X)
recipe_info(sim, X)
recipe_info(not_sim, X)

NameError: name 'recipe_comp' is not defined