From d55acf95d05e2fd65f0371fe788ae8beae45c3ef Mon Sep 17 00:00:00 2001
From: csiu
Date: Fri, 21 Apr 2017 22:44:13 -0700
Subject: [PATCH] add if __name__ == '__main__':

---
Note (not part of the commit message): this patch moves the module-level
demo script of sim_doc.py under an `if __name__ == '__main__':` guard.
The line structure and leading whitespace below were reconstructed from a
copy in which all newlines had been flattened; if the context does not
match exactly, apply with whitespace differences ignored.

 src/python/sim_doc.py | 49 ++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/src/python/sim_doc.py b/src/python/sim_doc.py
index 48ea217..c4b1c89 100644
--- a/src/python/sim_doc.py
+++ b/src/python/sim_doc.py
@@ -85,27 +85,28 @@ def compute_distance(U, i=0, sort=False, top_n=None, metric='euclidean'):
     return(df_dist)
 
 
-# Get and preprocess data
-df = get_data()
-_ = preprocess_data(df)
-
-# Make count matrix
-cv = CountVectorizer()
-X = cv.fit_transform(df['doc_processed'])
-
-# SVD
-U, s, Vh = randomized_svd(X, n_components=100, n_iter=5, random_state=5)
-
-# Compute distance and get top results
-top_n = compute_distance(U, i=0, sort=True, top_n=5)
-
-# Print
-results = []
-counter = 0
-for index, row in df.iloc[top_n.index].iterrows():
-    row["dist"] = top_n.iloc[counter]["dist"]
-    results.append(row)
-    counter += 1
-
-    print('>> %s | %s' % (row['id'], row['doc_processed']),
-          row['document'], "\n", sep="\n")
+if __name__ == '__main__':
+    # Get and preprocess data
+    df = get_data()
+    _ = preprocess_data(df)
+
+    # Make count matrix
+    cv = CountVectorizer()
+    X = cv.fit_transform(df['doc_processed'])
+
+    # SVD
+    U, s, Vh = randomized_svd(X, n_components=100, n_iter=5, random_state=5)
+
+    # Compute distance and get top results
+    top_n = compute_distance(U, i=0, sort=True, top_n=5)
+
+    # Print
+    results = []
+    counter = 0
+    for index, row in df.iloc[top_n.index].iterrows():
+        row["dist"] = top_n.iloc[counter]["dist"]
+        results.append(row)
+        counter += 1
+
+        print('>> %s | %s' % (row['id'], row['doc_processed']),
+              row['document'], "\n", sep="\n")