diff --git a/src/python/sim_doc.py b/src/python/sim_doc.py index 40fefcd..48ea217 100644 --- a/src/python/sim_doc.py +++ b/src/python/sim_doc.py @@ -11,56 +11,95 @@ from sklearn.metrics import pairwise_distances -# Get data -dk = custom.DatabaseKick() -cur = dk.connect() - -cur.execute("SELECT id, concat_ws(name, blurb) FROM info") -rows = cur.fetchall() -df = pd.DataFrame(rows, columns=["id", "document"]) +def get_data(): + """ + Output dataframe w/ 2 columns: "id", "document" + """ + # Get data + dk = custom.DatabaseKick() + cur = dk.connect() -dk.disconnect() + cur.execute("SELECT id, concat_ws(name, blurb) FROM info") + rows = cur.fetchall() + df = pd.DataFrame(rows, columns=["id", "document"]) + dk.disconnect() -# Preprocess text -def join_output(func): - def func_wrapper(text, *arg, **karg): - return ' '.join(func(text, *arg, **karg)) - return func_wrapper + return(df) -text_processing = join_output(custom.text_processing) +def preprocess_data(df): + """ + Preprocess 'document' of dataframe by + - to lowercase + - remove nonletters + - tokenize + - remove stopwords + - stem + Dataframe will contain additional 'doc_processed' column + and df['doc_processed'] will be returned + """ -def doc_to_string(doc): + def join_output(func): + """ + Decorator function to join list output to string + """ + def func_wrapper(text, *arg, **karg): + return ' '.join(func(text, *arg, **karg)) + return func_wrapper + + def doc_to_string(doc): + """ + Replace None -> empty string, and + text newlines (\n, \r) -> whitespace + """ + if doc == None: + return("") + else: + return(re.sub("[\n\r]", "", doc)) + + df['document'] = df['document'].apply( + lambda x: doc_to_string(x)) + + text_processing = join_output(custom.text_processing) + df['doc_processed'] = df['document'].apply( + lambda x: text_processing(x, method="stem")) + + return(df['doc_processed']) + +def compute_distance(U, i=0, sort=False, top_n=None, metric='euclidean'): """ - Replace None -> empty string, and - text newlines (\n, \r) -> whitespace + Compute distance of document U[i] with all documents in U """ - if doc == None: - return("") - else: - return(re.sub("[\n\r]", "", doc)) + document0 = np.asmatrix(U[i]) + + dist = pairwise_distances(document0, U, metric=metric) + df_dist = pd.DataFrame(np.transpose(dist), columns=["dist"]) + + if sort: + df_dist.sort_values(by="dist", inplace=True) -df['document'] = df['document'].apply(lambda x: doc_to_string(x)) -df['doc_processed'] = df['document'].apply(lambda x: text_processing(x, method="stem")) + if top_n != None: + assert type(top_n) is int + df_dist = df_dist.head(top_n) + return(df_dist) + + +# Get and preprocess data +df = get_data() +_ = preprocess_data(df) # Make count matrix cv = CountVectorizer() X = cv.fit_transform(df['doc_processed']) - # SVD U, s, Vh = randomized_svd(X, n_components=100, n_iter=5, random_state=5) +# Compute distance and get top results +top_n = compute_distance(U, i=0, sort=True, top_n=5) -# Distance -dist = pairwise_distances(np.asmatrix(U[0]), U, metric='euclidean') -df_dist = pd.DataFrame(np.transpose(dist), columns=["dist"]) - - -# Get top results -top_n = df_dist.sort_values("dist").head() - +# Print results = [] counter = 0 for index, row in df.iloc[top_n.index].iterrows():