diff --git a/src/python/sim_doc.py b/src/python/sim_doc.py
index 40fefcd..48ea217 100644
--- a/src/python/sim_doc.py
+++ b/src/python/sim_doc.py
@@ -11,56 +11,95 @@
 from sklearn.metrics import pairwise_distances
 
 
-# Get data
-dk = custom.DatabaseKick()
-cur = dk.connect()
-
-cur.execute("SELECT id, concat_ws(name, blurb) FROM info")
-rows = cur.fetchall()
-df = pd.DataFrame(rows, columns=["id", "document"])
+def get_data():
+    """Return a DataFrame with columns "id" and "document" (name and blurb
+    joined by a space), one row per row of the `info` table."""
+    dk = custom.DatabaseKick()
+    cur = dk.connect()
 
-dk.disconnect()
+    try:
+        # concat_ws(sep, ...): the separator comes first, hence the ' '.
+        cur.execute("SELECT id, concat_ws(' ', name, blurb) FROM info")
+        rows = cur.fetchall()
+    finally:
+        dk.disconnect()  # always disconnect, even if the query fails
 
+    return pd.DataFrame(rows, columns=["id", "document"])
 
-# Preprocess text
-def join_output(func):
-    def func_wrapper(text, *arg, **karg):
-        return ' '.join(func(text, *arg, **karg))
-    return func_wrapper
 
-text_processing = join_output(custom.text_processing)
+def preprocess_data(df):
+    """
+    Clean df['document'] and derive a stemmed 'doc_processed' column.
+
+    Each document is normalised (None -> "", line breaks -> one space) and
+    passed to custom.text_processing(method="stem"), which is expected to
+      - lowercase
+      - remove nonletters
+      - tokenize
+      - remove stopwords
+      - stem
+    and whose list output is joined back into a space-separated string.
+
+    df is modified in place ('document' is overwritten, 'doc_processed' is
+    added) and df['doc_processed'] is returned.
+    """
 
-def doc_to_string(doc):
+    def join_output(func):
+        """Decorator: join the list returned by func into one string."""
+        def func_wrapper(text, *arg, **karg):
+            return ' '.join(func(text, *arg, **karg))
+        return func_wrapper
+
+    def doc_to_string(doc):
+        """Map None to "" and turn runs of CR/LF into a single space."""
+        if doc is None:
+            return ""
+        # Substitute a space (not ""), otherwise the words on either side
+        # of a line break would be fused into one token.
+        return re.sub(r"[\r\n]+", " ", doc)
+
+    df['document'] = df['document'].apply(doc_to_string)
+
+    text_processing = join_output(custom.text_processing)
+    df['doc_processed'] = df['document'].apply(
+        lambda x: text_processing(x, method="stem"))
+
+    return df['doc_processed']
+
+def compute_distance(U, i=0, sort=False, top_n=None, metric='euclidean'):
     """
-    Replace None -> empty string, and
-    text newlines (\n, \r) -> whitespace
+    Distances from U[i] to every row of U as a DataFrame with one "dist" column,
+    indexed by row number of U; sort=True sorts ascending, top_n=k keeps k rows.
     """
-    if doc == None:
-        return("")
-    else:
-        return(re.sub("[\n\r]", "", doc))
+    # np.atleast_2d, not np.asmatrix: recent scikit-learn rejects np.matrix
+    document0 = np.atleast_2d(U[i])
+    dist = pairwise_distances(document0, U, metric=metric)
+    df_dist = pd.DataFrame(np.transpose(dist), columns=["dist"])
+    if sort:
+        df_dist.sort_values(by="dist", inplace=True)
 
-df['document'] = df['document'].apply(lambda x: doc_to_string(x))
-df['doc_processed'] = df['document'].apply(lambda x: text_processing(x, method="stem"))
+    if top_n is not None:
+        assert isinstance(top_n, int), "top_n must be an int"
+        df_dist = df_dist.head(top_n)
 
+    return df_dist
+
+
+# Load the documents, then preprocess them
+df = get_data()
+_ =  preprocess_data(df)  # return value unused; adds df['doc_processed'] in place
 
 # Make count matrix
 cv = CountVectorizer()
 X = cv.fit_transform(df['doc_processed'])
 
-
 # SVD
 U, s, Vh = randomized_svd(X, n_components=100, n_iter=5, random_state=5)
 
+# Distances from document 0 to all documents, ascending; keep the 5 smallest
+top_n = compute_distance(U, i=0, sort=True, top_n=5)
 
-# Distance
-dist = pairwise_distances(np.asmatrix(U[0]), U, metric='euclidean')
-df_dist = pd.DataFrame(np.transpose(dist), columns=["dist"])
-
-
-# Get top results
-top_n = df_dist.sort_values("dist").head()
-
+# Print
 results = []
 counter = 0
 for index, row in df.iloc[top_n.index].iterrows():