Skip to content

Commit

Permalink
Modularize by creating 3 functions: get_data, preprocess_data, and compute_distance
Browse files Browse the repository at this point in the history
  • Loading branch information
csiu committed Apr 22, 2017
1 parent 1cd2c29 commit f176a6a
Showing 1 changed file with 71 additions and 32 deletions.
103 changes: 71 additions & 32 deletions src/python/sim_doc.py
Expand Up @@ -11,56 +11,95 @@
from sklearn.metrics import pairwise_distances


# Get data
dk = custom.DatabaseKick()
cur = dk.connect()

cur.execute("SELECT id, concat_ws(name, blurb) FROM info")
rows = cur.fetchall()
df = pd.DataFrame(rows, columns=["id", "document"])
def get_data():
    """
    Fetch every row of the `info` table as a dataframe.

    Output dataframe w/ 2 columns: "id", "document", where "document"
    is the `name` and `blurb` columns joined by a single space.
    """
    dk = custom.DatabaseKick()
    cur = dk.connect()
    try:
        # concat_ws() takes the separator as its FIRST argument. Pass an
        # explicit ' ' so that `name` ends up in the document text rather
        # than being consumed as the separator.
        cur.execute("SELECT id, concat_ws(' ', name, blurb) FROM info")
        rows = cur.fetchall()
    finally:
        # Release the connection even when the query fails
        dk.disconnect()

    return pd.DataFrame(rows, columns=["id", "document"])

text_processing = join_output(custom.text_processing)
def preprocess_data(df):
    """
    Preprocess 'document' of dataframe.

    The text work is delegated to custom.text_processing(method="stem"),
    which is expected to
    - to lowercase
    - remove nonletters
    - tokenize
    - remove stopwords
    - stem

    Side effects: df is modified in place. df['document'] is overwritten
    with its cleaned-up text and a 'doc_processed' column is added.

    Returns df['doc_processed'].
    """

    def doc_to_string(doc):
        r"""
        Replace None -> empty string, and
        text newlines (\n, \r) -> whitespace
        """
        if doc is None:
            return ""
        # Substitute a space (not ""), otherwise the words on either side
        # of a line break are glued together into a single token.
        return re.sub(r"[\n\r]+", " ", doc)

    def text_processing(text):
        """
        Join the list output of custom.text_processing to a string
        """
        return ' '.join(custom.text_processing(text, method="stem"))

    df['document'] = df['document'].apply(doc_to_string)
    df['doc_processed'] = df['document'].apply(text_processing)

    return df['doc_processed']

def compute_distance(U, i=0, sort=False, top_n=None, metric='euclidean'):
    """
    Compute distance of document U[i] with all documents in U

    Parameters:
    - U: 2-D array-like, one row per document
    - i: row index of the reference document
    - sort: if True, order the result by increasing distance
    - top_n: if not None, keep only the first top_n rows of the result
      (must be an int)
    - metric: metric name passed through to pairwise_distances

    Returns a dataframe with a single "dist" column. The index is the
    row position of each document in U, so it can be used to look the
    documents up again after sorting.

    Raises TypeError if top_n is neither None nor an int.
    """
    # Validate with an explicit raise: an `assert` disappears under -O
    if top_n is not None and (
            isinstance(top_n, bool) or not isinstance(top_n, int)):
        raise TypeError("top_n must be an int or None")

    # pairwise_distances needs 2-D input. Use a plain (1, n_features)
    # ndarray rather than np.matrix, which is discouraged by NumPy.
    document0 = np.asarray(U[i]).reshape(1, -1)

    # dist has shape (1, n_documents); its only row is the distance vector
    dist = pairwise_distances(document0, U, metric=metric)
    df_dist = pd.DataFrame({"dist": dist[0]})

    if sort:
        df_dist = df_dist.sort_values(by="dist")

    if top_n is not None:
        df_dist = df_dist.head(top_n)

    return df_dist


# Get and preprocess data
df = get_data()
# preprocess_data adds the 'doc_processed' column to df in place; the
# returned column is not needed here, hence the throwaway name.
_ = preprocess_data(df)

# Make count matrix
# (X is built from the preprocessed text, one row per row of df)
cv = CountVectorizer()
X = cv.fit_transform(df['doc_processed'])


# SVD
# Keep 100 components; U has one row per document and is what the
# distances below are computed on.
U, s, Vh = randomized_svd(X, n_components=100, n_iter=5, random_state=5)

# Compute distance and get top results
# The 5 rows of U closest to document 0, closest first. Document 0 is
# compared with itself as well, so it is part of the candidates.
top_n = compute_distance(U, i=0, sort=True, top_n=5)

# Distance
dist = pairwise_distances(np.asmatrix(U[0]), U, metric='euclidean')
df_dist = pd.DataFrame(np.transpose(dist), columns=["dist"])


# Get top results
top_n = df_dist.sort_values("dist").head()

# Print
results = []
counter = 0
for index, row in df.iloc[top_n.index].iterrows():
Expand Down

0 comments on commit f176a6a

Please sign in to comment.