/
sim_doc.py
111 lines (86 loc) · 2.72 KB
/
sim_doc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import sys
sys.path.append("/Users/csiu/repo/kick/src/python")
import custom
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils.extmath import randomized_svd
from sklearn.metrics import pairwise_distances
def get_data():
    """
    Fetch project rows from the database.

    Returns:
        pd.DataFrame with 2 columns: "id", "document", where "document"
        is the project's name and blurb joined by a single space.
    """
    dk = custom.DatabaseKick()
    cur = dk.connect()
    try:
        # BUG FIX: concat_ws's FIRST argument is the separator
        # (PostgreSQL: concat_ws(sep, str, ...)); the original
        # concat_ws(name, blurb) used `name` as the separator.
        cur.execute("SELECT id, concat_ws(' ', name, blurb) FROM info")
        rows = cur.fetchall()
    finally:
        # Release the connection even if the query raises.
        dk.disconnect()
    return pd.DataFrame(rows, columns=["id", "document"])
def preprocess_data(df):
    """
    Preprocess 'document' of dataframe by
    - to lowercase
    - remove nonletters
    - tokenize
    - remove stopwords
    - stem
    (the actual text work is delegated to custom.text_processing)

    Dataframe will contain an additional 'doc_processed' column,
    and df['doc_processed'] is returned.
    """
    def join_output(func):
        """Decorator: join a function's list output into one space-separated string."""
        def func_wrapper(text, *args, **kwargs):
            return ' '.join(func(text, *args, **kwargs))
        return func_wrapper

    def doc_to_string(doc):
        """
        Replace None -> empty string, and
        text newlines (\n, \r) -> a single space.
        """
        if doc is None:  # `is None`, not `== None` (PEP 8)
            return ""
        # BUG FIX: replace newlines with a space as documented, instead of
        # deleting them — deletion glued the last word of one line to the
        # first word of the next, corrupting tokenization.
        return re.sub(r"[\n\r]", " ", doc)

    df['document'] = df['document'].apply(doc_to_string)
    text_processing = join_output(custom.text_processing)
    df['doc_processed'] = df['document'].apply(
        lambda x: text_processing(x, method="stem"))
    return df['doc_processed']
def compute_distance(U, i=0, sort=False, top_n=None, metric='euclidean'):
    """
    Compute distance of document U[i] with all documents in U.

    Args:
        U: 2D array-like, one row per document.
        i: row index of the reference document.
        sort: if True, sort rows by ascending distance.
        top_n: if given (int), keep only the first top_n rows
            (combine with sort=True to get the nearest documents).
        metric: any metric name accepted by sklearn's pairwise_distances.

    Returns:
        pd.DataFrame with a single "dist" column; the index still refers
        to row positions in U.

    Raises:
        TypeError: if top_n is given but is not an int.
    """
    # reshape instead of np.asmatrix — asmatrix is deprecated and
    # removed in NumPy 2.0.
    document0 = np.asarray(U[i]).reshape(1, -1)
    dist = pairwise_distances(document0, U, metric=metric)
    # dist has shape (1, n_documents); flatten it into one column.
    df_dist = pd.DataFrame(dist.ravel(), columns=["dist"])
    if sort:
        df_dist.sort_values(by="dist", inplace=True)
    if top_n is not None:
        # raise instead of assert so the check survives `python -O`
        if not isinstance(top_n, int):
            raise TypeError("top_n must be an int")
        df_dist = df_dist.head(top_n)
    return df_dist
# Get and preprocess data
df = get_data()
_ = preprocess_data(df)

# Make count matrix (bag-of-words over the stemmed documents)
cv = CountVectorizer()
X = cv.fit_transform(df['doc_processed'])

# SVD: U holds each document's coordinates in the reduced latent space
U, s, Vh = randomized_svd(X, n_components=100, n_iter=5, random_state=5)

# Distances from document 0; keep the 5 nearest (sorted ascending)
top_n = compute_distance(U, i=0, sort=True, top_n=5)

# Print the nearest documents along with their distances.
# zip pairs each distance with its matching row positionally,
# replacing the original hand-maintained `counter` index.
results = []
for dist, (index, row) in zip(top_n["dist"], df.iloc[top_n.index].iterrows()):
    row["dist"] = dist
    results.append(row)
    print('>> %s | %s' % (row['id'], row['doc_processed']),
          row['document'], "\n", sep="\n")