Skip to content

Commit

Permalink
Make it possible to use TF-IDF term weighting
Browse files Browse the repository at this point in the history
  • Loading branch information
csiu committed Apr 30, 2017
1 parent 30ee2cf commit 5210ebe
Showing 1 changed file with 14 additions and 4 deletions.
18 changes: 14 additions & 4 deletions src/python/sim_doc.py
Expand Up @@ -9,6 +9,7 @@
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import randomized_svd
from sklearn.metrics import pairwise_distances

Expand All @@ -25,6 +26,10 @@ def get_args():
parser.add_argument('-n', '--num_results', default=None, type=int,
help="Number of similar documents to print in the results")

parser.add_argument('-w', '--term_weight', default="tfidf",
choices=["tfidf", "raw"],
help="How should terms in document be weighted? 'tfidf' or 'raw' counts")

parser.add_argument('-i', '--document0_id', default=None, type=int,
help="Kickstarter ID of query document")

Expand Down Expand Up @@ -123,6 +128,7 @@ def compute_distance(U, i=None, sort=False, top_n=None, metric='euclidean'):
num_results = args.num_results
cache_dir = args.cache_dir
verbose = args.verbose
term_weight = args.term_weight

preprocess_file = os.path.join(os.path.abspath(cache_dir),
"preprocessed.pkl")
Expand All @@ -139,10 +145,14 @@ def compute_distance(U, i=None, sort=False, top_n=None, metric='euclidean'):

df.to_pickle(preprocess_file)


if verbose: print("# Making count matrix...")
cv = CountVectorizer()
X = cv.fit_transform(df['doc_processed'])
if term_weight == "raw":
if verbose: print("# Making count matrix...")
cv = CountVectorizer()
X = cv.fit_transform(df['doc_processed'])
else:
if verbose: print("# Making TF-IDF matrix...")
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['doc_processed'])

if verbose: print("# Computing SVD for %s singular values..." %
num_singular_values)
Expand Down

0 comments on commit 5210ebe

Please sign in to comment.