From 693c5a155887edc3e1f7f02bbe908c6b29286c1a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 17 Apr 2015 04:44:52 +0200
Subject: [PATCH] * Exclude clusterings for words only seen 1 or 2 times, as their clusters are unreliable

---
 bin/init_model.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/bin/init_model.py b/bin/init_model.py
index 69e7b9cde6..3e8c08d1b1 100644
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -46,7 +46,10 @@ def _read_clusters(loc):
             cluster, word, freq = line.split()
         except ValueError:
             continue
-        clusters[word] = cluster
+        # If the clusterer has only seen the word a few times, its cluster is
+        # unreliable.
+        if int(freq) >= 3:
+            clusters[word] = cluster
     return clusters
 
 