diff --git a/bin/init_model.py b/bin/init_model.py index 69e7b9cde6..3e8c08d1b1 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -46,7 +46,10 @@ def _read_clusters(loc): cluster, word, freq = line.split() except ValueError: continue - clusters[word] = cluster + # If the clusterer has only seen the word a few times, its cluster is + # unreliable. + if int(freq) >= 3: + clusters[word] = cluster return clusters