In [None]:
'''
Pipeline for grid search of hyperparameters
'''


from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import umap
from sklearn import metrics
from sklearn.cluster import DBSCAN

# used to cache results
from tempfile import mkdtemp
from shutil import rmtree

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so prefer the standalone `joblib` package and only fall back to the
# vendored copy on very old scikit-learn installs.
try:
    from joblib import Memory
except ImportError:
    from sklearn.externals.joblib import Memory
# print(__doc__)

# Scratch directory that backs the pipeline's transformer cache; it is
# removed with `rmtree(cachedir)` once the grid search has finished.
cachedir = mkdtemp()

# The cache directory is passed positionally on purpose: the keyword was
# renamed from `cachedir` to `location` in joblib 0.12 (and `cachedir` was
# later removed), but it has always been the first positional parameter, so
# this form works with both old and new joblib releases.
memory = Memory(cachedir, verbose=10)



# Candidate values searched by the hyperparameter grid defined further down.

# Neighbourhood radii: used for DBSCAN's `eps` and for the HDBSCAN epsilon
# parameter.
EPS_OPTIONS = [
    0.25,
    0.5,
    0.75,
]

# Distance metrics tried with DBSCAN.
DB_DIST_OPTIONS = [
    'l1',
    'euclidean',
    'cosine',
]

# Distance metrics tried with HDBSCAN ('cosine' is deliberately not listed
# here -- presumably because HDBSCAN does not accept it; TODO confirm).
HDB_DIST_OPTIONS = [
    'l1',
    'euclidean',
]


# Text-clustering pipeline: token counts -> TF-IDF weighting -> UMAP
# dimensionality reduction -> clustering.
# NOTE(review): `CountVectorizer`, `TfidfTransformer` and `n` are not defined
# in this cell; they are expected to come from earlier notebook cells.
pipeline = Pipeline(
    steps=[
        # Keep terms that occur in at least 3 documents, minus English
        # stop words.
        ('vect', CountVectorizer(stop_words='english', min_df=3)),
        ('tfidf', TfidfTransformer()),
        # Project the TF-IDF vectors down to `n` dimensions using cosine
        # distance.
        ('reduce_dim', umap.UMAP(metric='cosine', n_components=n)),
        # Placeholder clusterer: the parameter grid swaps in the concrete
        # DBSCAN / HDBSCAN instances through the 'clf' key.
        ('clf', DBSCAN()),
    ],
    # Cache fitted transformers so grid points that share the same
    # upstream steps do not recompute them.
    memory=memory,
)


# Two alternative search spaces, one per clustering algorithm. Each dict
# replaces the pipeline's 'clf' step with a concrete estimator and then
# sweeps that estimator's radius and distance-metric parameters.
# NOTE(review): `HDBSCAN` is not imported in this cell; it is expected to be
# imported elsewhere in the notebook (e.g. `from hdbscan import HDBSCAN`).
param_grid = [
    {
        'clf': [DBSCAN(min_samples=100, n_jobs=-1)],
        'clf__eps': EPS_OPTIONS,
        'clf__metric': DB_DIST_OPTIONS,
    },
    {
        'clf': [HDBSCAN(min_cluster_size=100)],
        # The parameter is spelled `cluster_selection_epsilon`; the previous
        # misspelling ("epsilson") is rejected by `set_params`, which made
        # every HDBSCAN candidate fail.
        'clf__cluster_selection_epsilon': EPS_OPTIONS,
        'clf__metric': HDB_DIST_OPTIONS,
    },
]

def _clustering_ari_scorer(estimator, X, y):
    """Score a clustering pipeline by adjusted Rand index on the given fold.

    The built-in 'accuracy' scorer calls ``estimator.predict``, which neither
    DBSCAN nor HDBSCAN provides, and raw cluster ids are not comparable to
    class ids anyway. Instead, the pipeline is re-fit on the fold being
    scored with ``fit_predict`` and the resulting cluster assignment is
    compared to the true labels with a permutation-invariant measure.

    Args:
        estimator: the candidate pipeline handed over by ``GridSearchCV``.
        X: raw documents of the fold being scored.
        y: ground-truth labels for ``X``.

    Returns:
        The adjusted Rand index between ``y`` and the clusters found on ``X``
        (higher is better; noise points labelled -1 count as one group).
    """
    cluster_labels = estimator.fit_predict(X)
    return metrics.adjusted_rand_score(y, cluster_labels)


# Run the search, export the per-candidate results, and always remove the
# temporary cache directory -- even when fitting or exporting raises.
try:
    grid = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        scoring=_clustering_ari_scorer,
        cv=2,
        n_jobs=-1,
    )
    grid.fit(twenty_train.data, twenty_train.target)

    results_df = pd.DataFrame.from_dict(grid.cv_results_)
    results_df.to_csv('drive/My Drive/proj2_pipeline.csv')
    print(results_df)
finally:
    rmtree(cachedir)