# Load data from Vizier

#### Index<a name="index"></a>
1. [Import packages](#imports)
2. [Load raw data](#loadData)
3. [Create examples folder](#createFolder)
4. [Create and save data](#createData)
    1. [True clusters](#createTrue)
    1. [Non clusters](#createFake)

## 1. Import packages<a name="imports"></a>

In [None]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from astroquery.vizier import Vizier

#### Aesthetic settings

In [None]:
%matplotlib inline

sns.set(font_scale=1.3, style="ticks")

## 2. Load raw data<a name="loadData"></a>

In [None]:
# Raise the row cap on the Vizier class so that up to 3,000,000 rows are
# requested per table.
Vizier.ROW_LIMIT = 3_000_000

# VizieR identifier of the table to download.
members_catalog_id = 'J/A+A/618/A93/members'
catalogs = Vizier.get_catalogs(members_catalog_id)
print(catalogs)  # astroquery.utils.commons.TableList

Since `catalogs` contains only one table, we are only interested in `catalogs[0]`. We convert it to a `pandas` DataFrame for ease of handling.

In [None]:
# Convert the first (and only) table to a pandas DataFrame.
catalog = catalogs[0].to_pandas()

# String columns may come back as raw `bytes`; decode them to `str`.
# Only object-dtype columns can hold bytes, so numeric columns are left
# untouched. This is cheaper than an element-wise pass over the whole frame
# and avoids `DataFrame.applymap`, which is deprecated since pandas 2.1.
for column in catalog.select_dtypes(include='object').columns:
    catalog[column] = catalog[column].map(
        lambda value: value.decode() if isinstance(value, bytes) else value
    )

catalog.head()

## 3. Create examples folder<a name="createFolder"></a>

In [None]:
# Output folders for the example images: 'examples' for true clusters and
# 'neg_examples' for synthetic non-clusters.
# `exist_ok=True` makes the call idempotent and removes the race between
# checking for the folder and creating it.
for folder in ('examples', 'neg_examples'):
    os.makedirs(folder, exist_ok=True)

## 4. Create and save data<a name="createData"></a>

First, choose whether the scatter plots and histograms that are created should be displayed in the notebook.

In [None]:
# Display toggles for the figures generated in section 4.
plot_hist = False     # show each 2D histogram image
plot_scatter = True   # show each cluster's colour-magnitude scatter plot

### 4.1. True clusters<a name="createTrue"></a>

In [None]:
# Build one flattened 2D colour-magnitude histogram per cluster in the catalogue.
n_bins = 20             # bins per axis; each sample has n_bins * n_bins values
n_examples = 10         # number of leading clusters eligible for example images
save_examples = False   # set to True to write those images to 'examples/'

clusters = np.unique(catalog['Cluster'])
number_clusters = len(clusters)

# Group the rows once instead of re-scanning the whole catalogue with a
# boolean mask for every cluster.
members_by_cluster = catalog.groupby('Cluster')

true_clusters_2d_hist = []
for i, cluster in enumerate(clusters):
    # Remove stars for which Gaia didn't measure magnitude or colour
    data_cluster = members_by_cluster.get_group(cluster).dropna(
        subset=['Gmag', 'BP-RP'])

    gmag = data_cluster['Gmag']
    bp_rp = data_cluster['BP-RP']

    if plot_scatter:
        plt.scatter(bp_rp, gmag, marker='.')
        plt.xlabel(r'$B_p - R_p$ (mag)')
        plt.ylabel(r'$G$ (mag)')
        plt.title(f'Cluster {cluster}')
        plt.ylim([18, 8])  # inverted axis: smaller magnitudes at the top
        plt.show()

    # No `range` is passed, so the bin edges span each cluster's own
    # min/max in colour and magnitude.
    hist, xedges, yedges = np.histogram2d(bp_rp, gmag, bins=n_bins,
                                          density=True)

    # Only render the histogram image when it will actually be shown or saved.
    save_this_example = save_examples and i < n_examples
    if plot_hist or save_this_example:
        plt.imshow(hist, interpolation='nearest')
        plt.title(f'Cluster {cluster}')
        if save_this_example:
            plt.savefig(f'examples/{cluster}.jpg', bbox_inches='tight')
        if plot_hist:
            plt.show()
        plt.close()

    # Flatten the (n_bins, n_bins) grid into a single feature vector.
    true_clusters_2d_hist.append(hist.ravel())

# The samples are written to this file in the next cell.
true_clusters_2d_hist_file = f'true_clusters_2d_hist_{number_clusters}.npy'

In [None]:
# Persist the positive samples as a single .npy array.
np.save(true_clusters_2d_hist_file, np.asarray(true_clusters_2d_hist))

Confirm that the file was saved correctly by loading it back. If the output of the cell is `False`, the save went wrong.

In [None]:
# Read the array back from disk and compare it with the in-memory samples.
true_clusters_2d_hist_saved = np.load(
    true_clusters_2d_hist_file,
    allow_pickle=True,
)
np.allclose(true_clusters_2d_hist_saved, true_clusters_2d_hist)

In [None]:
# Report how many positive (true cluster) samples were produced.
number_positive_samples = len(true_clusters_2d_hist)
print('Number of positive samples: ', number_positive_samples)

### 4.2. Non clusters<a name="createFake"></a>

In [None]:
# Accumulator for the flattened 2D histograms of the synthetic non-clusters;
# filled by the loop in the next cell.
non_clusters_2d_hist: list = []

In [None]:
# Build as many synthetic "non cluster" samples as there are true clusters by
# drawing stars uniformly at random in the colour-magnitude plane.
n_bins = 20       # bins per axis; each sample has n_bins * n_bins values
n_examples = 10   # number of leading samples saved as example images

# Start from an empty list so that re-running this cell does not append
# duplicates to the results of a previous run.
non_clusters_2d_hist = []

for i in range(number_clusters):
    # Random star count between 30 and 500 (float draw truncated to int).
    number_stars = int(random.uniform(30, 500))
    gmag = np.random.uniform(8, 18, size=number_stars)
    bp_rp = np.random.uniform(0, 5, size=number_stars)

    hist, xedges, yedges = np.histogram2d(bp_rp, gmag, bins=n_bins,
                                          density=True)

    # Only render the histogram image when it will actually be shown or saved.
    save_this_example = i < n_examples
    if plot_hist or save_this_example:
        plt.imshow(hist, interpolation='nearest')
        plt.title(f'Non Cluster {i}')
        if save_this_example:
            plt.savefig(f'neg_examples/unif_non_cluster_{i}.jpg',
                        bbox_inches='tight')
        if plot_hist:
            plt.show()
        plt.close()

    # Flatten the (n_bins, n_bins) grid into a single feature vector.
    non_clusters_2d_hist.append(hist.ravel())

non_clusters_2d_hist_file = f'unif_non_clusters_2d_hist_{number_clusters}.npy'
# Write the samples to disk; the verification cell below loads this file.
np.save(non_clusters_2d_hist_file, non_clusters_2d_hist)

Confirm that the file was saved correctly by loading it back. If the output of the cell is `False`, the save went wrong.

In [None]:
# Read the array back from disk and compare it with the in-memory samples.
non_clusters_2d_hist_saved = np.load(
    non_clusters_2d_hist_file,
    allow_pickle=True,
)
np.allclose(non_clusters_2d_hist_saved, non_clusters_2d_hist)

Go to [Index](#index)