Let's look at the latest database of Gamma Ray Bursts.

 - The database can be downloaded at https://user-web.icecube.wisc.edu/~grbweb_public/Summary_table.txt
 - You can find the physical meaning of each variable at https://user-web.icecube.wisc.edu/~grbweb_public/Variables.html 


This edition of "get your hands dirty" is very open-ended (we're getting closer and closer to real research here!). You have a cool dataset, explore it! Play with the data, apply some of the techniques we have seen in class so far, etc. **Be creative! You're discovering**


Some relevant physical questions you might want to tackle include:

- Does the distribution contain different sub-populations? How many?
- What's the threshold between the classes?
- If you try two clustering methods, do you get more or less the same?
- How do methods respond to outliers?
- Which variable(s) show the multi-modality most clearly?
- Are all GRBs equally likely to be observed? 

In [None]:
import requests
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

plt.rcParams['figure.figsize'] = [8, 8]
plt.rcParams['font.size'] = 12

# Download file
# Fail loudly on an HTTP error instead of saving an error page to disk, and
# bound the wait so a stalled connection cannot hang the notebook forever.
r = requests.get('https://user-web.icecube.wisc.edu/~grbweb_public/Summary_table.txt', timeout=60)
r.raise_for_status()
with open("Summary_table.txt", 'wb') as f:
    f.write(r.content)

# Read content
# Every field is kept as a string; `unpack=True` transposes the result so that
# raw_data[i] holds the i-th column of the table.
raw_data = np.loadtxt("Summary_table.txt", dtype='str', unpack=True)

# Read headers
# The column names are taken from the second line of the file.
with open("Summary_table.txt", 'r') as f:
    f.readline()
    header_line = f.readline()

# Strip the comment marker, split on runs of four spaces, and turn the spaces
# left inside a name into underscores; empty fragments are discarded.
header_fields = header_line.replace("#", "").replace("\n", "").lstrip().split('    ')
names = np.array([field.strip().replace(" ", "_") for field in header_fields if field.strip() != ''])

print(names)

In [None]:
import pandas as pd

# raw_data is transposed before being handed to pandas, and the labels parsed
# from the header line are used as column names.
raw_df = pd.DataFrame(data=raw_data.transpose(), columns=names)
raw_df.head()

In [None]:
def drop_errors(df, cols, sentinel='-999'):
    """Return a copy of *df* without the rows flagged as missing in *cols*.

    A row is removed when any of the listed columns equals *sentinel*.
    The input frame is left untouched, so the same raw table can be filtered
    independently for different analyses, and the returned frame has a fresh
    0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter.
    cols : str or iterable of str
        Column name(s) to inspect.
    sentinel : object, optional
        Value that marks a missing entry. Defaults to the string '-999'.

    Returns
    -------
    pandas.DataFrame
        A new, re-indexed frame containing only the rows without the sentinel.

    Raises
    ------
    KeyError
        If a name in *cols* is not a column of *df*.
    """
    cols = [cols] if isinstance(cols, str) else list(cols)
    # True for every row holding the sentinel in at least one inspected column.
    flagged = df[cols].eq(sentinel).any(axis=1)
    return df.loc[~flagged].reset_index(drop=True)

- Does the distribution contain different sub-populations? How many?
- What's the threshold between the classes?

In [None]:
#### T90 ANALYSIS

from scipy.stats import norm
from sklearn.mixture import GaussianMixture


def gauss(x, mu, rms):
    """Evaluate at *x* the normal pdf with mean *mu* and standard deviation *rms*."""
    return norm(mu, rms).pdf(x)


df = drop_errors(raw_df, ['T90'])

# The analysis is done on ln(T90). Non-positive durations have no logarithm
# and would feed -inf/nan into the mixture fit, so they are discarded first.
T90 = df['T90'].to_numpy(dtype=float)
T90 = np.log(T90[T90 > 0])

# Create the enlarged figure *before* drawing; calling plt.figure() after the
# plotting commands only opens a second, empty figure.
plt.figure(figsize=(20, 20))

tBins = np.linspace(-4, 7, 40)
_ = plt.hist(T90, bins=tBins, color='slateblue', density=True, alpha=0.5)
_ = plt.title('T90 distribution of GRBs')
_ = plt.xlabel('ln(T90)')

# Two-component Gaussian mixture fitted to the one-dimensional ln(T90) sample.
gm = GaussianMixture(n_components=2, random_state=5, covariance_type='spherical').fit(T90.reshape(-1, 1))

# One row per component: (weight, mean, standard deviation). The order in
# which the fit returns its components is arbitrary, so sort by mean to keep
# the colour assignment stable.
order = np.argsort(gm.means_.ravel())
params = np.column_stack((gm.weights_, gm.means_.ravel(), np.sqrt(gm.covariances_)))[order]

x = np.linspace(-6, 8, 150)
colors = ['navy', 'royalblue']
for (w, mu, rms), color in zip(params, colors):
    # Raw f-string: the backslashes are mathtext spacing, not Python escapes.
    tag = rf'$( \ \mu \ , \sigma \ )=( \ ${mu:.2f}$, \ ${rms:.2f}$ \ )$'
    # Each component is scaled by its weight so the curves add up to the
    # density-normalised histogram.
    plt.plot(x, w * gauss(x, mu, rms), c=color, label=tag, ls='--')
    plt.axvline(mu, c=color, ls=':')

_ = plt.legend(ncol=2, loc='center')

In [None]:
### FLUENCE ANALYSIS

# Discard the rows whose fluence is the '-999' placeholder.
df = drop_errors(raw_df, ['fluence'])
FLUENCE = df['fluence'].astype(float).to_numpy()

# 50 logarithmically spaced bin edges between 1e-9 and 1e-2, drawn on a
# logarithmic x axis.
fBins = np.logspace(start=-9, stop=-2, num=50)
plt.hist(FLUENCE, bins=fBins)
plt.xscale('log')

- If you try two clustering methods, do you get more or less the same?
- How do methods respond to outliers?
- Which variable(s) show the multi-modality most clearly?
- Are all GRBs equally likely to be observed?

In [None]:
# Rows with the '-999' placeholder in either column are discarded.
df = drop_errors(raw_df, ['T90', 'fluence'])

T90, FLUENCE = (df[column].astype(float).to_numpy() for column in ('T90', 'fluence'))

print('Scattering!')
plt.scatter(T90, FLUENCE, marker='.', s=1.5, color='royalblue')
# Both axes are logarithmic; the limits below are given in data units.
plt.xscale('log')
plt.yscale('log')
plt.ylim([10**-10, 10**-2])
plt.xlim([10**-2, 10**3.4])

In [None]:
from sklearn.cluster import KMeans
from sklearn import preprocessing

# Sanity check: print the first three (T90, fluence) pairs.
for t, f in zip(T90[:3], FLUENCE[:3]):
    print([t, f])

# Feature matrix with one row per burst and two columns (T90, fluence).
# It is already two-dimensional, so no extra axis is needed; the previous
# `np.newaxis(data, None)` call raised TypeError because np.newaxis is None,
# not a function.
data = np.column_stack((T90, FLUENCE))
clf = KMeans(n_clusters=2)
# TODO: the fit is still disabled; enable once the features are prepared.
#clf.fit(data)
# centers = clf.cluster_centers_
# print(centers)