## Pandas Data Testing Using the MiniBooNE Dataset
<a href="https://pbs.twimg.com/media/C4f3GF6WAAAY1Ok.jpg:large">Pandas cheat sheet for reference</a>
<img src="https://pbs.twimg.com/media/C4f3GF6WAAAY1Ok.jpg:large" >

<br>
<br>
What follows is an example of using pandas with the MiniBooNE particle identification (PID) dataset from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/MiniBooNE+particle+identification


In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.pylab as pylab
import seaborn as sns
# Seaborn's 'ticks' theme is applied first, before the matplotlib style sheet.
sns.set(style='ticks')

# Presentation look: the 'bmh' style sheet, then larger text everywhere.
plt.style.use(['bmh'])

# Legend, axis labels, titles and tick labels all share the same font size.
params = dict.fromkeys(
    ('legend.fontsize',
     'axes.labelsize',
     'axes.titlesize',
     'xtick.labelsize',
     'ytick.labelsize'),
    'x-large',
)
pylab.rcParams.update(params)

#The first line of the text file is a header holding two counts:
#the number of signal events followed by the number of background events
mb_sigback = pd.read_table(
    "../Datasets/MiniBooNE_PID.txt",
    delimiter=r"\s+",
    header=None,
    nrows=1,
)
#Single header row, so both counts sit in row 0 (positional lookup)
nsig = mb_sigback.iat[0, 0]
nback = mb_sigback.iat[0, 1]

#Load a sample of the signal and background into separate dataframes.
#The skiprows offsets assume the layout: 1 header line, then nsig signal
#rows, then the background rows.
#Each sample is capped at the count from the header so the signal read can
#never run past the signal section and pick up background rows.
N_ROWS_PER_CLASS = 1000
mb_sig = pd.read_table("../Datasets/MiniBooNE_PID.txt", delimiter=r"\s+",
                       nrows=int(min(N_ROWS_PER_CLASS, nsig)),
                       skiprows=1, header=None)
mb_back = pd.read_table("../Datasets/MiniBooNE_PID.txt", delimiter=r"\s+",
                        nrows=int(min(N_ROWS_PER_CLASS, nback)),
                        skiprows=1 + int(nsig), header=None)

#Add labels column to mb_sig and mb_back (0 = signal, 1 = background).
#A scalar is broadcast to every row, so no helper Series is needed.
mb_sig['sigback'] = 0
mb_back['sigback'] = 1

#Merge the dataframes into one. Both inputs are indexed from 0, so
#ignore_index=True is needed to give the merged frame unique row labels.
mb_all = pd.concat([mb_sig, mb_back], ignore_index=True)

#Clean default values from the dataset: drop every row that holds the
#placeholder -999.00 in any column. A single vectorized mask replaces the
#per-column loop; reset_index returns a fresh frame with unique row labels,
#so the column assignment below does not write into a slice of another frame.
keep_rows = mb_all.ne(-999.00).all(axis=1)
mb_all = mb_all[keep_rows].reset_index(drop=True)

#Rescale every feature column to the range 0 to 1 (min-max scaling).
#The feature list is derived from the frame instead of a hard-coded count;
#the 'sigback' label column is excluded so the 0/1 class codes stay intact.
norm_cols = [col for col in mb_all.columns if col != 'sigback']
col_min = mb_all[norm_cols].min()
col_range = mb_all[norm_cols].max() - col_min
#A constant column has zero range; divide by 1 instead so it maps to 0
#rather than NaN (0/0), which would otherwise propagate into the plots.
col_range = col_range.replace(0, 1)
mb_all[norm_cols] = (mb_all[norm_cols] - col_min) / col_range

In [None]:
# Report the event counts read from the file header.
for label, count in (("Total signal entries:", nsig),
                     ("Total background entries:", nback)):
    print(label, count)

In [None]:
#Summary statistics for the signal sample as loaded: the -999.00 cleaning and
#the rescaling are applied to mb_all only, and the 'sigback' label column is included.
mb_sig.describe()

In [None]:
#Summary statistics for the background sample as loaded: the -999.00 cleaning and
#the rescaling are applied to mb_all only, and the 'sigback' label column is included.
mb_back.describe()

In [None]:
#Summary statistics for the merged frame after the -999.00 rows were removed
#and the feature columns were rescaled.
mb_all.describe()

In [None]:
#Overlay signal and background histograms for the first 8 features.
#The class selection does not depend on the feature, so split once up front
#instead of re-filtering mb_all twice per loop iteration.
sig_rows = mb_all[mb_all.sigback == 0]
back_rows = mb_all[mb_all.sigback == 1]

for col in range(8):
    fig, ax = plt.subplots()
    #Extract signal and background values for this feature
    sig = sig_rows[col]
    back = back_rows[col]
    #Shared bin edges computed over both classes, so the two histograms
    #use identical binning and can be compared directly
    bins = np.histogram_bin_edges(np.hstack((sig, back)), bins=40)
    ax.hist(sig, label='Signal', alpha=0.5, bins=bins)
    ax.hist(back, label='Background', alpha=0.5, bins=bins)
    ax.set_xlabel('Feature %d' % col)
    ax.set_ylabel('Entries')
    ax.legend()
    plt.show()

In [None]:
#Joint KDE plot of features 0 and 1 over all events in mb_all
#(signal and background are pooled, since no hue is given).
#The trailing semicolon keeps the returned object out of the cell output.
sns.jointplot(
    data=mb_all,
    x=0,
    y=1,
    kind="kde",
);

In [None]:
#Pairwise 2d correlation grid (seaborn) for a handful of features,
#colored by the signal/background label
pair_features = [0, 1, 3, 24]
g = sns.pairplot(
    mb_all,
    vars=pair_features,
    hue="sigback",
    height=2.5,
    plot_kws={"s": 10},
)