In [1]:
# imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

sns.set_theme()

In [5]:
class Project4():
    """ Reusable helpers for DAML project 4, collected in one class to keep the notebook clean.

    On construction the four process samples are read from .csv files and combined into a
    single shuffled dataframe. The methods plot the kinematic distributions and apply
    selection cuts to them.

    :param dibos_file: .csv file of diboson kinematics data.
    :type dibos_file: str
    :param gg_file: .csv file of the ggH1000 kinematics data.
    :type gg_file: str
    :param top_file: .csv file of the top quark kinematics data.
    :type top_file: str
    :param zjet_file: .csv file of the z-jet kinematics data.
    :type zjet_file: str
    """

    # Per-column settings for plot_overlay. 'bins' holds one bin count per process, in the
    # order (diboson, ggH1000, top, z-jets); 'xlims' is the x-axis range shown.
    _OVERLAY_INFO = {
        'lep1_pt': {'bins': [500, 50, 50, 10000], 'xlims': (-10**4, 7*10**5)},
        'lep2_pt': {'bins': [100, 50, 50, 500], 'xlims': (-10**4, 4*10**5)},
        'fatjet_pt': {'bins': [100, 50, 50, 100], 'xlims': (10**5, 1.2*10**6)},
        'fatjet_eta': {'bins': [25, 25, 25, 25], 'xlims': (-2.5, 2.5)},
        'fatjet_D2': {'bins': [250, 300, 150, 1000], 'xlims': (-1, 12)},
        'Zll_mass': {'bins': [200, 75, 200, 1000], 'xlims': (-10**4, 4*10**5)},
        'Zll_pt': {'bins': [500, 75, 75, 5000], 'xlims': (-10**4, 10**6)},
        'MET': {'bins': [1000, 50, 50, 10000], 'xlims': (-10**4, 0.7*10**6)},
        'reco_zv_mass': {'bins': [250, 75, 75, 1000], 'xlims': (0, 2*10**6)},
        'isSignal': {'bins': [20, 20, 20, 20], 'xlims': (-2, 2)},
        'FullEventWeight': {'bins': [1000, 25, 100, 10000], 'xlims': (-1.5, 2)},
    }

    # Exclusive (lower, upper) cut windows per column, chosen by eye from the overlay plots.
    # 'reco_zv_mass' deliberately has no entry, so it is never cut on.
    _TIGHT_CUTS = {
        'lep1_pt': (150000, 500000),
        'lep2_pt': (50000, 250000),
        'fatjet_pt': (0.3*10**6, 0.6*10**6),
        'fatjet_eta': (-1, 1),
        'fatjet_D2': (0, 2.2),
        'Zll_mass': (75000, 110000),
        'Zll_pt': (0.3*10**6, 0.8*10**6),
        'MET': (0, 100000),
    }
    _LOOSE_CUTS = {
        'lep1_pt': (100000, 500000),
        'lep2_pt': (25000, 250000),
        'fatjet_pt': (0.3*10**6, 0.6*10**6),
        'fatjet_eta': (-2, 2),
        'fatjet_D2': (0, 2.2),
        'Zll_mass': (60000, 110000),
        'Zll_pt': (0.2*10**6, 0.8*10**6),
        'MET': (0, 100000),
    }

    def __init__(self, dibos_file, gg_file, top_file, zjet_file):
        """ Imports data and compiles it into dataframes. Lists the variables of interest and their units.

        Sets ``self.dibos``, ``self.gg``, ``self.top`` and ``self.zjets`` (one dataframe per
        file, first csv column used as index) and ``self.all`` (all four concatenated,
        shuffled with a fixed seed, index reset to 0..N-1).
        """
        self.dibos = pd.read_csv(dibos_file, index_col=0)
        self.gg = pd.read_csv(gg_file, index_col=0)
        self.top = pd.read_csv(top_file, index_col=0)
        self.zjets = pd.read_csv(zjet_file, index_col=0)

        combined = pd.concat([self.dibos, self.gg, self.top, self.zjets], ignore_index=True)
        # fixed random_state keeps the shuffled order reproducible between runs
        self.all = shuffle(combined, random_state=42).reset_index(drop=True)

        # kinematic variables only
        self.var_cols = ['lep1_pt', 'lep2_pt', 'fatjet_pt', 'fatjet_eta', 'fatjet_D2', 'Zll_mass', 'Zll_pt', 'MET',
                         'reco_zv_mass']
        # kinematic variables plus the label and weight columns
        self.cool_cols = self.var_cols + ['isSignal', 'FullEventWeight']
        # x-axis labels, index-aligned with cool_cols (and therefore with var_cols, its prefix)
        self.xlabels = ['p_T [MeV/c]', 'p_T [MeV/c]', 'p_T [MeV/c]', 'Eta', 'D2', 'Mass [MeV/c^2]', 'p_T [MeV/c]',
                        'E_T [MeV]', 'Mass [MeV/c^2]', 'Signal', 'Event_Weight']

    def plot_dist(self, df, title='Kinematic Distributions', cool_cols=False):
        """ Plots distributions of a set of columns in the designated dataframe.

        One histogram per column is drawn on a grid three panels wide; panels are filled
        down each grid column first. Any panel left without data is hidden.

        :param df: Designates which dataframe's columns are being plotted.
        :type df: dataframe
        :param title: Sets the plot title. Default title is 'Kinematic Distributions'.
        :type title: str, optional
        :param cool_cols: Indicates which variables will be plotted: those contained in the variable column list
                          'var_cols' or the list of all columns of interest 'cool_cols'. Default is False, resulting
                          in 'var_cols' being used.
        :type cool_cols: bool, optional
        """
        columns = self.cool_cols if cool_cols else self.var_cols
        grid_width = 3
        grid_height = -(-len(columns) // grid_width)  # ceiling division: 9 -> 3 rows, 11 -> 4 rows

        figure, axes = plt.subplots(grid_height, grid_width, figsize=(10, 10), squeeze=False)
        figure.suptitle(title)

        # order='F' walks the grid column by column, so panels fill top-to-bottom first
        panels = axes.ravel(order='F')

        for ax, col, xlabel in zip(panels, columns, self.xlabels):
            ax.set_title(col)
            ax.hist(df[col])
            ax.set_ylabel('Counts')
            ax.set_xlabel(xlabel)

        # 11 columns on a 4x3 grid leave one spare panel; hide it rather than index past the list
        for ax in panels[len(columns):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()

    def plot_overlay(self, cool_cols=False):
        """ Plots, for each column, one figure overlaying that column's distribution for all four processes.

        Each process is drawn as a density-normalised step histogram using the per-process
        bin counts and x-axis limits in ``_OVERLAY_INFO``. The data always comes from this
        instance's own ``dibos``, ``gg``, ``top`` and ``zjets`` dataframes.

        :param cool_cols: Indicates which variables will be plotted: those contained in the variable column list
                          'var_cols' or the list of all columns of interest 'cool_cols'. Default is False, resulting
                          in 'var_cols' being used.
        :type cool_cols: bool, optional
        """
        columns = self.cool_cols if cool_cols else self.var_cols
        # order must match the per-process order of the 'bins' lists in _OVERLAY_INFO
        samples = [('Diboson', self.dibos), ('ggH1000', self.gg), ('Top', self.top), ('Z-jets', self.zjets)]

        for col, xlabel in zip(columns, self.xlabels):
            settings = self._OVERLAY_INFO[col]
            figure, ax = plt.subplots()
            ax.set_title(col)
            # NOTE(review): histograms below are density-normalised, so 'Counts' is not literally
            # what is shown; label kept as-is pending confirmation of the intended wording.
            ax.set_ylabel('Counts')
            ax.set_xlabel(xlabel)
            ax.set_xlim(settings['xlims'])

            for (label, frame), bin_count in zip(samples, settings['bins']):
                ax.hist(frame[col], bins=bin_count, density=True, histtype='step', stacked=True, label=label)

            ax.legend()

            plt.show()

    def plot_kinematics(self, cool_cols=False):
        """ Plots the kinematic distributions for all variables of interest. Plots both the individual
            kinematic distributions in a subplot for each process and the overlay plots for all columns
            of interest

        :param cool_cols: Determines which set of variables to plot. Default is False, which plots the
                          set of variables in var_cols.
        :type cool_cols: bool, optional
        """
        titled_processes = [('Diboson Data Kinematic Distributions', self.dibos),
                            ('ggH1000 Data Kinematic Distributions', self.gg),
                            ('Top Data Kinematic Distributions', self.top),
                            ('Z-Jet Data Kinematic Distributions', self.zjets)]

        print('Kinematic Distribution Plots by Process:')
        for title, process in titled_processes:
            self.plot_dist(process, title=title, cool_cols=cool_cols)

        print('Overlay Plots:')
        self.plot_overlay(cool_cols=cool_cols)

    def apply_cuts(self, df='Default', tight_cut=True):
        """ Applies cuts to a dataset over the kinematic variables and keeps only rows inside every cut window.

            The cut ranges are determined based on the plots from plot_overlay(). Bounds are
            exclusive: a row survives only if lower < value < upper for every column that has
            a cut. 'reco_zv_mass' has no cut and is left untouched. The input dataframe is not
            modified.

        :param df: Dataframe to cut. The default 'Default' (or None) selects the full shuffled
                   dataset ``self.all``.
        :type df: dataframe, optional
        :param tight_cut: Truthy selects the tight cut windows, falsy selects the loose ones.
                          Default is True.
        :type tight_cut: bool, optional
        :return: The rows of ``df`` that pass all cuts. The same dataframe is also stored on
                 ``self.cut_df``.
        :rtype: dataframe
        """
        # compare by value, and only for strings: `df == 'Default'` on a dataframe is elementwise
        if df is None or (isinstance(df, str) and df == 'Default'):
            df = self.all

        # truthiness (not `is True`/`is False`) so numpy bools and 0/1 also select a cut set
        cuts = self._TIGHT_CUTS if tight_cut else self._LOOSE_CUTS

        cut_data = df

        for col in self.var_cols:
            bounds = cuts.get(col)
            if bounds is None:
                # no cut window defined for this column (e.g. 'reco_zv_mass')
                continue
            lower, upper = bounds
            values = cut_data[col]
            cut_data = cut_data[(values > lower) & (values < upper)]

        self.cut_df = cut_data
        return cut_data

    def function(self, var):
        """ Template placeholder for new methods; currently just prints its argument.

        :param var: Value to print.
        :type var: str
        """
        print(var)