# Training of classifier with TMVA

## Import the necessary libraries

In [1]:
import ROOT

Welcome to JupyROOT 6.15/02


In [2]:
from ROOT import gSystem, gROOT, gApplication, TFile, TTree, TCut, TCanvas, TGraph

In [3]:
from ROOT import TMVA
%jsroot on

### Define training variables

In [4]:
# Names of the tree branches used as classifier inputs. They are registered
# with the TMVA data loader in exactly this order.
InputVar = [
    "max_eta",
    "lep_Pt_1",
    "Mll01",
    "minDeltaR_LJ_0",
    "minDeltaR_LJ_1",
    "MET_RefFinal_et",
    "nJets_OR_T",
    "nJets_OR_T_MV2c10_70",
    "lep_flavour",
]

### Create instance of TMVA factory
See TMVA/macros/TMVAClassification.C for more factory options.

Example of an option string for the factory:
`"!V:ROC:!Correlations:!Silent:Color:!DrawProgressBar:AnalysisType=Classification"`



In [5]:
# Instantiate the TMVA Tools object before the factory is created.
TMVA.Tools.Instance()

<ROOT.TMVA::Tools object at 0x7f81c1ba9320>

In [6]:
# Open the ROOT file that receives the TMVA output; "RECREATE" overwrites any
# existing file of the same name.
outputFile = TFile.Open("TMVAOutputCV.root", "RECREATE")
# TFile.Open reports failure through a null/zombie file instead of raising,
# so stop here rather than at the first write during training.
if not outputFile or outputFile.IsZombie():
    raise RuntimeError("Could not open TMVAOutputCV.root for writing")

In [7]:
# Factory options, one entry per flag; they are joined with ":" into the single
# option string handed to the TMVA factory.
factory_options = ":".join([
    "!V",
    "ROC",
    "!Correlations",
    "!Silent",
    "Color",
    "!DrawProgressBar",
    "AnalysisType=Classification",
])
factory = TMVA.Factory("TMVAClassification", outputFile, factory_options)

In [8]:
# Data loader that the input variables and the signal/background trees are
# registered with. Its name ("dataset") is also used as the name of the output
# folder that holds the relevant plots.
dataloader = TMVA.DataLoader("dataset")

Define the input variables that shall be used for the classifier training

In [9]:
InputVarSize = len(InputVar)
# Register every input variable with the data loader using TMVA type 'F'.
# NOTE(review): the branch listing of the signal tree shows lep_flavour stored
# as an integer (lep_flavour/I); TMVA also accepts type 'I' -- confirm that
# registering all variables as 'F' is intended.
for var_name in InputVar:
    dataloader.AddVariable(var_name, 'F')

Load the signal and background files used for the training

In [10]:
# Signal sample: chain of "nominal" trees, filled from the skimmed ttH file.
# The value returned by Add() is shown as the cell output.
signal = ROOT.TChain("nominal")
signal.Add("../../Files/skimmed/ttH.root")

1

In [11]:
# List the branches of the signal tree (name and title, one per line) to check
# which variables are available and how they are stored.
for branch in signal.GetListOfBranches():
    print(branch)

Name: DEtall01 Title: DEtall01/F
Name: lep_flavour Title: lep_flavour/I
Name: max_eta Title: max_eta/F
Name: minDeltaR_LJ_0 Title: minDeltaR_LJ_0/F
Name: minDeltaR_LJ_1 Title: minDeltaR_LJ_1/F
Name: Meff Title: Meff/F
Name: DRlj00 Title: DRlj00/F
Name: min_DRl0b Title: min_DRl0b/F
Name: min_DRlj_new Title: min_DRlj_new/F
Name: DPhij0MET Title: DPhij0MET/F
Name: lead_BjetPt Title: lead_BjetPt/F
Name: lead_jet_sumBEff Title: lead_jet_sumBEff/F
Name: sublead_jet_sumBEff Title: sublead_jet_sumBEff/F
Name: scale_nom Title: scale_nom/F
Name: pileupEventWeight_090 Title: pileupEventWeight_090/D
Name: MV2c10_70_EventWeight Title: MV2c10_70_EventWeight/D
Name: JVT_EventWeight Title: JVT_EventWeight/D
Name: SherpaNJetWeight Title: SherpaNJetWeight/D
Name: EventNumber Title: EventNumber/l
Name: RunYear Title: RunYear/I
Name: MET_RefFinal_et Title: MET_RefFinal_et/F
Name: lep_Pt_0 Title: lep_Pt_0/F
Name: lep_Eta_0 Title: lep_Eta_0/F
Name: lep_Phi_0 Title: lep_Phi_0/F
Name: lep_Pt_1 Title: lep_Pt_1

In [12]:
# Background sample: chain of "nominal" trees, filled from the skimmed ttW file.
# The value returned by Add() is shown as the cell output.
background = ROOT.TChain("nominal")
background.Add("../../Files/skimmed/ttW.root")
#background.GetListOfBranches()

1

In [13]:
# Global per-sample weights passed along when the trees are registered with
# the data loader; both samples enter with unit weight.
signalWeight, backgroundWeight = 1.0, 1.0

Register the signal and background trees with the data loader

In [14]:
# Hand both chains to the data loader, each with its global sample weight.
dataloader.AddSignalTree    ( signal,     signalWeight     )
dataloader.AddBackgroundTree( background, backgroundWeight )

DataSetInfo              : [dataset] : Added class "Signal"
                         : Add Tree nominal of type Signal with 69902 events
DataSetInfo              : [dataset] : Added class "Background"
                         : Add Tree nominal of type Background with 86754 events


In [15]:
# Additional selection cuts for the signal and background samples. Both are
# left empty because the samples were already prepared during slimming:
# https://github.com/grevtsovkirill/tthml_perf/blob/master/skim/GN2_light/ugly_2lss_sel.cpp#L3810
mycutSig, mycutBkg = TCut(""), TCut("")


In [16]:
# Options for splitting the registered events into training and test samples,
# joined with ":" into the single option string TMVA parses.
split_options = ":".join([
    "nTrain_Signal=0",
    "nTrain_Background=0",
    "SplitMode=Random",
    "NormMode=NumEvents",
    "V",
])
dataloader.PrepareTrainingAndTestTree(mycutSig, mycutBkg, split_options)

Method definitions: see the TMVA Users Guide, page 22: https://root.cern.ch/root/htmldoc/guides/tmva/TMVAUsersGuide.pdf

In [17]:
# Boosted decision trees with gradient boost, options taken from the ROOT tutorial
# (https://github.com/root-project/root/blob/master/tutorials/tmva/TMVAClassification.C#L504).
# The ttH default quoted below is the same except for MinNodeSize (1.5% there, 2.5% here):
# Method_Opt = "!H:!V:NTrees=1000:MinNodeSize=1.5%:BoostType=Grad:Shrinkage=0.10:
# UseBaggedBoost:BaggedSampleFraction=0.5:nCuts=20:MaxDepth=2";
bdt_options = ":".join([
    "!H",
    "!V",
    "NTrees=1000",
    "MinNodeSize=2.5%",
    "BoostType=Grad",
    "Shrinkage=0.10",
    "UseBaggedBoost",
    "BaggedSampleFraction=0.5",
    "nCuts=20",
    "MaxDepth=2",
])
factory.BookMethod(dataloader, TMVA.Types.kBDT, "BDT", bdt_options)

<ROOT.TMVA::MethodBDT object ("BDT") at 0x7f81c0cbb600>

Factory                  : Booking method: [1mBDT[0m
                         : 
                         : the option NegWeightTreatment=InverseBoostNegWeights does not exist for BoostType=Grad
                         : --> change to new default NegWeightTreatment=Pray
                         : Parsing option string: 
                         : ... "nTrain_Signal=0:nTrain_Background=0:SplitMode=Random:NormMode=NumEvents:V"
                         : The following options are set:
                         : - By User:
                         :     SplitMode: "Random" [Method of picking training and testing events (default: random)]
                         :     NormMode: "NumEvents" [Overall renormalisation of  event-by-event weights used in the training (NumEvents: average weight of 1 per event, independently for signal and background; EqualNumEvents: average weight of 1 per event for signal, and sum of weights for background equal to sum of weights for signal)]
                

In [18]:
#factory.BookMethod( dataloader, TMVA.Types.kSVM, "SVM", "Gamma=1:Tol=0.1:VarTransform=Norm" )
# Even with Gamma=1 and Tol=0.1 the SVM training is still too slow.
# "Gamma=0.25:Tol=0.001:VarTransform=Norm" does not converge locally - too granular.

In [19]:

# Book the RuleFit method; the options are listed one per entry and joined
# with ":" into the option string TMVA parses.
rulefit_options = ":".join([
    "H",
    "V",
    "RuleFitModule=RFTMVA",
    "Model=ModRuleLinear",
    "MinImp=0.001",
    "RuleMinDist=0.001",
    "NTrees=20",
    "fEventsMin=0.01",
    "fEventsMax=0.5",
    "GDTau=-1.0",
    "GDTauPrec=0.01",
    "GDStep=0.01",
    "GDNSteps=10000",
    "GDErrScale=1.02",
])
factory.BookMethod(dataloader, TMVA.Types.kRuleFit, "RuleFit", rulefit_options)


<ROOT.TMVA::MethodRuleFit object ("RuleFit") at 0x7f81c0d3ce00>

Factory                  : Booking method: [1mRuleFit[0m
                         : 
                         : Parsing option string: 
                         : ... "H:V:RuleFitModule=RFTMVA:Model=ModRuleLinear:MinImp=0.001:RuleMinDist=0.001:NTrees=20:fEventsMin=0.01:fEventsMax=0.5:GDTau=-1.0:GDTauPrec=0.01:GDStep=0.01:GDNSteps=10000:GDErrScale=1.02"
                         : The following options are set:
                         : - By User:
                         :     <none>
                         : - Default:
                         :     Boost_num: "0" [Number of times the classifier will be boosted]
                         : Parsing option string: 
                         : ... "H:V:RuleFitModule=RFTMVA:Model=ModRuleLinear:MinImp=0.001:RuleMinDist=0.001:NTrees=20:fEventsMin=0.01:fEventsMax=0.5:GDTau=-1.0:GDTauPrec=0.01:GDStep=0.01:GDNSteps=10000:GDErrScale=1.02"
                         : The following options are set:
                         : - By User:
          

In [20]:
   # Train all methods booked with the factory (here: BDT and RuleFit).
factory.TrainAllMethods()

Factory                  : [1mTrain all methods[0m
Factory                  : [dataset] : Create Transformation "I" with events from all classes.
                         : 
                         : Transformation, Variable selection : 
                         : Input : variable 'max_eta' <---> Output : variable 'max_eta'
                         : Input : variable 'lep_Pt_1' <---> Output : variable 'lep_Pt_1'
                         : Input : variable 'Mll01' <---> Output : variable 'Mll01'
                         : Input : variable 'minDeltaR_LJ_0' <---> Output : variable 'minDeltaR_LJ_0'
                         : Input : variable 'minDeltaR_LJ_1' <---> Output : variable 'minDeltaR_LJ_1'
                         : Input : variable 'MET_RefFinal_et' <---> Output : variable 'MET_RefFinal_et'
                         : Input : variable 'nJets_OR_T' <---> Output : variable 'nJets_OR_T'
                         : Input : variable 'nJets_OR_T_MV2c10_70' <---> Output : variable 'nJe

In [21]:
    # Apply every trained method to the testing sample.
factory.TestAllMethods()
    

Factory                  : [1mTest all methods[0m
Factory                  : Test method: BDT for Classification performance
                         : 
BDT                      : [dataset] : Evaluation of BDT on testing sample (78328 events)
                         : Elapsed time for evaluation of 78328 events: 2.68 sec       
Factory                  : Test method: RuleFit for Classification performance
                         : 
RuleFit                  : [dataset] : Evaluation of RuleFit on testing sample (78328 events)
                         : Elapsed time for evaluation of 78328 events: 0.0119 sec       


In [22]:
    # Evaluate the methods: loop over the test events and fill the
    # classifier-response histograms.
factory.EvaluateAllMethods()
    


Factory                  : [1mEvaluate all methods[0m
Factory                  : Evaluate classifier: BDT
                         : 
BDT                      : [dataset] : Loop over test events and fill histograms with classifier response...
                         : 
TFHandler_BDT            :             Variable                    Mean                    RMS            [        Min                    Max ]
                         : -----------------------------------------------------------------------------------------------------------------------
                         :              max_eta:                1.1910               0.55734   [             0.0056135                2.4999 ]
                         :             lep_Pt_1:                48583.                28331.   [                20000.            4.5473e+05 ]
                         :                Mll01:            1.3457e+05                99160.   [                12004.            1.5992e+06 ]
       

In [23]:
    # Save the output by closing the ROOT output file.
outputFile.Close()

In [24]:
# Fetch the ROC curve object for the data loader and draw it in the notebook.
# NOTE(review): presumably a canvas overlaying all booked methods -- confirm.
c2=factory.GetROCCurve(dataloader)
c2.Draw()

In [25]:
# ROC integral of the BDT method; the value is shown as the cell output.
factory.GetROCIntegral(dataloader,"BDT")

0.7363258433199683

In [26]:
# ROC integral of the RuleFit method; the value is shown as the cell output.
factory.GetROCIntegral(dataloader,"RuleFit")

0.6203813438300209

In [27]:
# ROC curve of the RuleFit method alone; the graph is returned but not drawn here.
factory.GetROCCurve (dataloader,"RuleFit")

<ROOT.TGraph object ("Graph") at 0x7f81c72162c0>