# Training of classifier with TMVA

## Import necessary libraries

In [1]:
import ROOT

Welcome to JupyROOT 6.15/02


In [2]:
from ROOT import gSystem, gROOT, gApplication, TFile, TTree, TCut, TCanvas, TGraph

In [3]:
from ROOT import TMVA
%jsroot on

### Define training variables

In [4]:
# Names of the tree branches used as classifier inputs.
InputVar = [
    "max_eta",
    "lep_Pt_1",
    "Mll01",
    "minDeltaR_LJ_0",
    "minDeltaR_LJ_1",
    "MET_RefFinal_et",
    "nJets_OR_T",
    "nJets_OR_T_MV2c10_70",
    "lep_flavour",
]

### Create instance of TMVA factory
See `TMVA/macros/TMVAClassification.C` for more factory options.

An example option string for the factory:
`"!V:ROC:!Correlations:!Silent:Color:!DrawProgressBar:AnalysisType=Classification"`



In [5]:
# Obtain the TMVA.Tools instance once, before the factory is created.
TMVA.Tools.Instance()

<ROOT.TMVA::Tools object at 0x7fa5b1d81f90>

In [6]:
# ROOT file receiving the TMVA training output; "RECREATE" overwrites an existing file.
outputFile = TFile.Open("TMVAOutputCV.root", "RECREATE")

In [7]:
# Classification factory named "TMVAClassification"; it is attached to outputFile.
factory = TMVA.Factory(
    "TMVAClassification",
    outputFile,
    "!V:ROC:!Correlations:!Silent:Color:!DrawProgressBar:AnalysisType=Classification"
)

In [8]:
# The dataloader name ("dataset") is also used as the name of the output
# folder that holds the relevant plots.
dataloader = TMVA.DataLoader("dataset")

Define the input variables that shall be used for the classifier training

In [9]:
# Register every training variable with the dataloader, declared as type 'F'.
InputVarSize = len(InputVar)
for var_name in InputVar:
    dataloader.AddVariable(var_name, 'F')

Load signal and background files to training factory

In [10]:
# Chain of "nominal" trees for the signal sample, read from the skimmed ttH file.
signal = ROOT.TChain("nominal")
signal.Add("../../Files/skimmed/ttH.root")

1

In [None]:
# List the branches available in the signal chain.
for signal_branch in signal.GetListOfBranches():
    print(signal_branch)

In [11]:
# Chain of "nominal" trees for the background sample, read from the skimmed ttW file.
background = ROOT.TChain("nominal")
background.Add("../../Files/skimmed/ttW.root")
# Uncomment to inspect the available branches:
#background.GetListOfBranches()

1

In [12]:
# Per-tree weights passed when the trees are registered (both set to unity).
signalWeight, backgroundWeight = 1.0, 1.0

Register the signal and background trees with the dataloader

In [13]:
# Attach each chain to the dataloader together with its per-tree weight.
dataloader.AddSignalTree    ( signal,     signalWeight     )
dataloader.AddBackgroundTree( background, backgroundWeight )

DataSetInfo              : [dataset] : Added class "Signal"
                         : Add Tree nominal of type Signal with 69902 events
DataSetInfo              : [dataset] : Added class "Background"
                         : Add Tree nominal of type Background with 86754 events


In [14]:
# Set individual event weights (the variable must exist in the original TTree).
# The same "weightS" expression is used for both signal and background.
dataloader.SetSignalWeightExpression( "weightS" );
dataloader.SetBackgroundWeightExpression( "weightS" );

In [15]:
# Additional cuts applied to the signal and background samples.
# Both are kept empty because the samples were already prepared with the
# required selection during slimming:
# https://github.com/grevtsovkirill/tthml_perf/blob/master/skim/GN2_light/ugly_2lss_sel.cpp#L3810
mycutSig = TCut( "" ) 
mycutBkg = TCut( "" ) 


In [16]:
# Prepare the training and test trees using the (empty) signal/background cuts.
dataloader.PrepareTrainingAndTestTree(
    mycutSig,
    mycutBkg,
    "nTrain_Signal=0:nTrain_Background=0:SplitMode=Random:NormMode=NumEvents:V"
)

In [17]:
# Default BDT option string, shared by every BookMethod call in this notebook.
# The options are listed one per line and joined with ':' into a single string.
setdefBDTprops = ":".join([
    "!H",
    "!V",
    "NTrees=1000",
    "MinNodeSize=2.5%",
    "BoostType=Grad",
    "Shrinkage=0.10",
    "UseBaggedBoost",
    "BaggedSampleFraction=0.5",
    "nCuts=20",
    "MaxDepth=2",
])


See the TMVA Users Guide (page 22) for the method definitions: https://root.cern.ch/root/htmldoc/guides/tmva/TMVAUsersGuide.pdf

In [18]:
# Boosted Decision Trees with gradient boost, following the ROOT tutorial
# (https://github.com/root-project/root/blob/master/tutorials/tmva/TMVAClassification.C#L504).
# The option string is taken from setdefBDTprops.
factory.BookMethod( dataloader, TMVA.Types.kBDT, "BDT",setdefBDTprops)

# For comparison, the ttH default option string is:
# Method_Opt = "!H:!V:NTrees=1000:MinNodeSize=1.5%:BoostType=Grad:Shrinkage=0.10:
# UseBaggedBoost:BaggedSampleFraction=0.5:nCuts=20:MaxDepth=2";
# NOTE(review): the string above has MinNodeSize=1.5%, whereas setdefBDTprops
# uses MinNodeSize=2.5% - confirm which value is intended.

<ROOT.TMVA::MethodBDT object ("BDT") at 0x7fa5b23c6400>

Factory                  : Booking method: [1mBDT[0m
                         : 
                         : the option NegWeightTreatment=InverseBoostNegWeights does not exist for BoostType=Grad
                         : --> change to new default NegWeightTreatment=Pray
                         : Parsing option string: 
                         : ... "nTrain_Signal=0:nTrain_Background=0:SplitMode=Random:NormMode=NumEvents:V"
                         : The following options are set:
                         : - By User:
                         :     SplitMode: "Random" [Method of picking training and testing events (default: random)]
                         :     NormMode: "NumEvents" [Overall renormalisation of  event-by-event weights used in the training (NumEvents: average weight of 1 per event, independently for signal and background; EqualNumEvents: average weight of 1 per event for signal, and sum of weights for background equal to sum of weights for signal)]
                

In [None]:
#factory.BookMethod( dataloader, TMVA.Types.kSVM, "SVM", "Gamma=1:Tol=0.1:VarTransform=Norm" )
# Even with Gamma=1 and Tol=0.1 the SVM training is still too slow.
# With "Gamma=0.25:Tol=0.001:VarTransform=Norm" it does not converge locally (too granular).

In [None]:
# RuleFit performs poorly: 0.620 compared with 0.736 for the BDT
#factory.BookMethod( dataloader, TMVA.Types.kRuleFit, "RuleFit",
#                           "H:V:RuleFitModule=RFTMVA:Model=ModRuleLinear:MinImp=0.001:RuleMinDist=0.001:NTrees=20:fEventsMin=0.01:fEventsMax=0.5:GDTau=-1.0:GDTauPrec=0.01:GDStep=0.01:GDNSteps=10000:GDErrScale=1.02" )


In [19]:
   # Train every method booked with the factory.
factory.TrainAllMethods()

Factory                  : [1mTrain all methods[0m
Factory                  : [dataset] : Create Transformation "I" with events from all classes.
                         : 
                         : Transformation, Variable selection : 
                         : Input : variable 'max_eta' <---> Output : variable 'max_eta'
                         : Input : variable 'lep_Pt_1' <---> Output : variable 'lep_Pt_1'
                         : Input : variable 'Mll01' <---> Output : variable 'Mll01'
                         : Input : variable 'minDeltaR_LJ_0' <---> Output : variable 'minDeltaR_LJ_0'
                         : Input : variable 'minDeltaR_LJ_1' <---> Output : variable 'minDeltaR_LJ_1'
                         : Input : variable 'MET_RefFinal_et' <---> Output : variable 'MET_RefFinal_et'
                         : Input : variable 'nJets_OR_T' <---> Output : variable 'nJets_OR_T'
                         : Input : variable 'nJets_OR_T_MV2c10_70' <---> Output : variable 'nJe

In [20]:
    # Apply the trained methods to the testing sample.
factory.TestAllMethods()
    

Factory                  : [1mTest all methods[0m
Factory                  : Test method: BDT for Classification performance
                         : 
BDT                      : [dataset] : Evaluation of BDT on testing sample (78328 events)
                         : Elapsed time for evaluation of 78328 events: 2.56 sec       


In [21]:
    # Evaluate the booked methods; the log shows per-variable statistics and classifier-response histograms being filled.
factory.EvaluateAllMethods()
    


Factory                  : [1mEvaluate all methods[0m
Factory                  : Evaluate classifier: BDT
                         : 
BDT                      : [dataset] : Loop over test events and fill histograms with classifier response...
                         : 
TFHandler_BDT            :             Variable                    Mean                    RMS            [        Min                    Max ]
                         : -----------------------------------------------------------------------------------------------------------------------
                         :              max_eta:                1.2453               0.56708   [             0.0056135                2.4999 ]
                         :             lep_Pt_1:                50414.                29917.   [                20000.            4.5473e+05 ]
                         :                Mll01:            1.4737e+05            1.1223e+05   [                12004.            1.5992e+06 ]
       

In [22]:
    # Save the output by closing the ROOT file opened for the factory.
outputFile.Close()

In [23]:
# Returns the TCanvas holding the ROC curve for this dataloader.
factory.GetROCCurve(dataloader)

<ROOT.TCanvas object ("ROCCurve dataset class 0") at 0x7fa5b7a3a920>

In [24]:
# Keep a handle on the ROC canvas and draw it in the notebook.
c2=factory.GetROCCurve(dataloader)
c2.Draw()



In [25]:
# ROC integral (AUROC) of the "BDT" method.
# Applying the event weights changes the AUROC from 0.736 to 0.726.
factory.GetROCIntegral(dataloader,"BDT")

0.7265699853415943

In [None]:
#factory.GetROCIntegral(dataloader,"RuleFit")

In [None]:
#factory.GetROCCurve (dataloader,"RuleFit") #

### Variable importance 

In [26]:
# Variable-importance evaluation, following the example notebook at
# https://swan004.cern.ch/user/kgrevtso/gallery/view/machine_learning/TMVA_VI.ipynb
vi = TMVA.VariableImportance(dataloader)

# Alternative (lighter) AdaBoost configuration, kept for reference:
#vi.BookMethod(TMVA.Types.kBDT, "BDT",
#"!V:NTrees=5:MinNodeSize=2.5%:MaxDepth=2:BoostType=AdaBoost:AdaBoostBeta=0.5:UseBaggedBoost:BaggedSampleFraction=0.5:SeparationType=GiniIndex:nCuts=20" )
# Book the BDT with the same option string used for the main training.
vi.BookMethod(TMVA.Types.kBDT, "BDT",setdefBDTprops)

# Select the "short" variable-importance type.
vi.SetType(TMVA.kShort)

vi.Evaluate()

                         : Evaluation done.


In [27]:
# Returns a TMVA VariableImportanceResult object.
vi.GetResults()

<ROOT.TMVA::VariableImportanceResult object at 0x7fa5b2458840>

In [28]:
# Print the per-variable importance values and draw them on a canvas.
results=vi.GetResults()
results.Print()
results.Draw()

<ROOT.TCanvas object ("VariableImportance") at 0x7fa5ae2669a0>

                         : Variable Importance Results (Short)
                         : MET_RefFinal_et: 3.16537 % 
                         : Mll01: 15.7079 % 
                         : VariableImportance: 1
                         : lep_Pt_1: 3.13764 % 
                         : lep_flavour: 0.427994 % 
                         : max_eta: 5.17357 % 
                         : minDeltaR_LJ_0: 20.92 % 
                         : minDeltaR_LJ_1: 17.3452 % 
                         : nJets_OR_T: 34.0506 % 
                         : nJets_OR_T_MV2c10_70: 0.0717325 % 


### Perform Cross Validation

In [29]:
# Cross-validation set-up, adapted to Python from the ROOT tutorial:
# https://root.cern.ch/doc/v614/TMVACrossValidation_8C.html
# The method itself is booked in the next cell.
# NOTE(review): the output file name "cv_test" has no ".root" extension - confirm this is intended.
CVoutputFile = TFile.Open("cv_test", "RECREATE");
# 5-fold cross-validation for classification.
cvOptions ="!V:!Silent:ModelPersistence:AnalysisType=Classification:NumFolds=5"
cv = TMVA.CrossValidation("bdtCV",dataloader,CVoutputFile,cvOptions)

In [30]:
# Book the BDT for cross-validation with the same option string as the main training.
cv.BookMethod(TMVA.Types.kBDT, "BDT",setdefBDTprops)
# Lighter alternative option string, kept for reference:
#        "NTrees=10:MinNodeSize=2.5%:MaxDepth=2:nCuts=20");
# Optional second method (Fisher discriminant), currently disabled:
#cv.BookMethod(TMVA.Types.kFisher, "Fisher",
#                 "!H:!V:Fisher:VarTransform=None")

In [31]:
# Run the cross-validation; the log shows one BDT trained per fold (BDT_fold1, ...).
cv.Evaluate()

                         : Evaluate method: BDT
<HEADER> Factory                  : Booking method: BDT_fold1
                         : 
                         : the option NegWeightTreatment=InverseBoostNegWeights does not exist for BoostType=Grad
                         : --> change to new default NegWeightTreatment=Pray
<HEADER> BDT_fold1                : #events: (reweighted) sig: 31331 bkg: 31331
                         : #events: (unweighted) sig: 28041 bkg: 34621
                         : Training 1000 Decision Trees ... patience please
                         : Elapsed time for training with 62662 events: 15.6 sec         
<HEADER> BDT_fold1                : [dataset] : Evaluation of BDT_fold1 on training sample (62662 events)
                         : Elapsed time for evaluation of 62662 events: 2.78 sec       
                         : Creating xml weight file: dataset/weights/bdtCV_BDT_fold1.weights.xml
                         : Creating standalone class: dataset/w

In [32]:
# Retrieve the cross-validation results (a vector of TMVA::CrossValidationResult).
#resultsCV = TMVA.CrossValidationResult 
resultsCV = cv.GetResults()
#resultsCV[0].Print();

In [33]:
# Display the results container to check its type.
resultsCV

<ROOT.vector<TMVA::CrossValidationResult> object at 0x7fa5ae28a130>

In [35]:
# Name of the first method booked for cross-validation.
print(cv.GetMethods()[0].GetValue("MethodName"))
# Second method, if one is booked:
#cv.GetMethods()[1].GetValue("MethodName"))

BDT


In [36]:
# Print, for every booked method, the average ROC integral followed by the
# per-fold ROC integral and the value returned by GetEff30Values().
#
# The method index comes from enumerate(): the previous `meth=+1` assigned the
# constant +1 instead of incrementing, so with three or more booked methods the
# printed method name would be wrong from the third result onwards.
#
# NOTE(review): the "BkgEff@SigEff=0.3" label may be inverted; the printed values
# (~0.62 for an AUROC of ~0.73) look like signal efficiency at a background
# efficiency of 0.3 - confirm against the TMVA CrossValidationResult documentation.
for meth, res_f in enumerate(resultsCV):
    print("mehtod = ",cv.GetMethods()[meth].GetValue("MethodName"),", average= ",res_f.GetROCAverage())
    # Fetch the per-fold vectors once per method instead of once per fold.
    roc_values = res_f.GetROCValues()
    eff30_values = res_f.GetEff30Values()
    for iFold in range(cv.GetNumFolds()):
        print("    Fold ",iFold," auROC= ",roc_values[iFold],"BkgEff@SigEff=0.3: ",eff30_values[iFold])


mehtod =  BDT , average=  0.729698657989502
    Fold  0  auROC=  0.726603627204895 BkgEff@SigEff=0.3:  0.621
    Fold  1  auROC=  0.728789746761322 BkgEff@SigEff=0.3:  0.625
    Fold  2  auROC=  0.7322560548782349 BkgEff@SigEff=0.3:  0.6200000000000001
    Fold  3  auROC=  0.7224501967430115 BkgEff@SigEff=0.3:  0.6220000000000001
    Fold  4  auROC=  0.7383938431739807 BkgEff@SigEff=0.3:  0.6439999999999999


In [37]:
# Average ROC integral over the folds for the first booked method.
resultsCV[0].GetROCAverage()

0.729698657989502

In [45]:
# ROC curves of the first method, returned as a TMultiGraph.
cvg0=resultsCV[0].GetROCCurves()
cvg0

<ROOT.TMultiGraph object at 0x7fa5ad75c920>

In [49]:
# Close the ROOT file opened for the cross-validation output.
CVoutputFile.Close()

In [53]:
# Draw the cross-validation result of the first method; returns a TCanvas.
resultsCV[0].Draw()

<ROOT.TCanvas object ("CrossValidation") at 0x7fa590c231a0>