# Training of classifier with TMVA

## Import necessary libraries

In [1]:
import ROOT

Welcome to JupyROOT 6.15/02


In [2]:
from ROOT import gSystem, gROOT, gApplication, TFile, TTree, TCut, TCanvas, TGraph

In [3]:
from ROOT import TMVA
%jsroot on

### Define training variables

In [4]:
# Names of the training variables. Each entry is later registered with the
# TMVA DataLoader via AddVariable.
# Presumably these are branch names of the "nominal" tree — confirm against the input files.
InputVar = [
    "max_eta",
    "lep_Pt_1",
    "Mll01",
    "minDeltaR_LJ_0",
    "minDeltaR_LJ_1",
    "MET_RefFinal_et",
    "nJets_OR_T",
    "nJets_OR_T_MV2c10_70",
    "lep_flavour",
]

### Create instance of TMVA factory
See TMVA/macros/TMVAClassification.C for more factory options.

Some example options for the factory:
`"!V:ROC:!Correlations:!Silent:Color:!DrawProgressBar:AnalysisType=Classification"`



In [5]:
# Obtain the global TMVA.Tools instance; this notebook does so before creating the factory.
TMVA.Tools.Instance()

<ROOT.TMVA::Tools object at 0x7fb97fd80370>

In [6]:
# ROOT file handed to the TMVA factory for its output; "RECREATE" overwrites
# any existing file of the same name.
outputFile = TFile.Open("TMVAOutputCV.root", "RECREATE")

In [7]:
# Classification factory writing into outputFile. The option string is the one
# quoted in the markdown note above this section.
factory = TMVA.Factory(
    "TMVAClassification",
    outputFile,
    "!V:ROC:!Correlations:!Silent:Color:!DrawProgressBar:AnalysisType=Classification",
)

In [8]:
# The DataLoader name ("dataset") is also the name of the output folder that
# receives the weight files and related plots.
dataloader = TMVA.DataLoader("dataset")

Define the input variables that shall be used for the classifier training

In [9]:
# Number of training variables (not used elsewhere in the visible notebook).
InputVarSize = len(InputVar)

# Register every training variable with the DataLoader, declared with type 'F'.
for var_name in InputVar:
    dataloader.AddVariable(var_name, 'F')

Load signal and background files to training factory

In [10]:
# Signal sample: chain the "nominal" tree from the skimmed ttH file.
signal = ROOT.TChain("nominal")
# The value returned by Add() is displayed as the cell output (1 in the recorded run).
signal.Add("../../Files/skimmed/ttH.root")

1

In [None]:
# List every branch of the signal chain.
for br in signal.GetListOfBranches():
    print(br)

In [11]:
# Background sample: chain the "nominal" tree from the skimmed ttW file.
background = ROOT.TChain("nominal")
# The value returned by Add() is displayed as the cell output (1 in the recorded run).
background.Add("../../Files/skimmed/ttW.root")
#background.GetListOfBranches()

1

In [12]:
# Per-tree weights passed to AddSignalTree / AddBackgroundTree; both are unity.
signalWeight, backgroundWeight = 1.0, 1.0

Register the signal and background trees with the DataLoader

In [13]:
# Register both chains with the DataLoader; the second argument is the
# per-tree weight defined in the previous cell (1.0 for both samples).
dataloader.AddSignalTree    ( signal,     signalWeight     )
dataloader.AddBackgroundTree( background, backgroundWeight )

DataSetInfo              : [dataset] : Added class "Signal"
                         : Add Tree nominal of type Signal with 69902 events
DataSetInfo              : [dataset] : Added class "Background"
                         : Add Tree nominal of type Background with 86754 events


In [14]:
# Per-event weights: both samples use the "weightS" expression, which must
# exist as a variable in the original TTree.
dataloader.SetSignalWeightExpression("weightS")
dataloader.SetBackgroundWeightExpression("weightS")

In [15]:
# Apply additional cuts on the signal and background sample. 
mycutSig = TCut( "" ) 
mycutBkg = TCut( "" ) 
# <- keep empty as samples were specifically prepared during slimming: 
# https://github.com/grevtsovkirill/tthml_perf/blob/master/skim/GN2_light/ugly_2lss_sel.cpp#L3810


In [16]:
# Split the registered trees into training and test samples using the
# (empty) cuts defined above.
# NOTE(review): nTrain_*=0 presumably lets TMVA choose the split sizes — confirm in the TMVA Users Guide.
dataloader.PrepareTrainingAndTestTree(
    mycutSig,
    mycutBkg,
    "nTrain_Signal=0:nTrain_Background=0:SplitMode=Random:NormMode=NumEvents:V",
)

In [17]:
# Shared BDT option string, reused by the factory booking, the
# variable-importance study and the cross-validation below.
setdefBDTprops = ":".join((
    "!H",
    "!V",
    "NTrees=1000",
    "MinNodeSize=2.5%",
    "BoostType=Grad",
    "Shrinkage=0.10",
    "UseBaggedBoost",
    "BaggedSampleFraction=0.5",
    "nCuts=20",
    "MaxDepth=2",
))


Method definitions: see page 22 of the TMVA Users Guide, https://root.cern.ch/root/htmldoc/guides/tmva/TMVAUsersGuide.pdf

In [None]:
# Boosted Decision Trees with gradient boost, options taken from the ROOT tutorial
# (https://github.com/root-project/root/blob/master/tutorials/tmva/TMVAClassification.C#L504).
# The option string is the shared setdefBDTprops; the literal below is kept for reference.
factory.BookMethod( dataloader, TMVA.Types.kBDT, "BDT",setdefBDTprops)
                   #"!H:!V:NTrees=1000:MinNodeSize=2.5%:BoostType=Grad:Shrinkage=0.10:UseBaggedBoost:BaggedSampleFraction=0.5:nCuts=20:MaxDepth=2" )

# ttH default, for comparison:
# Method_Opt = "!H:!V:NTrees=1000:MinNodeSize=1.5%:BoostType=Grad:Shrinkage=0.10:
# UseBaggedBoost:BaggedSampleFraction=0.5:nCuts=20:MaxDepth=2";
# NOTE(review): the original note said the booked options agree with the ttH default, but the
# default quoted here uses MinNodeSize=1.5% while setdefBDTprops uses 2.5% — confirm which is intended.

In [None]:
#factory.BookMethod( dataloader, TMVA.Types.kSVM, "SVM", "Gamma=1:Tol=0.1:VarTransform=Norm" )
# even with Gamma=1, Tol=0.1 the SVM training is still too slow
# "Gamma=0.25:Tol=0.001:VarTransform=Norm" does not converge locally - too granular

In [None]:
# RuleFit performs poorly: AUROC of 0.620, compared with 0.736 for the BDT
#factory.BookMethod( dataloader, TMVA.Types.kRuleFit, "RuleFit",
#                           "H:V:RuleFitModule=RFTMVA:Model=ModRuleLinear:MinImp=0.001:RuleMinDist=0.001:NTrees=20:fEventsMin=0.01:fEventsMax=0.5:GDTau=-1.0:GDTauPrec=0.01:GDStep=0.01:GDNSteps=10000:GDErrScale=1.02" )


In [None]:
   # Train every method booked with the factory.
factory.TrainAllMethods()

In [None]:
    # Run the test step for every method booked with the factory.
factory.TestAllMethods()
    

In [None]:
    # Evaluate every method booked with the factory.
factory.EvaluateAllMethods()
    


In [None]:
    # Save the output by closing the factory's output ROOT file.
outputFile.Close()

In [None]:
# Fetch the ROC-curve canvas for this dataloader from the factory and draw it.
# NOTE(review): this cell runs after outputFile.Close() in the preceding cell —
# confirm the curves are still available at this point.
c2=factory.GetROCCurve(dataloader)
c2.Draw()

In [None]:
# ROC integral (AUROC) of the booked "BDT" method.
factory.GetROCIntegral(dataloader,"BDT")
# Observed: applying the event weights changes the AUROC from 0.736 to 0.726.

In [None]:
#factory.GetROCIntegral(dataloader,"RuleFit")

In [None]:
#factory.GetROCCurve (dataloader,"RuleFit") #

### Variable importance 

In [None]:
# Variable-importance study, following
# https://swan004.cern.ch/user/kgrevtso/gallery/view/machine_learning/TMVA_VI.ipynb
vi = TMVA.VariableImportance(dataloader)

# Alternative configuration (5 AdaBoost trees), kept for reference:
#vi.BookMethod(TMVA.Types.kBDT, "BDT",
#"!V:NTrees=5:MinNodeSize=2.5%:MaxDepth=2:BoostType=AdaBoost:AdaBoostBeta=0.5:UseBaggedBoost:BaggedSampleFraction=0.5:SeparationType=GiniIndex:nCuts=20" )

# Book the BDT with the shared option string, select the kShort type and run.
vi.BookMethod(TMVA.Types.kBDT, "BDT", setdefBDTprops)
vi.SetType(TMVA.kShort)
vi.Evaluate()

In [None]:
# Retrieve the variable-importance results, then print and draw them.
results=vi.GetResults()
results.Print()
results.Draw()

### Perform Cross Validation

In [37]:
# Cross-validation setup, adapted to Python from the ROOT tutorial
# https://root.cern.ch/doc/v614/TMVACrossValidation_8C.html
# NOTE(review): the output file name "cv_test" has no ".root" extension — confirm this is intended.
CVoutputFile = TFile.Open("cv_test", "RECREATE")

# Options for the cross-validation run, including NumFolds=5.
cvOptions = "!V:!Silent:ModelPersistence:AnalysisType=Classification:NumFolds=5"
cv = TMVA.CrossValidation("bdtCV", dataloader, CVoutputFile, cvOptions)

In [40]:
# Book the BDT with the shared option string.
#   (alternative options kept for reference: "NTrees=10:MinNodeSize=2.5%:MaxDepth=2:nCuts=20")
cv.BookMethod(TMVA.Types.kBDT, "BDT", setdefBDTprops)

# Also book a Fisher discriminant.
cv.BookMethod(
    TMVA.Types.kFisher,
    "Fisher",
    "!H:!V:Fisher:VarTransform=None",
)

# Run cross-validation for the booked methods.
cv.Evaluate()

                         : Evaluate method: BDT
<HEADER> Factory                  : Booking method: BDT_fold1
                         : 
                         : the option NegWeightTreatment=InverseBoostNegWeights does not exist for BoostType=Grad
                         : --> change to new default NegWeightTreatment=Pray
<HEADER> BDT_fold1                : #events: (reweighted) sig: 31331 bkg: 31331
                         : #events: (unweighted) sig: 28038 bkg: 34624
                         : Training 1000 Decision Trees ... patience please
                         : Elapsed time for training with 62662 events: 14.1 sec         
<HEADER> BDT_fold1                : [dataset] : Evaluation of BDT_fold1 on training sample (62662 events)
                         : Elapsed time for evaluation of 62662 events: 2.68 sec       
                         : Creating xml weight file: dataset/weights/bdtCV_BDT_fold1.weights.xml
                         : Creating standalone class: dataset/w

In [41]:
# Print results
#resultsCV = TMVA.CrossValidationResult 
resultsCV = cv.GetResults()
#resultsCV[0].Print();

In [48]:
# Print the names of the first two booked methods (indices 0 and 1 are hard-coded).
print(cv.GetMethods()[0].GetValue("MethodName"),
cv.GetMethods()[1].GetValue("MethodName"))

BDT Fisher


In [68]:
# Print the ROC integral of every fold for each cross-validated method.
#
# enumerate() supplies the position of each result so it can be paired with the
# entry at the same index in cv.GetMethods(). The previous hand-maintained
# counter was advanced with `meth=+1`, which assigns the constant +1 instead of
# incrementing; with more than two booked methods every result after the first
# would have been labelled with the second method's name.
# Assumes the results come back in the same order as cv.GetMethods(), as the
# recorded output (BDT, then Fisher) suggests.
for meth, res_f in enumerate(resultsCV):
    print("mehtod = ",cv.GetMethods()[meth].GetValue("MethodName"))
    for iFold in range(cv.GetNumFolds()):
        print("Fold ",iFold," auROC= ",res_f.GetROCValues()[iFold])


mehtod =  BDT
0  auROC=  0.7381182312965393
1  auROC=  0.7308182716369629
2  auROC=  0.7246211767196655
3  auROC=  0.7236545085906982
4  auROC=  0.724936306476593
mehtod =  Fisher
0  auROC=  0.7353289723396301
1  auROC=  0.7230631709098816
2  auROC=  0.7182956337928772
3  auROC=  0.7165041565895081
4  auROC=  0.7236860394477844
