In [None]:
from run_nano.inspect_xsdb import *
from run_nano.sample_list import sample_df
from fuzzywuzzy import fuzz
import pprint
import pandas as pd

In [None]:
###
## Get xsec info from xsdb
###

In [None]:
# Fetch a CERN SSO cookie for the XSDB site using the grid certificate/key and
# store it in cookie.txt.
# NOTE(review): the original hint says this probably has to be run from a real
# command line so that the key password can be typed in — confirm.
! cern-get-sso-cookie --cert ~/.globus/usercert.pem        \
                      --key  ~/.globus/userkey.pem         \
                      -u https://cms-gen-dev.cern.ch/xsdb/ \
                      -o cookie.txt

In [None]:
# Instantiate the XSDB helper (from run_nano.inspect_xsdb) with the cookie file
# written by the cern-get-sso-cookie cell; `xsdb` is used as a global by get_query.
xsdb = XSDB("cookie.txt")

In [None]:
def get_process_name(row):
    """Return the second '/'-separated field of ``row.das``.

    For a string that starts with '/', this is the first path component.
    """
    das_fields = row.das.split('/')
    process = das_fields[1]
    return process

In [None]:
def get_query(**kwargs):
    """Pass the keyword arguments to ``xsdb.get`` as one query dict and return its result."""
    query = kwargs
    return xsdb.get(query)

def get_and_match_row(row):
    """Find the XSDB record that best matches one sample row.

    Lookup order:
      1. Query by ``DAS=row.das``; if exactly one record comes back, return it.
      2. Otherwise query by ``process_name=row.process_name``:
         - no records  -> return an empty dict,
         - one record  -> return it,
         - several     -> score each record's 'DAS' string against ``row.das``
           with ``fuzz.ratio`` and return the highest-scoring record.

    In the multi-candidate case every candidate dict gets a 'ratio' key holding
    its score; records returned from the single-hit paths are left untouched.
    """
    exact_hits = get_query(DAS=row.das)
    # if exact match, return row
    if len(exact_hits) == 1:
        return exact_hits[0]

    # find same process
    candidates = get_query(process_name=row.process_name)
    # return empty dict if no results
    if len(candidates) == 0:
        return {}
    # if only one return row
    if len(candidates) == 1:
        return candidates[0]

    # find best fuzzy fit
    for candidate in candidates:
        candidate['ratio'] = fuzz.ratio(candidate['DAS'], row.das)
    # Return the top-scoring candidate. The previous code sorted a separate
    # list and then returned the first element of the *unsorted* query result.
    # max() keeps the first of equally scored candidates, like a stable sort.
    return max(candidates, key=lambda candidate: candidate['ratio'])



In [None]:
# Add a process_name column: for each row, the second '/'-separated field of its das string.
sample_df['process_name'] = sample_df.apply(get_process_name, axis=1)

In [None]:
# Look up every sample row in XSDB and collect the returned dicts as a table.
# The records are produced in row order, so label them with sample_df's own
# index: the axis=1 concat that follows aligns rows on index labels, not on
# position, and a default RangeIndex would only line up by coincidence.
das_results = pd.DataFrame(
    sample_df.apply(get_and_match_row, axis=1).tolist(),
    index=sample_df.index,
)

In [None]:
# Append the XSDB columns to the sample table side by side (axis=1 aligns rows on index labels).
# NOTE(review): this rebinds sample_df, so re-running the cell appends the XSDB columns a second time.
sample_df = pd.concat([sample_df, das_results], axis=1)

In [None]:
# Save the merged table to disk (presumably so the XSDB lookups need not be repeated — confirm).
sample_df.to_csv('xsdb_output.csv')

In [None]:
# Reload the saved table, replacing the in-memory sample_df.
# NOTE(review): the file was written with its index and is read back without
# index_col, so the old index presumably reappears as an extra unnamed column — confirm this is acceptable.
sample_df = pd.read_csv('xsdb_output.csv')

In [None]:
###
## format yml
###

In [None]:
# Make paths to fileglobs

In [None]:
# Root directory under which make_path_row looks for <era>/<process_name> entries.
store_path ='/eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab'

In [None]:
import os

In [None]:
def make_path(pp, dir_list):
    '''Format the ROOT-file glob for a process directory.

    pp       -- path of the process directory.
    dir_list -- names of the candidate entries inside ``pp``; any iterable
                is accepted (list, tuple, filter object, generator, ...).

    Returns
      * ``pp/<name>/*/*/*.root`` when there is exactly one candidate,
      * ``"NA"`` when there is none,
      * ``pp/{multiple}/*/*/*.root`` (literal placeholder) when there are several.
    '''
    # Materialise once: a lazy iterable such as a Python 3 filter object has no
    # len() and cannot be indexed.
    dir_list = list(dir_list)
    if len(dir_list) == 1:
        return pp + "/" + dir_list[0] + "/*/*/*.root"
    if len(dir_list) == 0:
        return "NA"
    # Ambiguous: leave a marker instead of picking one directory.
    return pp + "/" + "{multiple}" + "/*/*/*.root"

In [None]:
def make_path_row(row):
    '''Build the b-tag-efficiency and sample file globs for one sample row.

    Lists the entries of ``<store_path>/<row.era>/<row.process_name>`` and
    splits them by whether their name contains "eff"; each group is turned
    into a glob string by make_path.

    Returns a dict with the keys "bTagEff" and "fileglob". Both values are
    "Not Found" when the process directory cannot be listed.
    '''
    process_path = "{}/{}/{}".format(store_path, row.era, row.process_name)
    try:
        dirs = os.listdir(process_path)
    except OSError:
        # Missing or unreadable directory: record that in the table instead of
        # aborting the whole apply(). Only filesystem errors are caught, so
        # interrupts and programming errors still surface.
        return {"bTagEff": "Not Found", "fileglob": "Not Found"}

    # Build real lists rather than filter(): make_path takes len() of its
    # argument, which a Python 3 filter object does not support.
    effs = [name for name in dirs if "eff" in name]
    non_effs = [name for name in dirs if "eff" not in name]

    #path to btagging eff
    effs_path = make_path(process_path, effs)
    #path to file glob
    sample_path = make_path(process_path, non_effs)
    return {"bTagEff": effs_path, "fileglob": sample_path}

In [None]:
# Build the bTagEff / fileglob columns for every sample row.
# The dicts come back in row order, so give the new frame sample_df's index:
# the axis=1 concat that follows pairs rows by index label, not by position.
glob_df = pd.DataFrame(
    sample_df.apply(make_path_row, axis=1).tolist(),
    index=sample_df.index,
)

In [None]:
# Attach the bTagEff / fileglob columns to the sample table (axis=1 aligns rows on index labels).
yml_df = pd.concat([sample_df, glob_df], axis=1)

In [None]:
import yaml

In [None]:
# Make sure the output directory exists up front, so the loop does not fail
# at the write step when it is missing.
if not os.path.isdir('yml'):
    os.makedirs('yml')

# Write one YAML sample list per data-taking era.
for era in [2016, 2017, 2018]:
    # One row per process for this era. The explicit copy guarantees that the
    # column assignments below never act on a slice of yml_df.
    tdf = yml_df[yml_df.era==era].drop_duplicates(subset=['process_name']).copy()
    tdf['nevts'] = 0
    # Scale the cross_section column by 1000 (presumably a unit conversion,
    # e.g. pb -> fb — TODO confirm).
    tdf['xsec'] = tdf['cross_section']*1000
    # Rows with ismc == 0 get nevts and xsec fixed to 1.
    tdf.loc[tdf.ismc==0, 'nevts'] = 1
    tdf.loc[tdf.ismc==0, 'xsec'] = 1
    yml_dict = tdf[['bTagEff', 'fileglob', 'ismc', 'name', 'nevts', 'xsec']].to_dict(orient='records')
    with open('yml/samples_{}_{}.yaml'.format(era, "DeepCSV"), 'w') as f:
        f.write(yaml.dump(yml_dict))

In [None]:
# Print every column label of sample_df, one per line.
for column_label in sample_df.columns:
    print(column_label)

In [None]:
# Let pandas show up to 1000 characters per cell before truncating.
pd.set_option('display.max_colwidth', 1000)

In [None]:
# Print the shower / das / era / MCM columns of the sample table as LaTeX source.
print(sample_df[['shower', 'das', 'era', 'MCM']].to_latex())

In [None]:
# Show the shower / das / era / MCM columns for every row whose type is not 'data'.
sample_df.loc[sample_df.type != 'data', ['shower', 'das', 'era', 'MCM']]

In [None]:
# Let pandas display up to 1000 rows before truncating the output.
pd.options.display.max_rows = 1000