# Data migration

## Required imports

Remember to install all the required packages inside your working Python virtual environment.

In [1]:
import json
import re
from copy import deepcopy
from http.cookiejar import CookieJar, MozillaCookieJar
from urllib.parse import quote

import numpy
import pandas
import requests
import sqlalchemy
from tqdm.notebook import tqdm

## Oracle Client Libraries

To use **cx_Oracle connector**, it is required to install some libraries inside the execution environment: [Oracle Client Library](https://oracle.github.io/odpi/doc/installation.html#oracle-client-library-loading). Please install them and set the required environment variables before running this notebook

## Constants

Email pattern

In [2]:
# Email pattern. The leading "^" anchors the match at the start of the string,
# so only an address that begins the text is captured; whatever follows the
# address is left out of the match.
EMAIL_REGEX = r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
# Compiled once so the same pattern object can be reused on every value
email_regex = re.compile(EMAIL_REGEX)

Retrieve the email address found at the beginning of a string; return an empty string if there is none

In [3]:
def sanetize_email(pattern: re.Pattern, email: str) -> str:
    """
    Extract the email address matched by ``pattern`` from ``email``.

    Args:
        pattern: Compiled regular expression used to look for the address.
        email: Raw value to scan. Non-string values (e.g. ``None`` or ``NaN``,
            which may show up for empty database columns) are treated as
            "no email available".

    Returns:
        The full text matched by ``pattern``, or an empty string when
        ``email`` is not a string or nothing matches.
    """
    # pattern.search() raises TypeError on anything that is not a string,
    # which would abort a whole DataFrame.apply() over a single empty value
    if not isinstance(email, str):
        return ""
    match = pattern.search(email)
    return match[0] if match else ""

Group identifiers retrieved from: [group.py](https://github.com/cms-PdmV/ValDB2/blob/main/data/group.py)

In [4]:
def uppercase(str_list):
    """Return a list with every string of ``str_list`` stripped and upper-cased."""
    return [el.strip().upper() for el in str_list]


def dict_token(token_list):
    """
    Build a lookup table for fast, case-insensitive token parsing.

    Args:
        token_list: Iterable of canonical token spellings.

    Returns:
        A dictionary mapping the normalized form of each token (stripped and
        upper-cased) to its original spelling.
    """
    return dict(zip(uppercase(token_list), token_list))

In [5]:
# Lookup tables used for fast parsing:
# normalized (upper-cased) identifier -> canonical spelling

# Categories
CATEGORIES = ["Reconstruction", "HLT", "PAGs", "HIN", "GEN"]
CATEGORIES_TOKENS = dict_token(CATEGORIES)

# Subcategories
SUBCATEGORIES = ["Data", "FastSim", "FullSim", "Gen"]
SUBCATEGORIES_TOKENS = dict_token(SUBCATEGORIES)

# Groups, listed per category
reconstruction_groups = [
    "Tracker", "Ecal", "HGcal", "Hcal", "CASTOR", "DT", "CSC", "RPC", "GEM",
    "MTD", "PPS", "L1", "Tracking", "Electron", "Photon", "Muon", "Jet", "MET",
    "bTag", "Tau", "PF",
]
hlt_groups = [
    "Tracking", "Electron", "Photon", "Muon", "Jet", "MET", "bTag", "Tau",
    "SMP", "Higgs", "Top", "Susy", "Exotica", "B2G", "B", "Fwd", "HIN",
]
pags_groups = ["SMP", "Higgs", "Top", "Susy", "Exotica", "B2G", "B", "Fwd", "HIN"]
hin_groups = ["Tracking", "Electron", "Photon", "Muon", "Jet"]
gen_groups = ["GEN"]

# Every distinct group name, whatever category it belongs to
GROUPS = {
    *reconstruction_groups,
    *hlt_groups,
    *pags_groups,
    *hin_groups,
    *gen_groups,
}
GROUPS_TOKENS = dict_token(GROUPS)

Be careful: for some reason, it seems the **HIN** category has two main identifiers: **HIN** & **IN** -> [Details](https://github.com/cms-PdmV/ValDB/blob/master/ajax_app.py#L489). The same applies to the **TK** group identifier, which means **TRACKER**

## Working variables

In [6]:
# Location of the JSON file holding the settings for the old and new systems
CREDENTIALS_PATH = "./data/credentials.json"

Import database credentials

In [7]:
# Parse the JSON credentials file; the handle is closed when the block exits
with open(CREDENTIALS_PATH, "r", encoding="utf-8") as cf:
    credentials = json.load(fp=cf)

Import cookies to authenticate to send requests to the server

In [8]:
# Cookie jar backed by the file configured for the new server;
# load() reads the cookies stored in that file into the jar
cookies = MozillaCookieJar(filename=credentials["new"]["cookie_location"])
cookies.load()

Connection URI for SQLAlchemy

In [9]:
# Connection settings of the old (production) Oracle database
prod_db_user = credentials['old']['database_username']
prod_db_password = credentials['old']['database_password']
prod_db = credentials['old']['database_url']
# The "prob_db_service" spelling is kept so any cell relying on this name keeps working
prob_db_service = credentials['old']['database_service']
# User and password are percent-encoded (safe="" also encodes "/"): characters
# such as "@", ":" or "/" would otherwise be read as URI delimiters and the
# connection string would be parsed incorrectly
prod_db_uri = (
    f"oracle+cx_oracle://{quote(prod_db_user, safe='')}:{quote(prod_db_password, safe='')}"
    f"@{prod_db}/?service_name={prob_db_service}"
)

Create a SQLAlchemy Engine

In [10]:
# Engine bound to the old database through the URI built above.
# NOTE(review): arraysize is forwarded as an extra engine option — presumably
# the cx_Oracle fetch buffer size; confirm against the Oracle dialect docs.
engine = sqlalchemy.create_engine(prod_db_uri, arraysize=1000)

## Retrieve data using Pandas and SQL queries

### Validators

Retrieve all user data related to validators

In [11]:
# Pairs every validator (non-admin) user with each of its rows in USER_RIGHTS,
# so a user shows up once per (category, subcategory, status kind) it holds
user_get_query = """
SELECT U.USER_NAME, U.EMAIL, U.ADMIN, U.VALIDATOR, UR.CATEGORY, UR.SUBCATEGORY,	UR.STATUS_KIND
FROM CMS_PDMV_VAL.USERS U, CMS_PDMV_VAL.USER_RIGHTS UR
WHERE U.USER_NAME = UR.USER_NAME AND U.ADMIN = 0 AND U.VALIDATOR = 1
"""

Execute the query using Pandas

In [12]:
# Run the validators query and load the whole result set into a DataFrame
user_data: pandas.DataFrame = pandas.read_sql(sql=user_get_query, con=engine)

Keep only the rows whose username has between 1 and 8 characters

In [13]:
# Keep the rows whose username is 1 to 8 characters long (both bounds included)
user_data = user_data[user_data["user_name"].str.len().between(1, 8)]

Sanitize email information

In [14]:
# Build a new frame instead of writing into the filtered one: assigning a
# column on a slice of another DataFrame raises pandas' SettingWithCopyWarning
user_data = user_data.assign(
    email=user_data["email"].apply(lambda email: sanetize_email(pattern=email_regex, email=email))
)

Prepare the user insertion schema. For instance:

In [15]:
%%capture
# Illustrative payload only: the dictionary is evaluated but never assigned.
# NOTE(review): %%capture appears to be here just to keep the example from
# being echoed as the cell output — confirm.
{
    "role": 3,
    "email": "example@example.com",
    "fullname": "John Doe",
    "groups": []
}

Group all user categories, subcategories and status kind per user

In [16]:
def _resolve_token(raw_value, tokens: dict, aliases: dict, kind: str, user_name: str) -> str:
    """Translate a legacy identifier into the canonical spelling stored in ``tokens``."""
    # Normalize the same way the token keys are built: stripped and upper-cased
    key = str(raw_value).strip().upper()
    # Aliases are applied on the normalized form so " in " or "tk" are also recognized
    key = aliases.get(key, key)
    if key not in tokens:
        raise KeyError(f"Unknown {kind} {raw_value!r} for user {user_name!r}")
    return tokens[key]


def parse_validators_schema(user_data: pandas.DataFrame) -> dict:
    """
    Group the categories, subcategories and status kinds of every validator.

    Args:
        user_data: One row per user right, with the columns ``user_name``,
            ``email``, ``category``, ``subcategory`` and ``status_kind``.

    Returns:
        A dictionary keyed by username. Each value follows the user insertion
        schema: ``role`` (always 2), ``email`` (taken from the first row seen
        for the user), ``fullname`` (always ``None``) and ``groups``, a list
        of ``"<category>.<subcategory>.<group>"`` identifiers in row order.

    Raises:
        KeyError: If a category, subcategory or status kind is not found in
            the token tables (after applying the legacy aliases).
    """
    schema = {
        "role": 2,
        "email": "",
        # Sometimes, we could set this by parsing the email.
        # For this moment, just left it empty
        "fullname": None,
        "groups": []
    }
    # Special cases: legacy identifiers that differ from the token keys
    category_aliases = {"IN": "HIN"}
    group_aliases = {"TK": "TRACKER"}

    validators = {}
    for row in user_data.to_dict(orient="records"):
        current_user_name = row["user_name"]
        current_user = validators.get(current_user_name)
        if current_user is None:
            # First row for this user: start from a fresh copy of the schema
            current_user = deepcopy(schema)
            current_user["email"] = row["email"]
            validators[current_user_name] = current_user

        # Retrieve the proper identifiers using the available tokens
        category = _resolve_token(
            row["category"], CATEGORIES_TOKENS, category_aliases, "category", current_user_name
        )
        subcategory = _resolve_token(
            row["subcategory"], SUBCATEGORIES_TOKENS, {}, "subcategory", current_user_name
        )
        status_kind = _resolve_token(
            row["status_kind"], GROUPS_TOKENS, group_aliases, "status kind", current_user_name
        )

        # Build the group identifier
        current_user["groups"].append(f"{category}.{subcategory}.{status_kind}")

    return validators

In [17]:
# Collapse the validator rows into one insertion payload per username
user_data_parsed = parse_validators_schema(user_data)

Insert all validator users

In [18]:
valdb_new_url = credentials["new"]["host_url"]
valdb_users = f"{valdb_new_url}/api/users/"
# username -> {"body": payload sent, "response": server reply (None if the
# request itself failed)}, plus "error" with the exception in that last case
valdb_validator_mig_errors = {}
for username, info in user_data_parsed.items():
    try:
        # Without a timeout, requests waits indefinitely on an unresponsive server
        response = requests.post(url=valdb_users, json=info, cookies=cookies, timeout=60)
    except requests.RequestException as error:
        # Record the failure and carry on, so one network error does not abort
        # the migration half-way without leaving a trace of who was skipped
        valdb_validator_mig_errors[username] = {
            "body": info,
            "response": None,
            "error": error
        }
        continue
    if response.status_code != 200:
        valdb_validator_mig_errors[username] = {
            "body": info,
            "response": response
        }

Display errors if they happened

In [19]:
if not valdb_validator_mig_errors:
    print("Validator user migration successfully")
else:
    for user, report in valdb_validator_mig_errors.items():
        print(f"Issues migrating user: {user} information")
        print("Request body sent")
        print(report["body"])
        # A Response object only prints as "<Response [code]>": show the
        # status code and the body returned by the server instead
        server_reply = report.get("response")
        if server_reply is not None:
            print(f"Response received: HTTP {server_reply.status_code}")
            print(server_reply.text)
        # Present only when the request itself could not be completed
        if report.get("error") is not None:
            print(f"Request error: {report['error']!r}")

Validator user migration successfully


### Administrators

In [20]:
# Selects every user flagged as administrator (ADMIN = 1) in the USERS table
admins_get_query = """
SELECT U.USER_NAME, U.EMAIL, U.ADMIN, U.VALIDATOR
FROM CMS_PDMV_VAL.USERS U
WHERE U.ADMIN = 1
"""

In [21]:
def parse_administrator_schema(user_data: pandas.DataFrame) -> dict:
    """
    Build the insertion payload of every administrator.

    Args:
        user_data: One row per administrator, with the columns ``user_name``,
            ``email`` and ``fullname``.

    Returns:
        A dictionary keyed by username. Each value follows the user insertion
        schema: ``role`` (always 1), ``email``, ``fullname`` and an empty
        ``groups`` list. When a username appears more than once, the first
        row wins. Missing values (``None``/``NaN``) keep the schema default
        (``""`` for the email, ``None`` for the full name).
    """
    schema = {
        "role": 1,
        "email": "",
        "fullname": None,
        "groups": []
    }
    admins = {}
    for row in user_data.to_dict(orient="records"):
        current_user_name = row["user_name"]
        if current_user_name in admins:
            # Already registered from a previous row
            continue

        current_user = deepcopy(schema)
        for field in ("email", "fullname"):
            value = row[field]
            # Empty CSV cells are read as NaN, which is not valid JSON:
            # leave the schema default in place for them
            if not pandas.isna(value):
                current_user[field] = value
        admins[current_user_name] = current_user

    return admins

In [22]:
# Run the administrators query and load the result set into a DataFrame
admins_data: pandas.DataFrame = pandas.read_sql(sql=admins_get_query, con=engine)

In [23]:
# Curated administrators list, read from the CSV file configured for the new system
admins_sanetized_data: pandas.DataFrame = pandas.read_csv(credentials["new"]["admins_list_path"])

The sanitized admin data has the following attributes:

1. user_name: CERN Username
2. email: Primary alias email registered at CERN
3. fullname: User fullname registered at CERN

Fetch current admin logins

In [24]:
# Usernames of the administrators returned by the old database
admins_data_logins = [*admins_data["user_name"]]

Retrieve the active subset

In [25]:
# Keep only the curated rows whose username is among the logins fetched from the old database
admins_sanetized_data = admins_sanetized_data.loc[
    admins_sanetized_data["user_name"].isin(admins_data_logins)
]

Parse the data

In [26]:
# Turn the curated administrator rows into one insertion payload per username
admin_data_parsed = parse_administrator_schema(admins_sanetized_data)

Insert all admins

In [27]:
valdb_new_url = credentials["new"]["host_url"]
valdb_users = f"{valdb_new_url}/api/users/"
# username -> {"body": payload sent, "response": server reply (None if the
# request itself failed)}, plus "error" with the exception in that last case
valdb_admins_mig_errors = {}
for username, info in admin_data_parsed.items():
    try:
        # Without a timeout, requests waits indefinitely on an unresponsive server
        response = requests.post(url=valdb_users, json=info, cookies=cookies, timeout=60)
    except requests.RequestException as error:
        # Record the failure and carry on, so one network error does not abort
        # the migration half-way without leaving a trace of who was skipped
        valdb_admins_mig_errors[username] = {
            "body": info,
            "response": None,
            "error": error
        }
        continue
    if response.status_code != 200:
        valdb_admins_mig_errors[username] = {
            "body": info,
            "response": response
        }

Display errors if they happened

In [28]:
if not valdb_admins_mig_errors:
    print("Administrator user migration successfully")
else:
    for user, report in valdb_admins_mig_errors.items():
        print(f"Issues migrating user: {user} information")
        print("Request body sent")
        print(report["body"])
        # A Response object only prints as "<Response [code]>": show the
        # status code and the body returned by the server instead
        server_reply = report.get("response")
        if server_reply is not None:
            print(f"Response received: HTTP {server_reply.status_code}")
            print(server_reply.text)
        # Present only when the request itself could not be completed
        if report.get("error") is not None:
            print(f"Request error: {report['error']!r}")

Administrator user migration successfully
