# Data migration

## Required imports

Remember to install all packages required inside your working virtual Python environment

In [1]:
import pandas
import sqlalchemy
import json
import numpy
import requests
import re
from copy import deepcopy
from http.cookiejar import CookieJar, MozillaCookieJar
from tqdm.notebook import tqdm
from datetime import date, datetime
from dateutil.relativedelta import relativedelta

## Oracle Client Libraries

To use **cx_Oracle connector**, it is required to install some libraries inside the execution environment: [Oracle Client Library](https://oracle.github.io/odpi/doc/installation.html#oracle-client-library-loading). Please install them and set the required environment variables before running this notebook

## Constants

Custom JSON parser

In [2]:
def parse(obj):
    """JSON serializer for objects not serializable by default json code"""
    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    if isinstance(obj, set):
        return list(obj)
    raise TypeError ("Type %s not serializable" % type(obj))

Migration campaign reference

In [3]:
DATE_REFERENCE = (datetime.now() - relativedelta(months=2)).date()

Email pattern

In [4]:
EMAIL_REGEX = r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
email_regex = re.compile(EMAIL_REGEX)

Retrieve the email address if this exists inside a string, return empty string otherwise

In [5]:
def sanetize_email(pattern: re.Pattern, email: str) -> str:
    result = pattern.search(email)
    if not result:
        return ""
    return result[0]

Groups identifiers retrieved from: [groups.py](https://github.com/cms-PdmV/ValDB2/blob/main/data/group.py)

In [6]:
uppercase = lambda str_list: [el.strip().upper() for el in str_list]
dict_token = lambda token_list: dict(zip(uppercase(token_list), token_list))

In [7]:
# Tokens for fast parsing
# Categories
CATEGORIES = ["Reconstruction", "HLT", "PAGs", "HIN", "GEN"]
CATEGORIES_TOKENS = dict_token(CATEGORIES)

# Subcategories
SUBCATEGORIES = ["Data", "FastSim", "FullSim", "Gen"]
SUBCATEGORIES_TOKENS = dict_token(SUBCATEGORIES)

# Groups
reconstruction_groups = ['Tracker', 'Ecal', 'HGcal', 'Hcal', 'CASTOR', 'DT', 'CSC', 'RPC', 'GEM',
    'MTD', 'PPS', 'L1', 'Tracking', 'Electron', 'Photon', 'Muon', 'Jet', 'MET', 'bTag', 'Tau',
    'PF'
]
hlt_groups = ['Tracking', 'Electron', 'Photon', 'Muon', 'Jet', 'MET', 'bTag', 'Tau', 'SMP',
    'Higgs', 'Top', 'Susy', 'Exotica', 'B2G', 'B', 'Fwd', 'HIN'
]
pags_groups = ['SMP', 'Higgs', 'Top', 'Susy', 'Exotica', 'B2G', 'B', 'Fwd', 'HIN']
hin_groups = ['Tracking', 'Electron', 'Photon', 'Muon', 'Jet']
gen_groups = ['GEN']

GROUPS = set(reconstruction_groups + hlt_groups + pags_groups + hin_groups + gen_groups)
GROUPS_TOKENS = dict_token(GROUPS)

Be carefull, for some reason it seems **HIN** category has two main identifiers: **HIN** & **IN** -> [Details](https://github.com/cms-PdmV/ValDB/blob/master/ajax_app.py#L489). This is the same case for **TK** group which means **TRACKER**

In [8]:
def parse_group_id(category: str, subcategory: str, status_kind: str = None) -> str:
        # Retrieve proper identifiers using 
        # the available tokens
        # Be aware of some special cases
        if category == "IN":
            category = "HIN"
        if status_kind == "TK":
            status_kind = "TRACKER"

        category = CATEGORIES_TOKENS[category.upper()]
        subcategory = SUBCATEGORIES_TOKENS[subcategory.upper()]
        if status_kind:
            status_kind = GROUPS_TOKENS[status_kind.upper()]
            return ".".join([category, subcategory, status_kind])
        else:
            return ".".join([category, subcategory])

Tokens to parse report status

In [9]:
REPORT_STATUS = {
    "NOT YET DONE": 2,
    "OK": 1,
    "OK TO BE SIGNED-OFF BY THE VALIDATORS": 1,
    "FAILURE": 3,
    "CHANGES EXPECTED": 4,
    "IN PROGRESS": 5,
    "KNOWN ISSUE": 6
}

## Working variables

In [10]:
CREDENTIALS_PATH = "./data/credentials.json"

Import database credentials

In [11]:
with open(file=CREDENTIALS_PATH, encoding="utf-8", mode="r") as cf:
    credentials = json.load(cf)

Import cookies to authenticate to send requests to the server

In [12]:
cookies = MozillaCookieJar(credentials["new"]["cookie_location"])
cookies.load()

Connection URI for SQLAlchemy

In [13]:
prod_db_user = credentials['old']['database_username']
prod_db_password = credentials['old']['database_password']
prod_db = credentials['old']['database_url']
prob_db_service = credentials['old']['database_service']
prod_db_uri = f"oracle+cx_oracle://{prod_db_user}:{prod_db_password}@{prod_db}/?service_name={prob_db_service}"

Create a SQLAlchemy Session

In [14]:
engine = sqlalchemy.create_engine(prod_db_uri, arraysize=1000)

## Retrieve data using Pandas and SQL queries

### Validators

Retrieve all user data related to validators

In [15]:
user_get_query = """
SELECT U.USER_NAME, U.EMAIL, U.ADMIN, U.VALIDATOR, UR.CATEGORY, UR.SUBCATEGORY,	UR.STATUS_KIND
FROM CMS_PDMV_VAL.USERS U, CMS_PDMV_VAL.USER_RIGHTS UR
WHERE U.USER_NAME = UR.USER_NAME AND U.ADMIN = 0 AND U.VALIDATOR = 1
"""

Execute the query using Pandas

In [16]:
user_data: pandas.DataFrame = pandas.read_sql(user_get_query, engine)

Remove all rows where the username has more than 8 characters

In [17]:
user_data = user_data[(user_data["user_name"].str.len() <= 8) & (user_data["user_name"].str.len() > 0)]

Sanetize email information

In [18]:
user_data["email"] = user_data["email"].apply(lambda email: sanetize_email(pattern=email_regex, email=email))

Prepare user insertion schema. For instance

In [19]:
%%capture
{
    "role": 3,
    "email": "example@example.com",
    "fullname": "John Doe",
    "groups": []
}

Group all user categories, subcategories and status kind per user

In [20]:
def parse_validators_schema(user_data: pandas.DataFrame) -> dict:
    validators = {}
    schema = {
        "role": 2,
        "email": "",
        # Sometimes, we could set this by parsing the email.
        # For this moment, just left it empty
        "fullname": None,
        "groups": []
    }
    user_data_json = user_data.to_dict(orient="records")
    for row in user_data_json:        
        current_user_name = row["user_name"]
        current_user = validators.get(current_user_name)
        if not current_user:
            current_user = deepcopy(schema)
            current_user["email"] = row["email"]
        
        # Build group identifier
        category = row["category"]
        subcategory = row["subcategory"]
        status_kind = row["status_kind"]
        
        # Build Group Identifier
        group_id = parse_group_id(
            category=category,
            subcategory=subcategory,
            status_kind=status_kind
        )
        current_user["groups"].append(group_id)
        
        # Persist for next iteration
        validators[current_user_name] = current_user
        
    return validators

In [21]:
user_data_parsed = parse_validators_schema(user_data=user_data)

Insert all validator users

In [22]:
valdb_new_url = credentials["new"]["host_url"]
valdb_users = f"{valdb_new_url}/api/users/"
valdb_validator_mig_errors = {}
for username, info in user_data_parsed.items():    
    response = requests.post(url=valdb_users, json=info, cookies=cookies)
    if response.status_code != 200:
        valdb_validator_mig_errors[username] = {
            "body": info,
            "response": response
        }

Display errors if they happened

In [23]:
if not valdb_validator_mig_errors:
    print("Validator user migration successfully")
else:
    for user, report in valdb_validator_mig_errors.items():
        print(f"Issues migrating user: {user} information")
        print("Request body sent")
        print(report)

Validator user migration successfully


### Administrators

In [24]:
admins_get_query = """
SELECT U.USER_NAME, U.EMAIL, U.ADMIN, U.VALIDATOR
FROM CMS_PDMV_VAL.USERS U
WHERE U.ADMIN = 1
"""

In [25]:
def parse_administrator_schema(user_data: pandas.DataFrame) -> dict:
    admins = {}
    schema = {
        "role": 1,
        "email": "",
        "fullname": None,
        "groups": []
    }
    user_data_json = user_data.to_dict(orient="records")
    for row in user_data_json:        
        current_user_name = row["user_name"]
        current_user = admins.get(current_user_name)
        if not current_user:
            current_user = deepcopy(schema)
            current_user["email"] = row["email"]
            current_user["fullname"] = row["fullname"]
        
        # Persist for next iteration
        admins[current_user_name] = current_user
        
    return admins

In [26]:
admins_data: pandas.DataFrame = pandas.read_sql(admins_get_query, engine)

In [27]:
admins_sanetized_data: pandas.DataFrame = pandas.read_csv(filepath_or_buffer=credentials["new"]["admins_list_path"])

Admin sanetized data has the following attributes

1. user_name: CERN Username
2. email: Primary alias email registered at CERN
3. fullname: User fullname registered at CERN

Fetch current admin logins

In [28]:
admins_data_logins = list(admins_data["user_name"])

Retrieve the active subset

In [29]:
admins_sanetized_data = admins_sanetized_data[admins_sanetized_data["user_name"].isin(admins_data_logins)]

Parse the data

In [30]:
admin_data_parsed = parse_administrator_schema(user_data=admins_sanetized_data)

Insert all admins

In [31]:
valdb_new_url = credentials["new"]["host_url"]
valdb_users = f"{valdb_new_url}/api/users/"
valdb_admins_mig_errors = {}
for username, info in admin_data_parsed.items():    
    response = requests.post(url=valdb_users, json=info, cookies=cookies)
    if response.status_code != 200:
        valdb_admins_mig_errors[username] = {
            "body": info,
            "response": response
        }

Display errors if they happened

In [32]:
if not valdb_admins_mig_errors:
    print("Administrator user migration successfully")
else:
    for user, report in valdb_admins_mig_errors.items():
        print(f"Issues migrating user: {user} information")
        print("Request body sent")
        print(report)

Administrator user migration successfully


### Releases

The data related to the release is splitted between the metadata and its details (status)

In [33]:
releases_query = """
SELECT *
FROM CMS_PDMV_VAL.RELEASES R, CMS_PDMV_VAL.STATUS S
WHERE R.ID = S.ID
"""

In [34]:
releases_data: pandas.DataFrame = pandas.read_sql(releases_query, engine)

Delete rows with unknown status kind

In [35]:
releases_data["status_kind"] = releases_data["status_kind"].str.upper()
releases_data = releases_data[releases_data["status_kind"].isin(GROUPS_TOKENS.keys())]

In [36]:
def parse_releases_schema(releases_data: pandas.DataFrame) -> dict:
    campaigns = {}
    campaign_schema = {
        "name": None,
        "description": "",
        "deadline": None,
        "target_release": None,
        "reference_release": None,
        "relmon": None,
        "subcategories": set(),
        "reports": [],
        "is_open": False        
    }
    report_schema = {
        "authors": [],
        "group": None,
        "campaign_name": None,
        "status": None,
        "content": ""
    }
    releases_data_json = releases_data.to_dict(orient="records")
    for row in releases_data_json:
            current_release_name = str(row["release_name"])
            current_campaign = campaigns.get(current_release_name)
            release_date = row["date"].to_pydatetime().date()
            if not current_campaign:
                current_campaign = deepcopy(campaign_schema)
                current_campaign["name"] = current_release_name
                current_campaign["target_release"] = current_release_name
                current_campaign["reference_release"] = current_release_name
                current_campaign["deadline"] = release_date
                current_campaign["relmon"] = row["relmon_url"]

            # Build the report object
            # Do not take into consideration the authors for the moment
            current_report = deepcopy(report_schema)

            # Build group identifier
            category = row["category"]
            subcategory = row["subcategory"]
            status_kind = row["status_kind"]
            
            # Build Group Identifier
            group_id = parse_group_id(
                category=category,
                subcategory=subcategory,
                status_kind=status_kind
            )

            current_report["group"] = group_id
            current_report["campaign_name"] = current_release_name
            current_report["status"] = REPORT_STATUS[row["validation_status"].strip()]

            comments = row["comments"] if row["comments"] else "No comments provided"
            links = row["links"] if row["links"] else "No links provided"
            links = "\n".join([])
            current_report["content"] = f"""
                Comments: {comments}
                Links: {links}
            """
            # Append the report and its group
            current_campaign["reports"] += [current_report]
            subcategory_id = parse_group_id(
                category=category,
                subcategory=subcategory,
            )
            current_campaign["subcategories"] = current_campaign["subcategories"].union(set([subcategory_id]))
            
            # Final checks:
            # 1. Always set the deadline with the latest available date
            # 2. If the deadline date is after some reference date, set the campaign as open
            if current_campaign["deadline"] < release_date:
                current_campaign["deadline"] = release_date

            if current_campaign["deadline"] > DATE_REFERENCE:
                current_campaign["is_open"] = True

            # Persist for next iteration
            campaigns[current_release_name] = current_campaign
        
    return campaigns

In [37]:
releases_data_parsed = parse_releases_schema(releases_data=releases_data)
releases_data_parsed = json.loads(json.dumps(releases_data_parsed, default=parse))

  releases_data_json = releases_data.to_dict(orient="records")


Insert all releases

In [38]:
valdb_new_url = credentials["new"]["host_url"]
valdb_migration_campaign = f"{valdb_new_url}/api/campaigns/migrate/"
valdb_campaign_mig_errors = {}
execute_migration = False
if execute_migration:
    for username, info in releases_data_parsed.items():    
        response = requests.post(url=valdb_migration_campaign, json=info, cookies=cookies)
        if response.status_code != 200:
            valdb_campaign_mig_errors[username] = {
                "body": info,
                "response": response
            }

if not valdb_campaign_mig_errors:
    print("Releases migrated successfully")
else:
    for release, report in valdb_campaign_mig_errors.items():
        print(f"Issues migrating release: {release} information")
        print("Request body sent")
        print(report["response"].json())
        print(report["body"])

Releases migrated successfully
