In [None]:
# ## ETL preprocessing and loading into database

# %%
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import jenkins
import getpass
from jenkins_lib import *
from robot_lib import *
from jenkins_robot_etl import *
import json
from sqlalchemy import create_engine

In [None]:
# 0. Input parameters

# %%
# Default values; the environment-variable lookups further down may override most of them
inputs_folder = "inputs"
outputs_folder = "etl_outputs"
url_jenkins_server = "https://osm.etsi.org/jenkins"
input_robot_file = "output.xml"

# The default database is a SQLite file stored inside the outputs folder
database_uri = "sqlite:///" + outputs_folder + "/test_executions.db"

# Names of the database tables
table_known_builds = "builds_info"
table_robot_reports = "robot_reports"
table_robot_reports_extended = "robot_reports_extended"

dump_all_as_spreadsheets = False

# %% [markdown]
# Tries to bulk load credentials and other environment variables from .env file:

# %%
# If the '.env' file exists, loads the environment variables
load_dotenv();


# %%
def _env_or(variable, fallback):
    """Return environment variable `variable`, or `fallback` when it is unset or empty."""
    return os.environ.get(variable) or fallback


# Jenkins credentials come from the environment; the user is only prompted
# for the ones that are missing (the prompts are evaluated lazily).
username = os.environ.get('JENKINS_USER') or input('Username: ')
password = os.environ.get('JENKINS_PASS') or getpass.getpass()

# Remaining settings: the environment wins over the defaults defined above
url_jenkins_server = _env_or('URL_JENKINS_SERVER', url_jenkins_server)
database_uri = _env_or('DATABASE_URI', database_uri)
inputs_folder = _env_or('INPUTS_FOLDER', inputs_folder)
outputs_folder = _env_or('OUTPUTS_FOLDER', outputs_folder)
input_robot_file = _env_or('INPUT_ROBOT_FILE', input_robot_file)
table_known_builds = _env_or('TABLE_KNOWN_BUILDS', table_known_builds)
table_robot_reports = _env_or('TABLE_ROBOT_REPORTS', table_robot_reports)
table_robot_reports_extended = _env_or('TABLE_ROBOT_REPORTS_EXTENDED', table_robot_reports_extended)

In [None]:
# 2. Populates the database with all builds from a set of relevant jobs

# %%
def json_list_from_env(variable, default):
    """Read a JSON list from environment variable `variable`.

    Returns `default` when the variable is unset or empty. Otherwise the value is
    parsed with `json.loads` after removing surrounding whitespace and any single
    quotes that wrap the whole value (e.g. `'["a", "b"]'`). Apostrophes inside the
    JSON strings are preserved.

    Raises `json.JSONDecodeError` when the value is not valid JSON.
    """
    raw_value = os.environ.get(variable)
    if not raw_value:
        return default
    # Strip only the wrapping quotes: removing every "'" would corrupt values
    # such as ["it's"].
    return json.loads(raw_value.strip().strip("'"))


# Prefix shared by all the Jenkins jobs of interest
job_ids_prefix = os.environ.get('JOB_IDS_PREFIX') or 'osm-stage_3-merge/'

# Job identifiers (appended to the prefix) and their human-readable names
job_ids = json_list_from_env(
    'JOB_IDS',
    ['master', 'v17.0', 'v16.0', 'v15.0', 'v14.0']
)
job_names = json_list_from_env(
    'JOB_NAMES',
    ['Master branch', 'Release SEVENTEEN', 'Release SIXTEEN', 'Release FIFTEEN', 'Release FOURTEEN']
)

# Full Jenkins job paths
relevant_jobs = [job_ids_prefix + job_id for job_id in job_ids]


# %%
# Opens the client for the Jenkins server with the credentials gathered above
jenkins_credentials = {'username': username, 'password': password}
server = jenkins.Jenkins(url_jenkins_server, **jenkins_credentials)
#------------------------------

In [None]:
# Sanity check: shows the Jenkins client object created above
print(server)

In [None]:
# Shows which database will be used. The URI may come from the DATABASE_URI
# environment variable and can embed a password, so it is rendered with the
# password masked instead of being printed verbatim into the notebook output.
from sqlalchemy.engine import make_url

print(make_url(database_uri).render_as_string(hide_password=True))

In [None]:
# %%
# Database setup: creates the SQLAlchemy engine for `database_uri`
engine = create_engine(database_uri)

In [None]:
# Lists the jobs whose builds are candidates for ingestion
jobs_listing = ', '.join(relevant_jobs)
print(f"Getting new builds from: {jobs_listing}")

## Here the original loop over all relevant jobs is replaced by processing a single job, step by step

In [None]:
# for job in relevant_jobs:
#     ingest_update_all_jenkins_job(
#         jenkins_server=server,
#         job_name=job,
#         database_engine=engine,
#         robot_report=os.path.join(
#             inputs_folder,
#             input_robot_file
#         ),
#         table_known_builds=table_known_builds,
#         table_robot_reports=table_robot_reports,
#         table_robot_reports_extended=table_robot_reports_extended
#     )
#     break

In [None]:
# Only the first of the relevant jobs is processed in this unrolled version of the loop
job = relevant_jobs[0]

In [None]:
# Binds the names used by the following cells; they mirror the keyword arguments
# of `ingest_update_all_jenkins_job` in the commented-out loop above.
jenkins_server, job_name, database_engine = server, job, engine
robot_report = os.path.join(inputs_folder, input_robot_file)
# `table_known_builds`, `table_robot_reports` and `table_robot_reports_extended`
# are reused under their existing names, so they need no re-binding.

In [None]:
# Shows the full name of the Jenkins job being processed
print(job)

In [None]:
import pandas as pd
import numpy as np
import jenkins
from jenkins_lib import *
from robot_lib import *
from sqlalchemy import create_engine
import warnings

In [None]:
# Loads the historical data about former builds from the database. When the table
# cannot be read (NameError or ValueError), an empty dataframe with the expected
# columns is bootstrapped instead.
known_builds_columns = [
    'job', 'build', 'timestamp', 'duration',
    'build_result', 'test_result', 'pass_count', 'fail_count',
]
try:
    with database_engine.connect() as connection:
        df_known_builds = pd.read_sql_table(table_known_builds, con=connection)
except (NameError, ValueError):
    df_known_builds = pd.DataFrame(columns=known_builds_columns)

In [None]:
# Retrieves from Jenkins a fresh list of builds of the job. The result is used
# below as a dataframe with a `number` column holding the build numbers.
# (`get_all_job_builds` comes from one of the star-imported project modules.)
df_builds_of_job = get_all_job_builds(jenkins_server, job_name)

In [None]:
# Builds listed by Jenkins but not yet recorded for this job are the ones to add
is_this_job = df_known_builds['job'] == job_name
known_builds = df_known_builds.loc[is_this_job, 'build'].tolist()
jenkins_builds = df_builds_of_job['number'].tolist()
new_builds = np.setdiff1d(jenkins_builds, known_builds)

In [None]:
# Books one row per build that Jenkins lists but the database does not know yet.
# Only `job` and `build` are filled here; the remaining fields are completed
# later, once each build has been retrieved.
builds_info_dtypes = {
    'job': 'object',
    'build': 'object',
    'timestamp': 'datetime64[ns]',
    'duration': 'float64',
    'build_result': 'object',
    'test_result': 'object',
    'pass_count': 'float64',
    'fail_count': 'float64',
}
df_unknown_builds = pd.DataFrame(columns=list(builds_info_dtypes))
df_unknown_builds['build'] = new_builds
df_unknown_builds['job'] = job_name
df_unknown_builds['timestamp'] = pd.to_datetime(df_unknown_builds.timestamp)

# All-NA columns are dropped before concatenating, which avoids the pandas
# FutureWarning about concatenation with empty or all-NA entries.
df_known_builds = pd.concat(
    [
        df.dropna(axis=1, how='all') for df in [df_known_builds, df_unknown_builds]
    ],
    ignore_index=True
)

# `dropna(axis=1, how='all')` removes every column of an empty dataframe and any
# column without a single value, so on a fresh database the result would lack
# `build_result`, `timestamp`, etc. and the following cells would fail. Restores
# the missing columns, empty, with a dtype able to hold the values set later.
for column_name, column_dtype in builds_info_dtypes.items():
    if column_name not in df_known_builds.columns:
        df_known_builds[column_name] = pd.Series(
            index=df_known_builds.index,
            dtype=column_dtype
        )

In [None]:
# Empty accumulators for the report rows gathered while looping over the builds
report_columns = [
    'job', 'build', 'id', 'name', 'source', 'status', 'starttime', 'endtime',
    'pass', 'fail', 'failed_test_id', 'failed_test_name', 'failed_keyword',
]
report_details_columns = [
    'job', 'build', 'suite_id', 'suite_name', 'test_id', 'test_name',
    'keyword_name', 'status', 'starttime', 'endtime',
]
df_new_build_reports = pd.DataFrame(columns=report_columns)
df_new_build_reports_details = pd.DataFrame(columns=report_details_columns)

# Builds of this job whose `build_result` is still missing need to be retrieved
belongs_to_job = df_known_builds['job'] == job_name
lacks_build_result = df_known_builds['build_result'].isna()
builds_with_missing_info = df_known_builds.loc[belongs_to_job & lacks_build_result, 'build'].tolist()

In [None]:
# For every build whose information is still missing: fetches its summary from
# Jenkins, records it in `df_known_builds`, downloads its Robot report (when there
# is one) and accumulates the report rows in `df_new_build_reports` and
# `df_new_build_reports_details`.
for build_number in builds_with_missing_info:
    print(f'Retrieving build {build_number} from "{job_name}"...\t', end='')

    # Boolean mask selecting the row of this build and job
    this_build_and_job = (df_known_builds.job==job_name) & (df_known_builds.build==build_number)

    # Retrieves the summary of the build itself
    build_info = get_build_summary(jenkins_server, job_name, build_number)
    # NOTE(review): a `None` result presumably means the build has not finished yet.
    # It is recorded as 'FAILURE', so `build_result` is no longer missing and the
    # build will not be selected again on later runs; confirm this is intended.
    if build_info['result'] is None:
        build_info['result'] = 'FAILURE'
    df_known_builds.loc[this_build_and_job, 'build_result'] = build_info['result']
    print(f"Build: {build_info['result']}\t", end='')
    df_known_builds.loc[this_build_and_job, 'timestamp'] = pd.to_datetime(build_info['timestamp'], unit='ms') # Jenkins timestamps are expressed in milliseconds
    df_known_builds.loc[this_build_and_job, 'duration'] = build_info['duration']

    # Retrieves the Robot report, if it exists
    try:
        robot_report_contents = get_robot_report(jenkins_server, job_name, build_number)
        # The report is saved to `robot_report`, overwriting the one of the previous build.
        # NOTE(review): `open` does not create directories, so the folder of
        # `robot_report` must already exist; verify for fresh checkouts.
        with open(robot_report, 'w', encoding='utf-8') as f:
            print(robot_report_contents, file=f)

        print('Report available: ', end='')

        # Parses the saved report into the rows to add to each database table
        df_build_report = get_consolidated_results_from_report(robot_report, with_rca=True)
        df_build_report_details = get_detailed_results_from_report(robot_report)

        # Consolidated results: columns without any value are dropped before
        # concatenating. As a consequence the `build` column may be missing
        # afterwards, which is why its existence is checked further below.
        # (See the alternative inside the `with` clause that follows.)
        df_new_build_reports = pd.concat(
            [
                df.dropna(axis=1, how='all') for df in [df_new_build_reports, df_build_report]
            ],
            ignore_index=True
        )

        # Detailed results: plain concatenation, silencing the pandas FutureWarning
        # about concatenating empty or all-NA entries.
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                category=FutureWarning,
                message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated*"
            )

            # Alternative for the consolidated results, to be used instead of the
            # `dropna`-based concatenation above if plain concatenation is preferred:
            # ---------------------------------------------------------------------
            # df_new_build_reports = pd.concat(
            #     [df_new_build_reports, df_build_report],
            #     ignore_index=True
            # )

            df_new_build_reports_details = pd.concat(
                [df_new_build_reports_details, df_build_report_details],
                ignore_index=True
            )

        # Adds the build number to the new rows of the consolidated results:
        # ensures the column exists (even if empty)...
        if 'build' not in df_new_build_reports.columns:
            df_new_build_reports['build'] = np.nan
        # ...and fills the rows that still lack a build number, i.e. the ones just appended
        df_new_build_reports.loc[:, 'build'] = (
            df_new_build_reports.loc[:, 'build']
            .astype('object')
            .infer_objects(copy=False)
            .fillna(build_number)
        )

        # Same treatment for the detailed results
        if 'build' not in df_new_build_reports_details.columns:
            df_new_build_reports_details['build'] = pd.NA
        df_new_build_reports_details.loc[:, 'build'] = (
            df_new_build_reports_details.loc[:, 'build']
            .astype('object')
            .infer_objects(copy=False)
            .fillna(build_number)
        )

        # Records the number of tests passed vs. failed
        df_known_builds.loc[this_build_and_job, 'pass_count'] = df_build_report['pass'].sum()
        df_known_builds.loc[this_build_and_job, 'fail_count'] = df_build_report['fail'].sum()

        # If any row of the report has a status different from 'PASS', the whole build is marked as 'FAIL'
        if len(df_build_report.loc[df_build_report.status!='PASS']):
            df_known_builds.loc[this_build_and_job, 'test_result'] = 'FAIL'
            print('FAIL')
        else:
            df_known_builds.loc[this_build_and_job, 'test_result'] = 'PASS'
            print('PASS')
    except jenkins.NotFoundException as e:
        # If the Robot report could not be retrieved, the build is marked as 'UNAVAILABLE'
        df_known_builds.loc[this_build_and_job, 'test_result'] = 'UNAVAILABLE'
        print('Report unavailable')

In [None]:
# Every row gathered above belongs to the job being processed, so any row still
# lacking a `job` value receives its name.
if 'job' not in df_new_build_reports.columns:
    # Ensures the column exists, even if empty
    df_new_build_reports['job'] = pd.NA
report_jobs = df_new_build_reports['job'].astype('object').infer_objects(copy=False)
df_new_build_reports.loc[:, 'job'] = report_jobs.fillna(job_name)

In [None]:
# Same as for the consolidated results: the detailed rows that still lack a
# `job` value receive the name of the job being processed.
if 'job' not in df_new_build_reports_details.columns:
    # Ensures the column exists, even if empty
    df_new_build_reports_details['job'] = pd.NA
details_jobs = df_new_build_reports_details['job'].astype('object').infer_objects(copy=False)
df_new_build_reports_details.loc[:, 'job'] = details_jobs.fillna(job_name)

In [None]:
# Fixes the data types of both report dataframes: integer build numbers and
# categorical statuses.
for report_frame in (df_new_build_reports, df_new_build_reports_details):
    report_frame['build'] = report_frame['build'].astype('int')
    report_frame['status'] = report_frame['status'].astype('category')

# Known builds: categorical results and floating-point counters
for result_column in ('build_result', 'test_result'):
    df_known_builds[result_column] = df_known_builds[result_column].astype('category')
for counter_column in ('pass_count', 'fail_count'):
    df_known_builds[counter_column] = df_known_builds[counter_column].astype('float')

In [None]:
# Previews the last rows of each dataframe, separated by a blank line
frames_to_preview = (df_known_builds, df_new_build_reports, df_new_build_reports_details)
for position, frame in enumerate(frames_to_preview):
    if position:
        print()
    display(frame.tail())

In [None]:
# Summarises the columns and dtypes of each dataframe. `DataFrame.info()` prints
# its report itself and returns None, so it is called directly: wrapping it in
# `display()` only added a stray "None" after each summary.
df_known_builds.info()
df_new_build_reports.info()
df_new_build_reports_details.info()

## New code: type conversions and loading of the results into the database

In [None]:
from sqlalchemy import text
from sqlalchemy.types import BigInteger, String, Float, DateTime, Integer

In [None]:
# Converts the `category` columns of each dataframe back to plain Python objects
# before writing them to the database. `astype(object)` is used rather than
# `astype(str)`: the latter turns missing values into the literal text 'nan',
# which would be stored instead of NULL and would no longer be detected by
# `build_result.isna()` on a later run.
## df_known_builds
df_known_builds["build_result"] = df_known_builds["build_result"].astype(object)
df_known_builds["test_result"] = df_known_builds["test_result"].astype(object)
## df_new_build_reports
df_new_build_reports["status"] = df_new_build_reports["status"].astype(object)
## df_new_build_reports_details
df_new_build_reports_details["status"] = df_new_build_reports_details["status"].astype(object)

In [None]:
# Removes the `auto_id` column, when present, so that the database generates it automatically
df_known_builds = df_known_builds.drop(columns=['auto_id'], errors='ignore')

In [None]:
# Length used for free-text columns: TEXT in MySQL allows up to 65535 bytes
text_column_length = 65535

# Column types for `builds_info`
dtype_known_builds = {
    "job": String(text_column_length),
    "build": BigInteger(),
    "timestamp": DateTime(),
    "duration": BigInteger(),
    "build_result": String(text_column_length),
    "test_result": String(text_column_length),
    "pass_count": Float(),
    "fail_count": Float(),
}

# Column types for `robot_reports`
dtype_robot_reports = {
    "job": String(text_column_length),
    "build": BigInteger(),
    "id": String(text_column_length),
    "name": String(text_column_length),
    "source": String(text_column_length),
    "status": String(text_column_length),
    "starttime": DateTime(),
    "endtime": DateTime(),
    "pass": Integer(),
    "fail": Integer(),
    "failed_test_id": String(text_column_length),
    "failed_test_name": String(text_column_length),
    "failed_keyword": String(text_column_length),
}

# Column types for `robot_reports_extended`
dtype_robot_reports_extended = {
    "job": String(text_column_length),
    # `build` was reported as `object` in df_new_build_reports_details, hence varchar(255).
    # NOTE(review): the dtype-fixing cell casts that column to int; confirm whether
    # BigInteger (as in the other two tables) would be the right type here.
    "build": String(255),
    "suite_id": String(text_column_length),
    "suite_name": String(text_column_length),
    "test_id": String(text_column_length),
    "test_name": String(text_column_length),
    "keyword_name": String(text_column_length),
    "status": String(text_column_length),
    "starttime": DateTime(),
    "endtime": DateTime(),
}

In [None]:
# Persists the three dataframes within a single transaction.
#
# `builds_info` is rebuilt from scratch on every run (the dataframe holds both the
# historical and the new builds) with an auto-incremented `auto_id` primary key.
# `robot_reports` and `robot_reports_extended` only receive the rows gathered in
# this run, which are appended to whatever they already contain.
#
# The DDL depends on the database backend: the default `database_uri` is SQLite,
# which rejects MySQL's `AUTO_INCREMENT` / `ENGINE=InnoDB` syntax.
#
# NOTE(review): MySQL commits implicitly on DDL statements, so on that backend a
# failed insert presumably cannot roll back the DROP/CREATE of `builds_info`;
# confirm this risk is acceptable.

# Definition of the `auto_id` column and trailing table options, per SQLAlchemy dialect
builds_table_ddl_variants = {
    'mysql': ('auto_id BIGINT PRIMARY KEY AUTO_INCREMENT', ' ENGINE=InnoDB'),
    'mariadb': ('auto_id BIGINT PRIMARY KEY AUTO_INCREMENT', ' ENGINE=InnoDB'),
    'sqlite': ('auto_id INTEGER PRIMARY KEY AUTOINCREMENT', ''),
}


def _rows_per_insert(frame, dialect_name):
    """Return the `chunksize` to use when inserting `frame` with `method='multi'`.

    SQLite limits the number of bound parameters per statement (999 by default in
    versions before 3.32), which a multi-row INSERT of a large dataframe exceeds,
    so rows are sent in chunks that stay below that limit. Other backends return
    None, i.e. all the rows are sent in one batch.
    """
    if dialect_name != 'sqlite':
        return None
    return max(1, 999 // max(1, len(frame.columns)))


with database_engine.begin() as conn:
    dialect_name = conn.dialect.name
    # The table name is configurable through the environment: quote it when needed
    builds_table = conn.dialect.identifier_preparer.quote(table_known_builds)

    # Deletes and re-creates the `builds_info` table with the original schema and
    # an auto-incremented key
    conn.execute(text(f"DROP TABLE IF EXISTS {builds_table}"))
    if dialect_name in builds_table_ddl_variants:
        auto_id_column, table_options = builds_table_ddl_variants[dialect_name]
        conn.execute(text(f"""
            CREATE TABLE {builds_table} (
                {auto_id_column},
                job TEXT,
                build BIGINT,
                timestamp DATETIME,
                duration BIGINT,
                build_result TEXT,
                test_result TEXT,
                pass_count DOUBLE,
                fail_count DOUBLE
            ){table_options}
        """))
    # For any other backend the table is created by pandas from
    # `dtype_known_builds` (without `auto_id`) on the first insert below.

    # The dataframes carry no `auto_id` column, so the database assigns it.
    # All three tables are written with `append`.
    tables_to_write = (
        (df_known_builds, table_known_builds, dtype_known_builds),
        (df_new_build_reports, table_robot_reports, dtype_robot_reports),
        (df_new_build_reports_details, table_robot_reports_extended, dtype_robot_reports_extended),
    )
    for frame, table_name, column_types in tables_to_write:
        frame.to_sql(
            name=table_name,
            con=conn,
            if_exists='append',
            index=False,
            dtype=column_types,
            method='multi',
            chunksize=_rows_per_insert(frame, dialect_name)
        )

In [None]:
# Signals that the whole notebook ran to completion
print("DONE")