# Analysis of Robot reports from OSM Jenkins (Step 1)
 
## ETL preprocessing and loading into database

In [None]:
import os
import pandas as pd
import numpy as np
import jenkins
import getpass
from jenkins_lib import *
from robot_lib import *
from jenkins_robot_etl import *
from sqlalchemy import create_engine

## 0. Input parameters

In [None]:
# Folder where the downloaded Robot report is saved (see `input_robot_file`)
inputs_folder = 'inputs'
# Folder that holds the database file and the CSV/XLSX dumps
outputs_folder = 'etl_outputs'
# Base URL of the Jenkins server
url_jenkins_server = 'https://osm.etsi.org/jenkins'
# File name used to save the Robot report inside `inputs_folder`
input_robot_file = 'output.xml'
# SQLAlchemy URI of the database: a SQLite file inside `outputs_folder`
database_uri = f'sqlite:///{outputs_folder}/test_executions.db'
# Names of the database tables written by this notebook
table_known_builds = 'builds_info'
table_robot_reports = 'robot_reports'
table_robot_reports_extended = 'robot_reports_extended'
# When True, the two report tables are also dumped in full as CSV/XLSX
dump_all_as_spreadsheets = False
# Jenkins job analysed in sections 1-3 (switch by swapping the commented line)
#job_name = 'osm-stage_3-merge/v9.0'
job_name = 'osm-stage_3-merge/master'

Credentials:

In [None]:
def load_env_file(path='.env'):
    """Load `KEY=VALUE` pairs from an environment file into `os.environ`.

    Blank lines and lines starting with `#` are skipped. Each remaining line
    is split on its first `=` only, so values may themselves contain `=`.
    Whitespace around keys and values is removed. Lines without a key or
    without `=` are reported by line number (their content is not printed,
    since it may be a secret) and ignored.

    If the file does not exist, a message is printed and nothing is loaded.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for line_number, raw_line in enumerate(f, start=1):
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue
                key, separator, value = line.partition('=')
                key = key.strip()
                if not separator or not key:
                    print(f"Ignoring malformed line {line_number} in '{path}'")
                    continue
                os.environ[key] = value.strip()
    except FileNotFoundError:
        print(f"Environment file ('{path}') does not exist. Skipping...")


# If the '.env' file exists, loads the environment variables
load_env_file('.env')

In [None]:
# Credentials are taken from the environment when set (and non-empty);
# otherwise the user is prompted interactively
env_user = os.environ.get('JENKINS_USER')
env_pass = os.environ.get('JENKINS_PASS')
username = env_user if env_user else input('Username: ')
password = env_pass if env_pass else getpass.getpass()

## 1. Retrieval of Jenkins jobs info and Robot reports

Opens session with the Jenkins server:

In [None]:
# Jenkins client for `url_jenkins_server`, using the credentials read earlier
server = jenkins.Jenkins(url_jenkins_server, username=username, password=password)

Tests the connection to the Jenkins server:

In [None]:
# Helper brought in by the wildcard imports at the top of the notebook
# NOTE(review): presumably reports or raises when the server cannot be reached — confirm in its definition
test_jenkins_connection(server)

### 1.1 Jobs in the Jenkins server

Retrieves the list of jobs that exist in the Jenkins server:

In [None]:
# The result is not stored: as the last expression of the cell it is only displayed
get_all_jenkins_jobs_as_df(server)

### 1.2 Analysis of a specific job (`job_name`), e.g. the `master` or `v9.0` testing job

In [None]:
# Summary of the job; its 'lastCompletedBuild_number' entry is used later to pick a build
my_job_status = get_job_summary(server, job_name)
my_job_status

Health report of the job:

In [None]:
# Health information of the job, as returned by the `get_job_health` helper
health = get_job_health(server, job_name)
health

### 1.3 Analysis of builds of the reference job (`job_name`)

List of historical builds of the job:

In [None]:
# Dataframe of the builds of the job; its `number` column holds the build numbers
df_builds_of_job = get_all_job_builds(server, job_name)
df_builds_of_job

In [None]:
# Prints the build numbers as a plain Python list
print(df_builds_of_job.number.tolist())

Retrieves all the information about a specific build:

In [None]:
# We want the info of latest complete build
build_number = my_job_status["lastCompletedBuild_number"]
# Uncomment to inspect a specific build instead
#build_number = 985

# Summary of the selected build (displayed as the cell output)
my_build_summary = get_build_summary(server, job_name, build_number)
my_build_summary

### 1.4 Retrieval of Robot results of the latest completed build of the reference job (`job_name`)

In [None]:
# Downloads the Robot report of the selected build
try:
    robot_report_contents = get_robot_report(server, job_name, build_number)
#except requests.HTTPError:
except jenkins.NotFoundException:
    # NOTE(review): `robot_report_contents` is not assigned on this path, so the cell that
    # saves it would fail with NameError (or reuse a value from an earlier run) — confirm intended
    print(f'Build {build_number} in job {job_name} did not issue any Robot report.')

## 2. Imports info from Robot test report and cleans data

In [None]:
# Saves the downloaded report to a local file, so that it can be parsed afterwards
robot_report = os.path.join(inputs_folder, input_robot_file)
# Opening a file for writing does not create missing folders
os.makedirs(inputs_folder, exist_ok=True)
with open(robot_report, 'w', encoding='utf-8') as f:
    print(robot_report_contents, file=f)

### 2.1 Numerical statistics


In [None]:
# Parses the saved report file into a dataframe of statistics
df_test_stats = get_stats_from_report(robot_report)
df_test_stats

In [None]:
# Column dtypes and non-null counts
df_test_stats.info()

### 2.2 Results per test suite

In [None]:
# Parses the saved report file into a dataframe of results
df_test_suites = get_results_from_report(robot_report)
df_test_suites

In [None]:
# Column dtypes and non-null counts
df_test_suites.info()

### 2.3 Stats and results per test suite (consolidated)

In [None]:
# Consolidated view of the report (called here without the `with_rca` option used later)
df_consolidated_test_results = get_consolidated_results_from_report(robot_report)
df_consolidated_test_results

### 2.4 Details of the test suites up to the level of keyword

In [None]:
# Detailed view of the report; the cells below rely on its `status` and `suite_id` columns
df_tests_and_keywords = get_detailed_results_from_report(robot_report)
df_tests_and_keywords

In [None]:
# Column dtypes and non-null counts
df_tests_and_keywords.info()

Finds the first failure per test suite (which is the most likely root cause):

In [None]:
# Keeps only the failed rows, then takes the first entry of each suite
# NOTE(review): `GroupBy.first()` picks the first non-null value of every column
# independently, so a result row may mix values from several rows if the data has nulls — confirm
failed_rows = df_tests_and_keywords[df_tests_and_keywords['status'] == 'FAIL']
df_root_cause_errors = failed_rows.groupby('suite_id').first()
df_root_cause_errors

In [None]:
# Column dtypes and non-null counts
df_root_cause_errors.info()

### 2.5 Enriches consolidated results with likely root cause of failures

In [None]:
# Same helper as in 2.3, now called with `with_rca=True`
consolidated_results_from_report = get_consolidated_results_from_report(robot_report, with_rca=True)
consolidated_results_from_report

In [None]:
# Column dtypes and non-null counts
consolidated_results_from_report.info()

## 3. Populates a database with data from all builds of a job

In [None]:
#job_name = 'osm-stage_3-merge/v9.0'
#job_name = 'osm-stage_3-merge/master'

Database connection setup:

In [None]:
# Database setup
# The database is a SQLite file inside `outputs_folder`; SQLite does not create
# missing folders, so the folder is created first if needed
os.makedirs(outputs_folder, exist_ok=True)
engine = create_engine(database_uri)

If there is historical data about former builds of this job, it is retrieved first (otherwise, it should return an empty dataframe):

In [None]:
# Loads the table of already-known builds from the database
try:
    with engine.connect() as connection:
        df_known_builds = pd.read_sql_table(table_known_builds, con=connection)

    # Fixes data types
    #df_known_builds['duration'] = pd.to_timedelta(df_known_builds.duration, unit='ns')  # 'ns' is the unit in SQLAlchemy
# NOTE(review): ValueError is presumably what `read_sql_table` raises for a missing table;
# it is unclear which statement is expected to raise NameError — confirm
except (NameError, ValueError) as e:   # If it does not exist, bootstraps a new dataframe
    df_known_builds = pd.DataFrame(columns=['job', 'build', 'timestamp', 'duration', 'build_result', 'test_result', 'pass_count', 'fail_count'])
df_known_builds

In [None]:
# Column dtypes and non-null counts of the historical data
df_known_builds.info()

In [None]:
# Check that `duration` is convertible to `timedelta` format whenever needed:
# (the converted values are only displayed; `df_known_builds` is not modified)

pd.to_timedelta(df_known_builds.duration.astype(float), unit='ms')  # 'ms' is the unit in Jenkins
#pd.to_timedelta(df_known_builds.duration, unit='ns')  # 'ns' is the unit in SQLAlchemy

Retrieves from Jenkins a fresh list of builds of the job:

In [None]:
# Queries Jenkins again, so that the comparison below uses an up-to-date list
df_builds_of_job = get_all_job_builds(server, job_name)
df_builds_of_job

Compares the fresh list with the historical one and determines which builds we need to add to our database:

In [None]:
# Build numbers of this job that are already in the historical data
known_builds = df_known_builds.loc[df_known_builds.job==job_name, 'build'].tolist()
# Build numbers currently listed by Jenkins
jenkins_builds = df_builds_of_job.loc[:, 'number'].tolist()
# Builds listed by Jenkins but not known yet (`setdiff1d` returns sorted, unique values)
new_builds = np.setdiff1d(jenkins_builds, known_builds)
new_builds

Creates a new dataframe with one placeholder row per new build and appends it to the original one, reserving the rows where the data will be saved afterwards:

In [None]:
# Same column list as the bootstrapped `df_known_builds`; one row per new build
df_unknown_builds = pd.DataFrame(columns=['job', 'build', 'timestamp', 'duration', 'build_result', 'test_result', 'pass_count', 'fail_count'])
df_unknown_builds['build'] = new_builds
df_unknown_builds['job'] = job_name
# Converts the (still unfilled) `timestamp` column to datetime
df_unknown_builds['timestamp'] = pd.to_datetime(df_unknown_builds.timestamp)
df_unknown_builds

In [None]:
# Appends the placeholder rows; `ignore_index=True` renumbers the index from 0
df_known_builds = pd.concat([df_known_builds, df_unknown_builds], ignore_index=True)
df_known_builds

In [None]:
# Column dtypes and non-null counts after appending the placeholder rows
df_known_builds.info()

Iterates to retrieve all the information from unknown builds and, if feasible, their corresponding Robot reports:

In [None]:
robot_report = os.path.join(inputs_folder, input_robot_file)
# Each downloaded report is written to `robot_report` before being parsed,
# so its folder must exist
os.makedirs(inputs_folder, exist_ok=True)

# Starts with empty dataframes
df_new_build_reports = pd.DataFrame(columns=['job', 'build', 'id', 'name', 'source', 'status', 'starttime', 'endtime', 'pass', 'fail', 'failed_test_id', 'failed_test_name', 'failed_keyword'])
df_new_build_reports_details = pd.DataFrame(columns=['job', 'build', 'suite_id', 'suite_name', 'test_id', 'test_name', 'keyword_name', 'status', 'starttime', 'endtime'])

# Builds of this job whose `build_result` has not been recorded yet
builds_with_missing_info = df_known_builds.loc[(df_known_builds.job==job_name) & (df_known_builds.build_result.isna()), 'build'].tolist()
#builds_with_missing_info = builds_with_missing_info[:5]   # Uncomment to limit the run while debugging
for build_number in builds_with_missing_info:
    print(f'Retrieving build {build_number} from "{job_name}"...\t', end='')

    # Shortcut to filter this build and job
    this_build_and_job = (df_known_builds.job==job_name) & (df_known_builds.build==build_number)

    # Retrieves the information about the build itself
    build_info = get_build_summary(server, job_name, build_number)
    df_known_builds.loc[this_build_and_job, 'build_result'] = build_info['result']
    print(f"Build: {build_info['result']}\t", end='')

    # Jenkins reports timestamps in milliseconds
    build_timestamp = pd.to_datetime(build_info['timestamp'], unit='ms')
    df_known_builds.loc[this_build_and_job, 'timestamp'] = build_timestamp
    # Formats the scalar timestamp rather than the selected dataframe rows, whose
    # string form would also include the index and dtype
    timestamp_translated = str(build_timestamp)
    print(f"{timestamp_translated}({build_info['timestamp']})\t", end='')

    # Duration is stored as reported by Jenkins (milliseconds)
    df_known_builds.loc[this_build_and_job, 'duration'] = build_info['duration']

    # Retrieves the Robot report, if it exists. Only the download is guarded, so
    # that errors while parsing a report are not mistaken for a missing report
    try:
        robot_report_contents = get_robot_report(server, job_name, build_number)
    except jenkins.NotFoundException:
        # If the Robot report could not be retrieved, it marks it as unavailable
        df_known_builds.loc[this_build_and_job, 'test_result'] = 'UNAVAILABLE'
        print('Report unavailable')
        continue

    with open(robot_report, 'w', encoding='utf-8') as f:
        print(robot_report_contents, file=f)

    print('Report available: ', end='')

    # Retrieves the rows that need to be added to the corresponding database tables, and appends them
    df_build_report = get_consolidated_results_from_report(robot_report, with_rca=True)
    df_build_report_details = get_detailed_results_from_report(robot_report)
    df_new_build_reports = pd.concat([df_new_build_reports, df_build_report], ignore_index=True)
    df_new_build_reports_details = pd.concat([df_new_build_reports_details, df_build_report_details], ignore_index=True)

    # Adds the build number to the new rows (the only ones where `build` is still empty).
    # The column is reassigned explicitly: an in-place `fillna` on `df.build` acts on an
    # intermediate object and is not guaranteed to update the dataframe
    df_new_build_reports['build'] = df_new_build_reports['build'].fillna(build_number)
    df_new_build_reports_details['build'] = df_new_build_reports_details['build'].fillna(build_number)

    # Records the number of tests passed vs. failed
    df_known_builds.loc[this_build_and_job, 'pass_count'] = df_build_report['pass'].sum()
    df_known_builds.loc[this_build_and_job, 'fail_count'] = df_build_report['fail'].sum()

    # If any test is different from 'PASS', the whole build is marked as 'FAIL'
    test_result = 'FAIL' if (df_build_report.status != 'PASS').any() else 'PASS'
    df_known_builds.loc[this_build_and_job, 'test_result'] = test_result
    print(test_result)

# All new rows should come from the same job
df_new_build_reports['job'] = df_new_build_reports['job'].fillna(job_name)
df_new_build_reports_details['job'] = df_new_build_reports_details['job'].fillna(job_name)

# Fixes the data types
df_new_build_reports['build'] = df_new_build_reports.build.astype('int')
df_new_build_reports['status'] = df_new_build_reports.status.astype('category')
df_new_build_reports_details['build'] = df_new_build_reports_details.build.astype('int')
df_new_build_reports_details['status'] = df_new_build_reports_details.status.astype('category')
df_known_builds['build_result'] = df_known_builds.build_result.astype('category')
df_known_builds['test_result'] = df_known_builds.test_result.astype('category')
df_known_builds['pass_count'] = df_known_builds.pass_count.astype('float')
df_known_builds['fail_count'] = df_known_builds.fail_count.astype('float')

In [None]:
# Shows only the builds whose `timestamp` has been filled in
df_known_builds[df_known_builds.timestamp.notna()]

In [None]:
# Column dtypes and non-null counts after the retrieval loop
df_known_builds.info()

Saves the results to the database as a single transaction:

In [None]:
# (dataframe, target table, behaviour if the table already exists)
# The builds table is rewritten in full; the report tables only receive the new rows
tables_to_write = (
    (df_known_builds, table_known_builds, 'replace'),
    (df_new_build_reports, table_robot_reports, 'append'),
    (df_new_build_reports_details, table_robot_reports_extended, 'append'),
)

# All three writes share one `engine.begin()` block, i.e. one transaction
with engine.begin() as conn:
    for frame, table, mode in tables_to_write:
        frame.to_sql(name=table, con=conn, if_exists=mode, index=False)

In addition, saves the results as .CSV and .XLSX to allow quick access:

In [None]:
# Both files share the same path stem: <outputs_folder>/<table name>
known_builds_stem = os.path.join(outputs_folder, table_known_builds)
csv_known_builds = f'{known_builds_stem}.csv'
xlsx_known_builds = f'{known_builds_stem}.xlsx'

# Semicolon-separated CSV plus an Excel workbook, both without the index column
df_known_builds.to_csv(csv_known_builds, sep=';', index=False)
df_known_builds.to_excel(xlsx_known_builds, index=False)

In [None]:
#dump_all_as_spreadsheets = True

In [None]:
if dump_all_as_spreadsheets:
    # Path stems (<outputs_folder>/<table name>) shared by each CSV/XLSX pair
    reports_stem = os.path.join(outputs_folder, table_robot_reports)
    reports_extended_stem = os.path.join(outputs_folder, table_robot_reports_extended)

    csv_robot_reports = reports_stem + '.csv'
    xlsx_robot_reports = reports_stem + '.xlsx'
    csv_robot_reports_extended = reports_extended_stem + '.csv'
    xlsx_robot_reports_extended = reports_extended_stem + '.xlsx'

    # Reads both tables back in full from the database
    with engine.begin() as conn:
        df_all_build_reports, df_all_build_reports_details = (
            pd.read_sql_table(table_name, con=conn)
            for table_name in (table_robot_reports, table_robot_reports_extended)
        )

    # Writes each table as semicolon-separated CSV and as Excel, without the index column
    spreadsheet_targets = (
        (df_all_build_reports, csv_robot_reports, xlsx_robot_reports),
        (df_all_build_reports_details, csv_robot_reports_extended, xlsx_robot_reports_extended),
    )
    for frame, csv_path, xlsx_path in spreadsheet_targets:
        frame.to_csv(csv_path, index=False, sep=';')
        frame.to_excel(xlsx_path, index=False)

## 4. Populates the database with all builds from a set of relevant jobs

In [None]:
# Jenkins jobs whose builds are ingested by the loop at the end of this section
relevant_jobs = ['osm-stage_3-merge/v9.0', 'osm-stage_3-merge/master']

In [None]:
# Connection to the Jenkins server
# (same call as in section 1, repeated here)
server = jenkins.Jenkins(url_jenkins_server, username=username, password=password)

In [None]:
# Database setup
# The database is a SQLite file inside `outputs_folder`; SQLite does not create
# missing folders, so the folder is created first if needed
os.makedirs(outputs_folder, exist_ok=True)
engine = create_engine(database_uri)

In [None]:
# The same report path is passed for every job, so it is computed once
robot_report_path = os.path.join(inputs_folder, input_robot_file)

# Runs the ingestion helper once per relevant job
for job in relevant_jobs:
    ingest_update_all_jenkins_job(
        jenkins_server=server,
        job_name=job,
        database_engine=engine,
        robot_report=robot_report_path,
    )