# Scrape Risks from Confluence

"We basically want a master Risk register, which contains all the risks of the various projects, plus the program level risks"

In [1]:
import bs4
import pandas
import datetime
import os


## Set input and output paths

Links to files used in this test run:
- https://vcdi-dpc.atlassian.net/wiki/spaces/PROJ/pages/204374089/Project+Page+dummy+template+1
- https://vcdi-dpc.atlassian.net/wiki/spaces/PROJ/pages/206536717/Project+Page+dummy+template+2
- https://vcdi-dpc.atlassian.net/wiki/spaces/PM/pages/194183606/Risks

In [2]:
input_folder_path = \
"/Users/danielcorcoran/Desktop/github_repos/python_nb_data_pulling/confluence_scrape/risk data/inputs/"

output_folder_path = \
"/Users/danielcorcoran/Desktop/github_repos/python_nb_data_pulling/confluence_scrape/risk data/outputs/"


## Create list to store all tables

In [3]:
all_tables = []

## Process input folder
- Loop through each filename if it is .html continue
- Convert all tables in each filename to pandas dataframes and store in a list
- Check each table in list above if the column headers contain "Risk ID" add that table to all_tables list

In [4]:
for filename in os.listdir(input_folder_path):
    if ".html" in filename:
        print(filename,"is being processed")
        tables_list = pandas.read_html(input_folder_path + filename)
        for table in tables_list:
            column_names = list(table.columns)
            if "Risk ID" in column_names:
                all_tables.append(table)
    

risks.html is being processed
project_dummy_template_1.html is being processed
project_dummy_page_2.html is being processed


## Check shapes of all tables added to all_tables list, they should be identical in column numbers (the 2nd item in the tuple)

In [7]:
for table in all_tables:
    print(table.shape)

(3, 13)
(2, 13)
(2, 13)


## Union all the tables vertically

In [8]:
compiled_risk_table = pandas.concat(all_tables, axis = 0, sort = True)

##  Reset index of compiled risk table

In [11]:
compiled_risk_table = compiled_risk_table.reset_index(drop = True)

## Preview compiled risks table

In [12]:
compiled_risk_table

Unnamed: 0,Comments,DateLoggeddd/mm/yy,DateUpdateddd/mm/yy,Impact,Likelihood,MitigationControlsTAPs,Owner,Proj ID,Rating,Risk ID,RiskCategory,Status,Title/Description
0,,18/05/18,18/05/18,High,High,1,BP,0,9,1,Resourcing,Open,Delivery ability of the analytics team in ligh...
1,,18/05/18,18/05/18,Moderate,High,1,AF,0,6,2,Operational,Open,Delivery delays in data reform strategy
2,,18/05/18,18/05/18,Moderate,High,1,JH,0,6,3,Financial,Open,Funding
3,,18/05/18,18/05/18,High,High,abc,BP,0,9,R001,Resourcing,Open,Delivery ability of the analytics team in ligh...
4,,19/05/18,19/05/18,High,High,abcdef,BP,0,9,R002,Financial,Closed,Limited budget
5,,27/05/18,28/05/18,Medium,High,Work with stakeholders,TBC,1,6,R001,Scope,Closed,Scope creep
6,,28/05/18,29/05/18,High,Low,Extra resourcesRegular monitoring,TBC,2,8,R002,Schedule,Closed,Possible slippage


## Adjust Project ID

### A function to clean project ID 

In [10]:
def cleancode(var):
    var = str(var)
    var = var.strip()
    maxchar = len(var)
    if maxchar == 1:
        new_var = "00" + var
    elif maxchar == 2:
        new_var = "0" + var
    else:
        new_var = var
        
    return new_var

### Apply function to dataframe

In [18]:
for index in range(compiled_risk_table.shape[0]):
    temp_code = compiled_risk_table.loc[index, "Proj ID"]
    new_code = cleancode(temp_code)
    compiled_risk_table.loc[index,"Proj ID"] = new_code

000
000
000
000
000
001
002


## Check compiled risks table

In [20]:
compiled_risk_table

Unnamed: 0,Comments,DateLoggeddd/mm/yy,DateUpdateddd/mm/yy,Impact,Likelihood,MitigationControlsTAPs,Owner,Proj ID,Rating,Risk ID,RiskCategory,Status,Title/Description
0,,18/05/18,18/05/18,High,High,1,BP,0,9,1,Resourcing,Open,Delivery ability of the analytics team in ligh...
1,,18/05/18,18/05/18,Moderate,High,1,AF,0,6,2,Operational,Open,Delivery delays in data reform strategy
2,,18/05/18,18/05/18,Moderate,High,1,JH,0,6,3,Financial,Open,Funding
3,,18/05/18,18/05/18,High,High,abc,BP,0,9,R001,Resourcing,Open,Delivery ability of the analytics team in ligh...
4,,19/05/18,19/05/18,High,High,abcdef,BP,0,9,R002,Financial,Closed,Limited budget
5,,27/05/18,28/05/18,Medium,High,Work with stakeholders,TBC,1,6,R001,Scope,Closed,Scope creep
6,,28/05/18,29/05/18,High,Low,Extra resourcesRegular monitoring,TBC,2,8,R002,Schedule,Closed,Possible slippage


## Export Compiled Risks Table 

In [22]:
compiled_risk_table.to_csv(output_folder_path + "compiled_risk_table.csv", 
                          index = False)