# Canvas Pull

## About
- Parses **'html file'** extracted from vcdi's confluence page and exports .csvs containing canvas information

## How to obtain 'html file'
- Load canvas page in confluence
- Right click, inspect element
- Right click on html body, copy inner code
- Paste in editor (notepad++, sublime etc)
- Save as .html
- Link path into this script

## Setup

In [1]:
import pandas
import requests
import datetime
import bs4

timestamp = datetime.datetime.now().strftime("%d-%b-%y")

In [2]:
"today is: " + timestamp

'today is: 06-Jun-18'

In [3]:
#output folder path
output_folder_path = "/users/danielcorcoran/desktop/Confluence Files/outputs/canvas/"

input_html_file_path = "/users/danielcorcoran/desktop/Confluence Files/inputs/canvas_page.html"
#input_html_file_path = "/users/danielcorcoran/desktop/farm.html"

In [4]:
soup = bs4.BeautifulSoup(open(input_html_file_path), "html.parser")

### How each cell's contents will be stored

### Create dictionaries to store data

In [5]:
project_info = {"project_code": [], "project_name": [], "project_start_date": [],
                "project_current_phase": [], "project_status": [], "problem_summary": []}

teams = {"project_code": [], "team_member_desc": []}

stakeholders = {"project_code": [], "stakeholder_desc": []}

buildingblocks = {"project_code": [], "building_blocks_desc": []}

analytics = {"project_code": [], "analytics_desc": []}

project_outcomes = {"project_code": [], "project_outcome_desc": []}

benefits = {"project_code": [], "benefits_desc": []}

milestones = {"project_code": [], "milestone_desc": []}

budgetcost = {"project_code": [], "budget_cost_desc": []}

issues = {"project_code": [], "issues_desc": []}

risks = {"project_code": [], "risks_desc": []}

## Functions

### Remove last two chars from string

In [6]:
def remove_last_two_chars(string):
    
    if len(string) > 3:
        string = string[:len(string)-3]
        
    return string

### Clean a string

In [7]:
def clean_a_string(string):
    string = string.upper().strip()
    
    while "  " in string:
        string = string.replace("  "," ")
        
    while "," in string:
        string = string.replace(","," ")

    while "'" in string:
        string = string.replace("'"," ")
        
    string = string.replace("\xa0"," ")
        
    string = string.strip()
    
    return string

### Remove h3 tags from a string

In [8]:
def remove_h3_tags(string):
    
    unwanted_phrases = ['PROJECT CODE:',
                        'PROJECT NAME:',
                        'PROJECT TEAM:',
                        'START DATE:',
                        'CURRENT PHASE:',
                        'STATUS:',
                        'STAKEHOLDERS:',
                        'BUILDING BLOCKS OUTCOMES:',
                        'TYPES OF ANALYTICS:',
                        'PROBLEM SUMMARY:',
                        'PROJECT OUTCOMES:',
                        'BENEFITS:',
                        'HIGH LEVEL MILESTONES (WITH EXPECTED START AND END DATES):',
                        'BUDGET & COSTS:',
                        'RISKS:',
                        'ISSUES:']
    
    for phrase in unwanted_phrases:
        if phrase in string:
            string = string.replace(phrase, " ")
    
    string = string.strip()
    
    return string

## Start Processing 

### Target all confluenceTd elements (table cells) store them in list

In [9]:
cells = soup.find_all("td", {"class":"confluenceTd"})

### Locate Project Code

In [10]:
for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "PROJECT CODE:":
            project_code = remove_h3_tags(clean_a_string(table_cell.text))
            
project_code

'150'

### Locate Project Name

In [11]:
for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "PROJECT NAME:":
            project_name = remove_h3_tags(clean_a_string(table_cell.text))
            
project_name

'EXAMPLE PROJECT NAME'

### Locate Problem Summary

In [12]:
for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "PROBLEM SUMMARY:":
            problem_summary = remove_h3_tags(clean_a_string(table_cell.text))
            
problem_summary

'REVENUE FROM SPEED AND RED LIGHT CAMERAS ALONE ARE EXPECTED TO BE $393 MILLION IN 2017-18  INCREASING TO $421 MILLION BY 2020-21.THE BUDGET ALSO ESTIMATES REVENUE FROM ON-THE-SPOT FINES AND TOLL ROAD EVASION WILL CONTINUE TO GROW.THERE ARE CURRENTLY APPROXIMATELY $2 BILLION UNPAID FINES WITH THE AVERAGE BEING GREATER THAN 1 YEAR OLD.'

### Locate Start Date

In [13]:
for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "START DATE:":
            project_start_date = remove_h3_tags(clean_a_string(table_cell.text))
            
project_start_date

'18 OCT 2017'

### Locate Current Phase 

In [14]:
for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "CURRENT PHASE:":
            project_phase = remove_h3_tags(clean_a_string(table_cell.text))
            
project_phase

'DISCOVERY'

### Locate Status

In [15]:
for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "STATUS:":
            project_status = remove_h3_tags(clean_a_string(table_cell.text))
            
project_status

'GREEN'

### Add to projects table 

In [16]:
project_info["project_code"].append(project_code)
project_info["project_name"].append(project_name)
project_info["project_start_date"].append(project_start_date)
project_info["project_status"].append(project_status)
project_info["problem_summary"].append(problem_summary)
project_info["project_current_phase"].append(project_phase)

### Set sep variable

In [17]:
sep = " | "

### Locate Project Team

In [18]:
teams_combined = ""

for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "PROJECT TEAM:":
            list_members = table_cell.find("ul")
            for item in list_members:
                cleaned_item = clean_a_string(item.text)
                teams["team_member_desc"].append(cleaned_item)
                teams_combined = teams_combined + cleaned_item + sep
                teams["project_code"].append(project_code)
                
teams

{'project_code': ['150', '150', '150'],
 'team_member_desc': ['NAME  -  POSITION TITLE .',
  'NAME  -  POSITION TITLE .',
  'NAME  -  POSITION TITLE .']}

### Locate Stakeholders

In [19]:
stakeholders_combined = ""

for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "STAKEHOLDERS:":
            list_members = table_cell.find("ul")
            for item in list_members:
                cleaned_item = clean_a_string(item.text)
                stakeholders["stakeholder_desc"].append(cleaned_item)
                stakeholders_combined = stakeholders_combined + cleaned_item + sep
                stakeholders["project_code"].append(project_code)
                
stakeholders

{'project_code': ['150', '150', '150'],
 'stakeholder_desc': ['DEPARTMENT AND BUSINESS UNIT -  NAME .',
  'NAME OF EXECUTIVE SPONSOR -  NAME .',
  'NAME OF CONTACT PERSON -  NAME .']}

### Locate Project Outcomes

In [20]:
project_outcomes_combined = ""

for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "PROJECT OUTCOMES:":
            list_members = table_cell.find("ul")
            for item in list_members:
                cleaned_item = clean_a_string(item.text)
                project_outcomes["project_outcome_desc"].append(cleaned_item)
                project_outcomes_combined = project_outcomes_combined + cleaned_item + sep
                project_outcomes["project_code"].append(project_code)
                
project_outcomes

{'project_code': ['150', '150', '150'],
 'project_outcome_desc': ['REDUCE AGED DEBT BY 10% WITHIN 12 MONTHS.',
  'REDUCE AVERAGE AGE OF OVERDUE FEES TO LESS THAN 1 YEAR WITHIN 6 MONTHS.',
  'REDUCE OPERATIONAL COSTS BY 10% BY 2020.']}

### Locate Benefits 

In [21]:
benefits_combined = ""

for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "BENEFITS:":
            list_members = table_cell.find("ul")
            for item in list_members:
                cleaned_item = clean_a_string(item.text)
                benefits["benefits_desc"].append(cleaned_item)
                benefits_combined = benefits_combined + cleaned_item + sep
                benefits["project_code"].append(project_code)
                
benefits

{'project_code': ['150', '150', '150', '150'],
 'benefits_desc': ['ADVANCE IMES S SELF-SERVICE ANALYTICS CAPABILITIES.',
  'REDUCE AGED DEBT.',
  'REDUCE OPERATIONAL COSTS OF MANAGING AND COLLECTING.',
  'CHANGE BEHAVIOUR OF FUTURE DEFENDANTS.']}

### Locate Budget & Costs

In [22]:
budgetcost_combined= ""

for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "BUDGET & COSTS:":
            list_members = table_cell.find("ul")
            for item in list_members:
                cleaned_item = clean_a_string(item.text)
                budgetcost["budget_cost_desc"].append(cleaned_item)
                budgetcost_combined = budgetcost_combined + cleaned_item + sep
                budgetcost["project_code"].append(project_code)
                
budgetcost

{'project_code': ['150', '150'],
 'budget_cost_desc': ['BUDGET - $12.', 'COSTS - $5.']}

### Locate Issues

In [23]:
issues_combined = ""

for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "ISSUES:":
            list_members = table_cell.find("ul")
            for item in list_members:
                cleaned_item = clean_a_string(item.text)
                issues["issues_desc"].append(cleaned_item)
                issues_combined = issues_combined + cleaned_item + sep
                issues["project_code"].append(project_code)
                
issues

{'project_code': ['150', '150', '150', '150'],
 'issues_desc': ['ISSUE 1 - INFORMATION ABOUT ISSUE 1.',
  'ISSUE 2 - INFORMATION ABOUT ISSUE 2.',
  'ISSUE 3 - INFORMATION ABOUT ISSUE 3.',
  'ISSUE 4 - INFORMATION ABOUT ISSUE 4.']}

### Locate Risks

In [24]:
risks_combined = ""

for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "RISKS:":
            list_members = table_cell.find("ul")
            for item in list_members:
                cleaned_item = clean_a_string(item.text)
                risks["risks_desc"].append(cleaned_item)
                risks_combined = risks_combined + cleaned_item + sep
                risks["project_code"].append(project_code)

risks

{'project_code': ['150', '150', '150', '150'],
 'risks_desc': ['RISK 1 - INFORMATION ABOUT RISK 1.',
  'RISK 2 - INFORMATION ABOUT RISK 2.',
  'RISK 3 - INFORMATION ABOUT RISK 3.',
  'RISK 4 - INFORMATION ABOUT RISK 4.']}

### Locate Milestones

In [25]:
milestones_combined = ""

for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "HIGH LEVEL MILESTONES (WITH EXPECTED START AND END DATES):":
            list_members = table_cell.find("ul")
            for item in list_members:
                cleaned_item = clean_a_string(item.text)
                milestones["milestone_desc"].append(cleaned_item)
                milestones_combined = milestones_combined + cleaned_item + sep
                milestones["project_code"].append(project_code)
                
milestones

{'project_code': ['150', '150', '150'],
 'milestone_desc': ['INFORMATION ABOUT MILESTONE 1  TIMEFRAME:01 MAY 2018 TO 23 MAY 2018.',
  'INFORMATION ABOUT MILESTONE 2  TIMEFRAME:03 MAY 2018 TO 24 MAY 2018.',
  'INFORMATION ABOUT MILESTONE 3  TIMEFRAME:07 MAY 2018 TO 22 MAY 2018.']}

### Locate Analytics Types

In [26]:
analytics_combined = ""

for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "TYPES OF ANALYTICS:":
            
            unordered_list = table_cell.find("ul")
            list_items = unordered_list.find_all("li")
            
            for item in list_items:
                item_class = item.get("class")
                
                if item_class == ["checked"]:
                    cleaned_item = clean_a_string(item.text)
                    analytics["analytics_desc"].append(cleaned_item)
                    analytics_combined = analytics_combined + cleaned_item + sep
                    analytics["project_code"].append(project_code)
                    
analytics

{'project_code': ['150'], 'analytics_desc': ['PRESCRIPTIVE']}

### Locate Building Blocks

In [27]:
buildingblocks_combined = ""

for table_cell in cells:
    
    if table_cell.find("h3"):
        
        h3_text = table_cell.find("h3").text
        h3_text_cleaned = clean_a_string(h3_text)
        
        if h3_text_cleaned == "BUILDING BLOCKS OUTCOMES:":
            
            unordered_list = table_cell.find("ul")
            list_items = unordered_list.find_all("li")
            
            for item in list_items:
                item_class = item.get("class")
                
                if item_class == ["checked"]:
                    cleaned_item = clean_a_string(item.text)
                    buildingblocks["building_blocks_desc"].append(cleaned_item)
                    buildingblocks_combined = buildingblocks_combined + cleaned_item + sep
                    buildingblocks["project_code"].append(project_code)
                    
buildingblocks

{'project_code': ['150', '150'],
 'building_blocks_desc': ['JOBS NOW', 'FAIRNESS AND EQUITY']}

## Create Dataframes from each dictionary and export to csv with timestamp and row index

In [28]:
project_info_data = pandas.concat([pandas.DataFrame(project_info)])

project_info_data["date_extracted"] = timestamp

project_info_data.to_csv(output_folder_path + "project_info_data_"+str(project_code)+".csv", 
                         index_label = "project_info_row_index")

In [29]:
teams_data = pandas.concat([pandas.DataFrame(teams)])

teams_data["date_extracted"] = timestamp

teams_data.to_csv(output_folder_path + "teams_data_"+str(project_code)+".csv", 
                         index_label = "teams_row_index")

In [30]:
building_blocks_data = pandas.concat([pandas.DataFrame(buildingblocks)])

building_blocks_data["date_extracted"] = timestamp

building_blocks_data.to_csv(output_folder_path + "building_blocks_data_"+str(project_code)+".csv", 
                         index_label = "building_blocks_row_index")

In [31]:
analytics_data = pandas.concat([pandas.DataFrame(analytics)])

analytics_data["date_extracted"] = timestamp

analytics_data.to_csv(output_folder_path + "analytics_data_"+str(project_code)+".csv", 
                         index_label = "analytics_row_index")

In [32]:
project_outcomes_data = pandas.concat([pandas.DataFrame(project_outcomes)])

project_outcomes_data["date_extracted"] = timestamp

project_outcomes_data.to_csv(output_folder_path + "project_outcomes_data_"+str(project_code)+".csv", 
                         index_label = "project_outcomes_row_index")

In [33]:
benefits_data = pandas.concat([pandas.DataFrame(benefits)])

benefits_data["date_extracted"] = timestamp

benefits_data.to_csv(output_folder_path + "benefits_data_"+str(project_code)+".csv", 
                         index_label = "benefits_row_index")

In [34]:
milestones_data = pandas.concat([pandas.DataFrame(milestones)])

milestones_data["date_extracted"] = timestamp

milestones_data.to_csv(output_folder_path + "milestones_data_"+str(project_code)+".csv", 
                         index_label = "milestones_row_index")

In [35]:
budgetcost_data = pandas.concat([pandas.DataFrame(budgetcost)])

budgetcost_data["date_extracted"] = timestamp

budgetcost_data.to_csv(output_folder_path + "budgetcost_data_"+str(project_code)+".csv", 
                         index_label = "budgetcost_row_index")

In [36]:
issues_data = pandas.concat([pandas.DataFrame(issues)])

issues_data["date_extracted"] = timestamp

issues_data.to_csv(output_folder_path + "issues_data_"+str(project_code)+".csv", 
                         index_label = "issues_row_index")

In [37]:
risks_data = pandas.concat([pandas.DataFrame(risks)])

risks_data["date_extracted"] = timestamp

risks_data.to_csv(output_folder_path + "risks_data_"+str(project_code)+".csv", 
                         index_label = "risks_row_index")

### Process Combined Table

In [38]:
combined = {
	"project_code":[project_code],
	"project_name":[project_name],
	"problem_summary":[problem_summary],
	"project_start_date":[project_start_date],
	"project_phase":[project_phase],
	"project_status":[project_status],
	"project_team":[remove_last_two_chars(teams_combined)],
	"project_stakeholders":[remove_last_two_chars(stakeholders_combined)],
	"project_outcomes":[remove_last_two_chars(project_outcomes_combined)],
	"project_benefits":[remove_last_two_chars(benefits_combined)],
	"project_budget_cost":[remove_last_two_chars(budgetcost_combined)],
	"project_issues":[remove_last_two_chars(issues_combined)],
	"project_risks":[remove_last_two_chars(risks_combined)],
	"project_milestones":[remove_last_two_chars(milestones_combined)],
	"project_analytics":[remove_last_two_chars(analytics_combined)],
	"project_building_blocks":[remove_last_two_chars(buildingblocks_combined)],
	"extract_date":[timestamp]
}

In [39]:
combined_dataframe = pandas.DataFrame(combined)

combined_dataframe

Unnamed: 0,project_code,project_name,problem_summary,project_start_date,project_phase,project_status,project_team,project_stakeholders,project_outcomes,project_benefits,project_budget_cost,project_issues,project_risks,project_milestones,project_analytics,project_building_blocks,extract_date
0,150,EXAMPLE PROJECT NAME,REVENUE FROM SPEED AND RED LIGHT CAMERAS ALONE...,18 OCT 2017,DISCOVERY,GREEN,NAME - POSITION TITLE . | NAME - POSITION ...,DEPARTMENT AND BUSINESS UNIT - NAME . | NAME ...,REDUCE AGED DEBT BY 10% WITHIN 12 MONTHS. | RE...,ADVANCE IMES S SELF-SERVICE ANALYTICS CAPABILI...,BUDGET - $12. | COSTS - $5.,ISSUE 1 - INFORMATION ABOUT ISSUE 1. | ISSUE 2...,RISK 1 - INFORMATION ABOUT RISK 1. | RISK 2 - ...,INFORMATION ABOUT MILESTONE 1 TIMEFRAME:01 MA...,PRESCRIPTIVE,JOBS NOW | FAIRNESS AND EQUITY,06-Jun-18


#### Export Combined Table

In [40]:
combined_dataframe.to_csv(output_folder_path + "combined_canvas_data_" + str(project_code) + ".csv", index = False)