# Combine Project Pages

### Input and Output Folders

In [1]:
# input_folder_path: folder whose files are read and combined by this notebook.
# output_folder_path: folder that receives the combined CSV.
# Both are absolute paths on one specific machine and both end with "/";
# edit them before running the notebook anywhere else.
input_folder_path = "/Users/danielcorcoran/Desktop/github_repos/python_nb_data_pulling/confluence_scrape/data/outputs/project_page/"
output_folder_path = "/Users/danielcorcoran/Desktop/github_repos/python_nb_data_pulling/confluence_scrape/data/outputs/project_page_combined/"

### Modules

In [2]:
import os
import pandas
import numpy
import math

### Read in each dataframe in loop and add to list

In [3]:
# Collect the files to load. os.listdir returns entries in arbitrary order and
# also lists hidden entries (e.g. ".DS_Store", ".ipynb_checkpoints"), so hidden
# names and sub-directories are skipped and the remainder is sorted to make the
# row order of the combined frame reproducible between runs.
input_filenames = sorted(
    filename
    for filename in os.listdir(input_folder_path)
    if not filename.startswith(".")
    and os.path.isfile(os.path.join(input_folder_path, filename))
)

# One dataframe per input file, in sorted filename order.
dataframe_list = [
    pandas.read_csv(os.path.join(input_folder_path, filename))
    for filename in input_filenames
]

### Merge all dataframes in list into one, union vertically

In [4]:
# Stack every per-file frame on top of the others (row-wise union).
# sort=True sorts the column axis when the frames' columns are not aligned.
# Row labels are kept from the source frames, so they may repeat here.
data = pandas.concat(objs=dataframe_list, axis="index", sort=True)
data

Unnamed: 0.1,1 - Pre Project,2 - Feasibility,3 - Foundations,4 - Development,5 - Delivery,6 - Closure,7 - Post Project,Complexity,Contacts / Description,Current Phase,...,Project Lead,Project Selection Score,Project Start,Project Title,Stakeholder,Status Update,Unnamed: 0,VCDI Stream,Yes / No Health,Yes / No Personal
0,COMPLETE,COMPLETE,COMPLETE,COMPLETE,COMPLETE,COMPLETE,,7.0,TBC,7 - Post Project,...,Suhith Illesinghe,75% (324/430),01 Mar 2017,Building Cladding TaskForce,Cladding TaskForce,Exec summary,,Analytics,No,Yes
1,COMPLETE,COMPLETE,COMPLETE,COMPLETE,COMPLETE,COMPLETE,,7.0,TBC,7 - Post Project,...,Suhith Illesinghe,75% (324/430),01 Mar 2017,Building Cladding TaskForce,VBA,Exec summary,,Analytics,No,Yes
0,,,,,COMPLETE,COMPLETE,COMPLETE,,TBC,Closed,...,Jan Lambrechts,,01 Mar 2017,Privacy by Design Advice,Salinger,The project is complete and has been closed off.,,Policy,No,No
0,COMPLETE,,,3-GREEN,,,,,,Development,...,Natasha Thompson,,01 Apr 2018,Develop best practice data analytics lifecycle...,,Exec summary and status of the project.,,Data Reform,No,No


In [5]:
# Write the combined frame to a CSV on the Desktop, omitting the row index.
# NOTE(review): this path starts with lowercase "/users" while the folder paths
# defined at the top of the notebook start with "/Users" - confirm this is
# intended; it would not resolve on a case-sensitive filesystem.
data.to_csv("/users/danielcorcoran/Desktop/final_page_data.csv", index = False)

### Reset Index

In [6]:
# Replace the row labels with a fresh 0..n-1 range; drop=True discards the old
# labels instead of keeping them as a new column.
data = data.reset_index(drop = True)

### A function to clean project codes 

In [7]:
def cleancode(var):
    """Return ``var`` as a stripped string, left-padded with zeros to 3 characters.

    The value is converted with ``str`` and surrounding whitespace is removed.
    Only one- and two-character results are padded ("7" -> "007",
    "42" -> "042"); an empty string and anything of three or more characters
    are returned as they are.

    Parameters
    ----------
    var : any
        Value to turn into a project code; it is passed through ``str``.

    Returns
    -------
    str
        The cleaned code.
    """
    code = str(var).strip()
    missing = 3 - len(code)

    # Pad only when exactly one or two leading zeros are needed; an empty
    # string (which would need three) is deliberately left untouched.
    if missing in (1, 2):
        return "0" * missing + code

    return code

### Check Project ID column

In [8]:
# Display the "Project ID" column to inspect its values and dtype.
data["Project ID"]

0      0.0
1      0.0
2      1.0
3    121.0
Name: Project ID, dtype: float64

### Use project code cleaning function on data

In [9]:
# Rebuild the "Project ID" column as zero-padded string codes (1.0 -> "001").
# The cleaned values are collected first and assigned as one object column,
# rather than writing strings cell-by-cell into the existing numeric column.
cleaned_ids = []

for index, raw_id in enumerate(data["Project ID"]):

    if pandas.isna(raw_id):
        # math.floor cannot convert NaN, so a missing ID is kept missing.
        cleaned_ids.append(raw_id)
        print("Missing Project ID at index", index, "was left unchanged")
        continue

    # float() also accepts codes that are already strings ("001"), so
    # re-running this cell reproduces the same result instead of raising.
    item = math.floor(float(raw_id))
    cleaned_item = cleancode(item)
    cleaned_ids.append(cleaned_item)

    print(item, "at index", index, "was transformed to", cleaned_item)

# Align on the existing row labels so the assignment is purely positional.
data["Project ID"] = pandas.Series(cleaned_ids, index=data.index, dtype=object)

0 at index 0 was transformed to 000
0 at index 1 was transformed to 000
1 at index 2 was transformed to 001
121 at index 3 was transformed to 121


### Check Project ID column after cleaning

In [10]:
# Display the "Project ID" column again to inspect its values and dtype.
data["Project ID"]

0    000
1    000
2    001
3    121
Name: Project ID, dtype: object

### Drop unwanted columns

In [11]:
# Remove the "Unnamed: 0" column when it exists (presumably a row index saved
# by an upstream export - verify). errors="ignore" makes this a no-op when the
# column is absent, so the cell can be re-run safely.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

# Kept at the top level of the cell so the resulting shape is displayed.
data.shape

## Reorder the columns

In [12]:
# List the column names currently present in the combined frame.
data.columns

Index(['1 - Pre Project', '2 - Feasibility', '3 - Foundations',
       '4 - Development', '5 - Delivery', '6 - Closure', '7 - Post Project',
       'Complexity', 'Contacts / Description', 'Current Phase',
       'Current State', 'Executive Sponsor', 'Key Stakeholder', 'Last Update',
       'Maturity', 'Nature of Data / Info Used Health',
       'Nature of Data / Info Used Personal', 'Objective', 'Outcomes',
       'Overall Status', 'Priority', 'Project End', 'Project ID',
       'Project Lead', 'Project Selection Score', 'Project Start',
       'Project Title', 'Stakeholder', 'Status Update', 'VCDI Stream',
       'Yes / No Health', 'Yes / No Personal'],
      dtype='object')

In [13]:
# Left-to-right column order for the output table. Selecting with this list
# raises KeyError if a listed column is missing from `data`; columns of `data`
# that are not listed are left out of `data_sort`.
column_order = [
    'Project ID', 'Project Title', 'Last Update', 'VCDI Stream',
    'Project Lead', 'Key Stakeholder', 'Executive Sponsor',
    'Project Start', 'Project End', 'Current State',
    'Project Selection Score', 'Priority', 'Maturity', 'Complexity',
    # numbered phase columns
    '1 - Pre Project', '2 - Feasibility', '3 - Foundations',
    '4 - Development', '5 - Delivery', '6 - Closure', '7 - Post Project',
    'Current Phase', 'Overall Status', 'Status Update',
    'Yes / No Personal', 'Yes / No Health',
    'Nature of Data / Info Used Personal',
    'Nature of Data / Info Used Health',
    'Stakeholder', 'Contacts / Description', 'Objective', 'Outcomes',
]

data_sort = data[column_order]

In [14]:
# Display the reordered frame.
data_sort

Unnamed: 0,Project ID,Project Title,Last Update,VCDI Stream,Project Lead,Key Stakeholder,Executive Sponsor,Project Start,Project End,Current State,...,Overall Status,Status Update,Yes / No Personal,Yes / No Health,Nature of Data / Info Used Personal,Nature of Data / Info Used Health,Stakeholder,Contacts / Description,Objective,Outcomes
0,0,Building Cladding TaskForce,24 May 2018,Analytics,Suhith Illesinghe,TaskForce,TBC,01 Mar 2017,30 Aug 2017,Inactive,...,5-COMPLETE,Exec summary,Yes,No,Detailed description about PERSONAL data break...,Detailed description about HEALTH data breakdo...,Cladding TaskForce,TBC,This is the objective of the project,ab
1,0,Building Cladding TaskForce,24 May 2018,Analytics,Suhith Illesinghe,TaskForce,TBC,01 Mar 2017,30 Aug 2017,Inactive,...,5-COMPLETE,Exec summary,Yes,No,Detailed description about PERSONAL data break...,Detailed description about HEALTH data breakdo...,VBA,TBC,This is the objective of the project,ab
2,1,Privacy by Design Advice,07 Jun 2018,Policy,Jan Lambrechts,Julian Hebden,Julian Hebden,01 Mar 2017,30 Aug 2017,Inactive,...,5-COMPLETE,The project is complete and has been closed off.,No,No,,,Salinger,TBC,To get Privacy by Design Advice as it's relate...,The Salinger report informed development of VC...
3,121,Develop best practice data analytics lifecycle...,06 Jun 2018,Data Reform,Natasha Thompson,Brad Petry,Julian Hebden,01 Apr 2018,31 Dec 2018,Active,...,3-GREEN,Exec summary and status of the project.,No,No,,,,,The objective of the project.,Best practice data analytics lifecycle methodo...


## Fill nulls with N/A

In [15]:
# Replace every missing value with the literal string "N/A". The result is a
# new frame; data_sort itself is not modified.
data_sort_filled = data_sort.fillna("N/A")

In [16]:
# Display the frame after missing values were filled.
data_sort_filled

Unnamed: 0,Project ID,Project Title,Last Update,VCDI Stream,Project Lead,Key Stakeholder,Executive Sponsor,Project Start,Project End,Current State,...,Overall Status,Status Update,Yes / No Personal,Yes / No Health,Nature of Data / Info Used Personal,Nature of Data / Info Used Health,Stakeholder,Contacts / Description,Objective,Outcomes
0,0,Building Cladding TaskForce,24 May 2018,Analytics,Suhith Illesinghe,TaskForce,TBC,01 Mar 2017,30 Aug 2017,Inactive,...,5-COMPLETE,Exec summary,Yes,No,Detailed description about PERSONAL data break...,Detailed description about HEALTH data breakdo...,Cladding TaskForce,TBC,This is the objective of the project,ab
1,0,Building Cladding TaskForce,24 May 2018,Analytics,Suhith Illesinghe,TaskForce,TBC,01 Mar 2017,30 Aug 2017,Inactive,...,5-COMPLETE,Exec summary,Yes,No,Detailed description about PERSONAL data break...,Detailed description about HEALTH data breakdo...,VBA,TBC,This is the objective of the project,ab
2,1,Privacy by Design Advice,07 Jun 2018,Policy,Jan Lambrechts,Julian Hebden,Julian Hebden,01 Mar 2017,30 Aug 2017,Inactive,...,5-COMPLETE,The project is complete and has been closed off.,No,No,,,Salinger,TBC,To get Privacy by Design Advice as it's relate...,The Salinger report informed development of VC...
3,121,Develop best practice data analytics lifecycle...,06 Jun 2018,Data Reform,Natasha Thompson,Brad Petry,Julian Hebden,01 Apr 2018,31 Dec 2018,Active,...,3-GREEN,Exec summary and status of the project.,No,No,,,,,The objective of the project.,Best practice data analytics lifecycle methodo...


### Export

In [17]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing (exist_ok=True makes this a no-op when it does).
os.makedirs(output_folder_path, exist_ok=True)

# Write the final table without the row index. os.path.join builds the same
# path whether or not output_folder_path ends with a separator.
data_sort_filled.to_csv(
    os.path.join(output_folder_path, "final_page_data.csv"),
    index=False,
)