# Data Cleaning & Integration

## Extract from zip folder to dataframes.

In [13]:
# Import Dependencies
import os
import pandas as pd
from zipfile import ZipFile

In [15]:
# Unzip folder and extract each csv file into dataframes.
# The archive and each member stream are opened with `with` so their file
# handles are released as soon as the CSVs have been read into memory
# (an unclosed ZipFile keeps the archive locked on Windows).
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable data directory so the notebook runs on other machines.
with ZipFile(r'C:\Users\haiyo\DC\Construction_Project_Management_Report_Analysis\Raw_Data\archive (1).zip') as zf:
    with zf.open('Construction_Data_PM_Forms_All_Projects.csv') as forms_csv:
        Forms_df = pd.read_csv(forms_csv)
    with zf.open('Construction_Data_PM_Tasks_All_Projects.csv') as tasks_csv:
        Tasks_df = pd.read_csv(tasks_csv)

## Inspect, clean, and transform the Forms dataframe.

In [25]:
# Check for null data: .info() lists each column's non-null count and dtype,
# so columns with fewer non-null entries than the row count contain missing values.
Forms_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10254 entries, 0 to 10253
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Ref                  10254 non-null  object
 1   Status               10254 non-null  object
 2   Location             10254 non-null  object
 3   Name                 10254 non-null  object
 4   Created              10254 non-null  object
 5   Type                 10254 non-null  object
 6   Status Changed       10254 non-null  object
 7   Open Actions         10254 non-null  int64 
 8   Total Actions        10254 non-null  int64 
 9   Association          2098 non-null   object
 10  OverDue              10254 non-null  bool  
 11  Images               10254 non-null  bool  
 12  Comments             10254 non-null  bool  
 13  Documents            9450 non-null   object
 14  Project              10254 non-null  int64 
 15  Report Forms Status  10252 non-null  object
 16  Repo

In [30]:
# Preview the first 45 rows of the Forms dataframe.
Forms_df.head(45)
# Observations & Tasks:
# - How many forms are 'Opened', 'Open / Ongoing Works', etc.?
# - 'Association' may describe parent/child document relationships. Find out how the child documents are recorded.
# - What is the percentage breakdown of the forms by 'Location', 'Report Forms Group', or 'Name'?
# - Are there any forms with different 'Created' and 'Status Changed' dates? What about 'OverDue'?
# - Check whether 'Project' is uniform.
# - Check whether any forms were closed without 'Open Actions' being 0, and look for warning signs in 'OverDue' and 'Report Forms Status'.

# Apart from the date format change, and perhaps the null values in 'Report Forms Status' and 'Report Forms Group',
# the data looks clean enough for now.

Unnamed: 0,Ref,Status,Location,Name,Created,Type,Status Changed,Open Actions,Total Actions,Association,OverDue,Images,Comments,Documents,Project,Report Forms Status,Report Forms Group
0,F145185.4,Opened,01 Daily Site Diary>Site Management>JPC Projec...,1328 CM-SM-FRM-001 Site Diary,15/09/2020,Site Management,15/09/2020,0,0,,False,True,False,False,1328,Open,Site Management
1,F1.495500,Open / Ongoing Works,02 Daily Work Plan>Site Management>JPC Project...,SM-FRM-SUB-101 Daily Work Plan,15/09/2020,Subcontractor Inspections,15/09/2020,0,0,,False,False,False,False,1328,Open,Subcontractor
2,F1.495499,Open / Ongoing Works,02 Daily Work Plan>Site Management>JPC Project...,SM-FRM-SUB-101 Daily Work Plan,15/09/2020,Subcontractor Inspections,15/09/2020,0,0,,False,False,False,False,1328,Open,Subcontractor
3,F1.495498,Open / Ongoing Works,02 Daily Work Plan>Site Management>JPC Project...,SM-FRM-SUB-101 Daily Work Plan,15/09/2020,Subcontractor Inspections,15/09/2020,0,0,,False,False,False,False,1328,Open,Subcontractor
4,F1.495496,Open / Ongoing Works,02 Daily Work Plan>Site Management>JPC Project...,SM-FRM-SUB-101 Daily Work Plan,15/09/2020,Subcontractor Inspections,15/09/2020,0,0,,False,False,False,False,1328,Open,Subcontractor
5,F1.495479,Open / Ongoing Works,02 Daily Work Plan>Site Management>JPC Project...,SM-FRM-SUB-101 Daily Work Plan,15/09/2020,Subcontractor Inspections,15/09/2020,0,0,,False,False,False,False,1328,Open,Subcontractor
6,F124541.22,Open / Ongoing Works,02 Daily Work Plan>Site Management>JPC Project...,SM-FRM-SUB-101 Daily Work Plan,14/09/2020,Subcontractor Inspections,14/09/2020,0,0,,False,False,False,False,1328,Open,Subcontractor
7,F124541.21,Open / Ongoing Works,02 Daily Work Plan>Site Management>JPC Project...,SM-FRM-SUB-101 Daily Work Plan,14/09/2020,Subcontractor Inspections,14/09/2020,0,0,,False,False,False,False,1328,Open,Subcontractor
8,F1.495343,Subcontractor Signed Off,02 Daily Work Plan>Site Management>JPC Project...,SM-FRM-SUB-101 Daily Work Plan,14/09/2020,Subcontractor Inspections,14/09/2020,0,0,,False,False,False,False,1328,Closed,Subcontractor
9,F1.495319,Open / Ongoing Works,02 Daily Work Plan>Site Management>JPC Project...,SM-FRM-SUB-101 Daily Work Plan,14/09/2020,Subcontractor Inspections,14/09/2020,0,0,,False,False,False,False,1328,Open,Subcontractor


In [28]:
# Get Tasks dataframe info: row count plus each column's non-null count and dtype.
Tasks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12424 entries, 0 to 12423
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Ref             12424 non-null  object 
 1   Status          12424 non-null  object 
 2   Location        12424 non-null  object 
 3   Description     12424 non-null  object 
 4   Created         12424 non-null  object 
 5   Target          2568 non-null   float64
 6   Type            12424 non-null  object 
 7   To Package      11382 non-null  object 
 8   Status Changed  12424 non-null  object 
 9   Association     9483 non-null   object 
 10  OverDue         12424 non-null  bool   
 11  Images          12272 non-null  object 
 12  Comments        11902 non-null  object 
 13  Documents       11780 non-null  object 
 14  Priority        2366 non-null   object 
 15  Cause           9683 non-null   object 
 16  project         12424 non-null  int64  
 17  Report Status   12424 non-null 

In [29]:
# Preview the first 30 rows of the Tasks dataframe.
Tasks_df.head(30)
# Observations & Tasks:
# - Change all date formats from the European DD/MM/YYYY to MM/DD/YYYY.
# - 'Target' is stored as a number (float64) that appears to be a date serial; it should be converted to a date format.
# - 'Association' appears to describe the sort of action to take on the line item (TODO confirm).
# - Check whether any 'OverDue' items are True, and whether 'Created' and 'Target' have any relationship.
# - What were the 'Priority' tasks? Any relation to 'Cause', 'Task Group', 'OverDue' or 'Association'?
# - Check the null values in each column.

Unnamed: 0,Ref,Status,Location,Description,Created,Target,Type,To Package,Status Changed,Association,OverDue,Images,Comments,Documents,Priority,Cause,project,Report Status,Task Group
0,T1.23963030,Open,JPC Project Management>EHS Management>01 Inspe...,task raised in incorrect location of this form...,14/09/2020,,Safety Notice (Amber) - General Issue,Main Contractor,14/09/2020,FormAnswer,False,,,,Behavioural Failure,JPC - Safety - Documentation,1328,Open,Safety
1,T116412.200,Closed,QC & BC(A)R>ITP 02 Architectural & M&E Service...,Metsec,14/09/2020,,JPC - Progress Photo,Ceilings & Partitions,14/09/2020,,False,True,False,False,,,1328,Closed,Site Management
2,T141663.27,EHS Good Observation,JPC Project Management>EHS Management>01 Inspe...,Good clear exclusion zones and access through ...,14/09/2020,,Safety Notice (Green) - Good Observation,Main Contractor,14/09/2020,FormAnswer,False,True,False,False,,JPC - Safety - Access,1328,Closed,Safety
3,T116412.199,Closed,QC & BC(A)R>ITP 02 Architectural & M&E Service...,RC walls,14/09/2020,,JPC - Progress Photo,Precast Concrete,14/09/2020,,False,True,False,False,,,1328,Closed,Site Management
4,T141663.26,EHS Good Observation,JPC Project Management>EHS Management>01 Inspe...,"block 02 working level has good housekeeping, ...",14/09/2020,,Safety Notice (Green) - Good Observation,Precast Concrete,14/09/2020,FormAnswer,False,True,False,False,,JPC - Safety - House Keeping,1328,Closed,Safety
5,T116412.198,Closed,QC & BC(A)R>ITP 02 Architectural & M&E Service...,A3 Roofing,14/09/2020,,JPC - Progress Photo,Roofing,14/09/2020,,False,True,False,False,,,1328,Closed,Site Management
6,T141663.25,EHS Good Observation,JPC Project Management>EHS Management>01 Inspe...,operaiver using hand sanitizer points onsite. ...,14/09/2020,,Safety Notice (Green) - Good Observation,Ceilings & Partitions,14/09/2020,FormAnswer,False,True,False,False,,JPC - Safety - Welfare Facilities,1328,Closed,Safety
7,T116412.197,Closed,QC & BC(A)R>ITP 02 Architectural & M&E Service...,Tyvec install,14/09/2020,,JPC - Progress Photo,Cladding & Roofing,14/09/2020,,False,True,False,False,,,1328,Closed,Site Management
8,T141663.24,EHS Good Observation,JPC Project Management>EHS Management>01 Inspe...,number of operative fully pee compliant,14/09/2020,,Safety Notice (Green) - Good Observation,Main Contractor,14/09/2020,FormAnswer,False,False,False,False,,JPC - Safety - PPE,1328,Closed,Safety
9,T141663.23,Open,JPC Project Management>EHS Management>01 Inspe...,doors to pods removed. to be reviewed to ensur...,14/09/2020,,Safety Notice (Amber) - General Issue,Main Contractor,14/09/2020,FormAnswer,False,True,False,False,,JPC - Safety - Access,1328,Open,Safety


Ref                   0
Status                0
Location              0
Description           0
Created               0
Target             9856
Type                  0
To Package         1042
Status Changed        0
Association        2941
OverDue               0
Images              152
Comments            522
Documents           644
Priority          10058
Cause              2741
project               0
Report Status         0
Task Group           50
dtype: int64