In [None]:
import pandas as pd
import numpy as np
import glob
import os
from methods import get_response_pivot

# Import data frames

First, we start by importing the different csv files and continue by concatenating the files into one dataframe.

In [None]:
def get_data(path: str, filename_start: str) -> pd.DataFrame:
    '''Load all CSV files of a directory whose names share a prefix into one dataframe.

    Parameters
    ----------
    path : str
        Directory that contains the CSV files.
    filename_start : str
        Only files whose name starts with this prefix are read.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of all matching files with a fresh 0..n-1
        index and every space removed from the column names. An empty
        dataframe is returned when no file matches.
    '''
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the result reproducible between runs and machines.
    files = sorted(
        file for file in os.listdir(path)
        if file.startswith(filename_start)
    )

    print(files)

    # Read every file once and concatenate in a single call instead of
    # re-copying the growing frame on each loop iteration.
    frames = [
        pd.read_csv(os.path.join(path, file), dtype={'comments': str})
        for file in files
    ]
    all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # remove spaces from column names (also safe for a frame without columns)
    all_data.columns = [str(col).replace(' ', '') for col in all_data.columns]

    print(f'''A dataframe with {all_data.shape[0]} rows and {all_data.shape[1]} columns has been created!\nAll whitespace in column names has been removed.''')

    return all_data

## Cities Disclosing / Responses

In [None]:
# combine all Cities disclosing files (names starting with '20') into one dataframe
df_cities_disc = get_data(path='data/Cities/Disclosing', filename_start='20')

In [None]:
# combine all Cities responses files (names starting with '20') into one dataframe
df_cities_resp = get_data(path='data/Cities/Responses', filename_start='20')

## Corporates Disclosing / Responses

In [None]:
# combine all Corporations disclosing files (names starting with '20') into one dataframe
df_corp_disc = get_data(path='data/Corporations/Disclosing', filename_start='20')

In [None]:
# combine all Corporations responses files (names starting with '20') into one dataframe
df_corp_resp = get_data(path='data/Corporations/Responses', filename_start='20')

# Dataframe Description

## Cities

Below is a description of the columns for the **City Disclosures**:

**Year Reported to CDP** : Cities Disclosure cycle survey year  

**Account Number** : The unique identifier given to every city organisation that receives a request to complete a CDP questionnaire  

**Organization**: Name of the City organisation disclosing  

**City**: Name of the City the city organisation is disclosing on behalf of  

**Country**: Country of city  

**CDP Region**: CDP operation region City is located within  

**Reporting Authority**: CDP collects information on behalf of a number of additional initiatives. Besides CDP Cities, organisations can indicate the additional initiatives they have answered questions for (e.g. C40, CDP Cities, ICLEI - Local Governments for Sustainability). Includes Global Covenant of Mayors for Climate and Energy, ICLEI Green Climate Cities, ICLEI Ecomobility / Ecologistics, C40 Cities Climate Leadership Group.  

**Access**: Cities can submit their CDP response in public status or in non-public status. Non-public responses can only be shared within CDP and between signatory partners. Public responses can be shared beyond CDP City organisations. Example value: public.  

**First Time Discloser**: Is the City disclosing for the first time to CDP  

**Population**: City population estimate  

**Population Year**: City population estimate year  

**City Location**: City location coordinates by longitude, latitude  

**Last update**: Response record last update  

*Below is a description of the column types for **City Responses**:*  


**Questionnaire** : Questionnaire and questionnaire year the company's response relates to

**Year Reported to CDP** : Cities Disclosure cycle survey year  

**Account Number** : The unique identifier given to every city organisation that receives a request to complete a CDP questionnaire  

**Organization** : Name of the City organisation disclosing  

**Country** : Country of city    

**CDP Region** : CDP operation region City is located within  

**Parent Section** : Module ('Parent Section') of the CDP questionnaire the question belongs to (e.g. Emissions Reduction)  

**Section** : Section of the CDP questionnaire the question belongs to (e.g. Mitigation Actions)  

**Question Number** : Question number of response (e.g. 5.4) 
 
**Question Name** : Describes the anticipated outcomes of the most impactful mitigation actions your city is currently undertaking; the total cost of the action and how much is being funded by the local government 

**Column Number** : Column number of matrix set (Table) or matrix dynamic (Add Rows Table) column in question response table  

**Column Name** : Column name of matrix set (Table) or matrix dynamic (Add Rows Table) column in question response table (e.g. Co-benefit area)

**Row Number** : Row number of matrix set (Table) or matrix dynamic (Add Rows Table) row in question response table. If originally submitted in a table format, this indicates the number of rows of response data that have been entered in response to a question.  

**Row Name** : Row name of matrix set (Table) or matrix dynamic (Add Rows Table) row in question response table. Description of data type for RowNumber where applicable (e.g. Population that is food insecure)  

**Response Answer** : Question response submitted by company (e.g. Greening the economy). Can range from string, integer and double data types. Question not applicable = This question was not presented to the company to be answered due to conditional logic in the questionnaire. NA = The company was presented with this question but did not respond.  

## Corporates

Below is a description of the columns for the **Corporate Disclosures**:

**account_number**: The unique identifier given to every company that receives a request to complete a CDP questionnaire.

**organization**: Name of the company disclosing.

**survey_year**: Disclosure cycle survey year. (E.g. survey year 2020 ran from March 2020 - September 2020)

**country**: Country in which the company is incorporated or legally registered.

**region**: CDP operating region in which the company is incorporated or legally registered.

**invitation_status**: CDP invites companies to disclose to the Investor request. If they choose to disclose, they will appear as "submitted".

**public**: Companies can submit CDP response in public status or in non public status. Non public responses can only be shared within CDP. Public responses can be shared beyond CDP investor signatories.

**samples**: CDP uses Market Cap from major indices and other environmental factors to help determine who should be requested to respond. Companies are distributed among sample groups to group similar organisations for targeted invitations to disclose etc. (e.g. Continuity: companies that disclosed the previous year are automatically requested to disclose the following year.)

**response_received_date**: DateTime company response was first received within CDP response systems (e.g. 2018-08-15T00:00:00Z).

**minimum_tier**: Indicates if the highest questionnaire tier a company has responded to. Companies can choose or are requested to submit to a shorter 'Minimum tier' questionnaire or a more in-depth 'Full tier' questionnaire with extended questions. Certain questions are therefore only available in the Full questionnaire.

**selected_tier**: Indicates if the questionnaire tier a company has responded to. Companies can choose or are requested to submit to a shorter 'Minimum tier' questionnaire or a more in-depth 'Full tier' questionnaire with extended questions. Certain questions are therefore only available in the Full questionnaire.

**questionnaire**: Questionnaire and questionnaire year the company's response relates to.

**theme**: Questionnaire Theme the company's response relates to.

**authority_types**: Companies can be requested to respond to the CDP questionnaire by either/both CDP investor signatories and CDP Supply Chain members as suppliers that constitute their supply chain operations.

**activities**: CDP Activity Classification System categorizes companies according to their different business streams, revenue and impact on the environment. All of the company's potential business activities based on revenue, within the CDP Activity Classification System (e.g. Aluminium refining, Aluminum, Engines & motors, Fabricated metal components, Other vehicle equipment & systems).

**sectors**: CDP Activity Classification System categorizes companies according to their different business streams, revenue and impact on the environment. All of the company's potential business sectors based on revenue, within the CDP Activity Classification System (e.g. Metal products manufacturing, Metal smelting, refining & forming, Powered machinery).

**industries**: CDP Activity Classification System categorizes companies according to their different business streams, revenue and impact on the environment. All of the company's potential business industries based on revenue, within the CDP Activity Classification System (e.g. Manufacturing, Materials).

**primary_activity**: CDP Activity Classification System categorizes companies according to their different business streams, revenue and impact on the environment. A company's primary business activity based on revenue; the most specific classification of three tiers in the CDP Activity Classification System.

**primary_sector**: CDP Activity Classification System categorizes companies according to their different business streams, revenue and impact on the environment. A company's primary business sector based on revenue; the second most specific classification of three tiers in the CDP Activity Classification System (e.g. Powered machinery)

**primary_industry**: CDP Activity Classification System categorizes companies according to their different business streams, revenue and impact on the environment. A company's primary industry based on revenue; the broadest classification of three tiers in the CDP Activity Classification System.

**primary_questionnaire_sector**: Describes the sector-specific questionnaire that was provided to the company based on their largest activity, if this version of the general questionnaire was available.

**primary_ticker**: Financial  Market identifier for company.

**tickers**: Market identifiers (if more than one).




Below is a description of the columns for the **Corporate Responses**:

**account_number**:	The unique identifier given to every company that receives a request to complete a CDP questionnaire.

**organization**: Name of the company disclosing.

**survey_year**: Disclosure cycle survey year. 

**response_received_date**:	DateTime company response was first received within CDP response systems.

**accounting_period_to**: Accounting year end for the survey responses provided by the Company.

**ors_response_id**: Response Identifier for all responses belonging to that company and theme.

**submission_date**: DateTime company response was finalised and submitted to CDP with no further amendments.

**page_name**:	Section of the CDP questionnaire the question belongs to.

**module_name**: Module ('Parent Section') of the CDP questionnaire the question belongs to, e.g. Questions.

**question_number**:	Question number of response.

**question_unique_reference**: Question name.

**column_number**: Column number of matrix set (Table) or matrix dynamic (Add Rows Table) column in question response table.

**column_name**: Column name of matrix set (Table) or matrix dynamic (Add Rows Table) column in question response table.

**table_columns_unique_reference**: Column name and number combination modified with '-' separator from column_name, providing unique column identifier for each question response.

**row_number**: Row number of matrix set (Table) or matrix dynamic (Add Rows Table) row in question response table. If originally submitted in a table format, this indicates the number of rows of response data that have been entered in response to a question.

**row_name**: Row name of matrix set (Table) or matrix dynamic (Add Rows Table) row in question response table. Description of data type for RowNumber where applicable (i.e. Scope 3 emissions category).

**data_point_name**: Question number_Column number_Question Name - Column Name string identifier. Unique reference to question numbers in older questionnaires.

**data_point_id**:	Unique identifier for Question Column Response.

**response_value**:	Question response submitted by company.

**comments**:	Added response clarifications from Company or CDP staff.

# Data Content

In [None]:
# Count the rows of the city responses for each distinct value of the 'Questionnaire' column.
df_cities_resp.value_counts('Questionnaire')

In [None]:
# Show the city response rows for question number "14.4" of the "Cities 2020" questionnaire.
df_cities_resp.query('QuestionNumber == "14.4" & Questionnaire == "Cities 2020"')

In [None]:
# Bar chart of the number of rows in df_cities_disc per 'CDPRegion',
# with the bars ordered by descending frequency (order of value_counts()).
# NOTE(review): `plt` and `sns` are not imported in the import cell at the top of
# this notebook — presumably matplotlib.pyplot and seaborn; confirm and add the
# imports there, otherwise this cell raises NameError on a fresh kernel.
plt.figure(figsize=(12,6))
ax = sns.countplot(x="CDPRegion",palette="viridis", data=df_cities_disc, order=df_cities_disc["CDPRegion"].value_counts().index)
# rotate the region labels on the x-axis by 70 degrees
plt.xticks(rotation=70)
# the trailing semicolon suppresses the text repr of the returned title object
plt.title("Survey Participation Distribution per Region",{'fontsize': 12});

In [None]:
def question_to_col(data, QuestionNumber, ColumnNumber):
    '''Return a one-column dataframe with all responses to one question column.

    Parameters
    ----------
    data : pd.DataFrame
        Responses in long format; must contain the columns 'QuestionNumber',
        'ColumnNumber' and 'ResponseAnswer'.
    QuestionNumber : str
        Value compared against data['QuestionNumber']; it is also used as
        the name of the single column of the result.
    ColumnNumber : int or str
        Converted with int() and compared against data['ColumnNumber'].

    Returns
    -------
    pd.DataFrame
        The matching 'ResponseAnswer' values under a column named after
        QuestionNumber. The rows keep their index labels from `data`; the
        frame is empty when nothing matches.
    '''
    wanted_rows = (
        (data['QuestionNumber'] == QuestionNumber)
        & (data['ColumnNumber'] == int(ColumnNumber))
    )
    # Select rows and column in one .loc call and convert the resulting Series
    # directly, instead of assigning it into an empty scaffold dataframe.
    return data.loc[wanted_rows, 'ResponseAnswer'].to_frame(name=QuestionNumber)

In [None]:
# responses of the cities to question 2.1, column 1
question_to_col(
    data=df_cities_resp,
    QuestionNumber='2.1',
    ColumnNumber='1',
)