# Intro to Human-Centered Data Science Final Project Workbook

In [1]:
# import libraries
import os
import pandas as pd

In [2]:
# Set max display cols/rows

# See all columns
pd.set_option('display.max_columns', None)

# See up to 200 rows
pd.set_option('display.max_rows', 200)

## Data Preprocessing

### Reading in the data

In [3]:
# set up imports from google drive
from google.colab import drive

drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
# root_dir is the path to your My_Drive folder.
root_dir = "/content/drive/My Drive/"

# i'm storing the data folder in the Colab Notebooks directory at the top level of the Google Drive
data_folder = root_dir + "Colab Notebooks/MPP Science Replication Package/Data/"

# change directory to the data folder
os.chdir(data_folder)

In [5]:
def dta_to_csv(dta_file_path):
  """
  takes a path to a .dta file and converts it to a .csv. The result
  is stored a subdirectory of the original directory named "/csv".

  Arguments:
    dta_file_path: Full path to the .dta file

  """

  # get the directory and file name from the full path
  directory_name, file_name = os.path.split(dta_file_path)

  # read the .dta file into a DataFrame
  print("Converting file", dta_file_path)
  data = pd.io.stata.read_stata(dta_file_path)

  # save the DataFrame as a ".csv" to the "csv" directory in the original path (directory_name).
  data.to_csv(directory_name + "/csv/" +  file_name +  ".csv")
  print(directory_name + "/csv/" +  file_name +  ".csv")

In [6]:
def convert_all_dta_to_csv(data_folder):
  """
  Traverse the directory that has path name data_folder. Look for
  .dta files and call dta_to_csv() to convert them to .csv
  """

  # Walk the directory structure
  for root, direc, files in os.walk(data_folder):
    for file in files:
      file_path = os.path.join(root, file)

      # split the file into its name and extension
      filename, extension = os.path.splitext(file)

      # does the file have a .dta extension
      if extension.lower() == ('.dta'):
        dir_path = root # store current directory

        # Create a 'csv' subdirectory if it doesn't exist
        csv_dir_path = os.path.join(dir_path, 'csv')
        if not os.path.exists(csv_dir_path):
          os.makedirs(csv_dir_path)

        # call the conversion function
        dta_to_csv(file_path)

# call the function to do the conversion
convert_all_dta_to_csv(data_folder)

Converting file /content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/Randomization and heterogeneity.dta
/content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/csv/Randomization and heterogeneity.dta.csv
Converting file /content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/CCTV/cctv_baseline data.dta
/content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/CCTV/csv/cctv_baseline data.dta.csv
Converting file /content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/CCTV/cctv_full data.dta
/content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/CCTV/csv/cctv_full data.dta.csv
Converting file /content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/User survey/user survey_endline data.dta
/content/drive/My Drive/Colab Notebooks/MPP Science Replication Package/Data/User survey/csv/user survey_endline data.dta.csv
Converting file /content/drive

In [7]:
randomization_and_heterogeneity = pd.read_csv(data_folder + "csv/Randomization and heterogeneity.dta.csv")
cctv_baseline = pd.read_csv(data_folder + "CCTV/csv/cctv_baseline data.dta.csv")
cctv_full= pd.read_csv(data_folder + "CCTV/csv/cctv_full data.dta.csv")
user_survey_endline = pd.read_csv(data_folder + "User survey/csv/user survey_endline data.dta.csv")
admin_long = pd.read_csv(data_folder + "Admin/csv/admin_long data.dta.csv")
admin_wide = pd.read_csv(data_folder + "Admin/csv/admin_wide data.dta.csv")
citizen_full = pd.read_csv(data_folder + "Citizen survey/csv/citizen_full data.dta.csv")
citizen_caw_rates = pd.read_csv(data_folder + "Citizen survey/csv/citizen_caw rates.dta.csv")
police_station_personnel = pd.read_csv(data_folder + "Police survey/csv/police_station personnel data.dta.csv")
police_full = pd.read_csv(data_folder + "Police survey/csv/police_full data.dta.csv")
police_baseline = pd.read_csv(data_folder + "Police survey/csv/police_baseline data.dta.csv")

In [8]:
# Check lengths and info of data to confirm data was read in
datasets = [randomization_and_heterogeneity, cctv_baseline, cctv_full, user_survey_endline, admin_long, admin_wide, citizen_full, citizen_caw_rates, police_station_personnel, police_full, police_baseline]
for df in datasets:
  print(len(df))
  print(df.info())
  print("\n\n")

180
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               180 non-null    int64  
 1   total_assigned_officers  180 non-null    int64  
 2   total_fir_2017           180 non-null    int64  
 3   population               180 non-null    int64  
 4   urban                    180 non-null    object 
 5   dist_urban               180 non-null    float64
 6   strat_pca                180 non-null    float64
 7   treatment                180 non-null    object 
 8   group                    180 non-null    object 
 9   dist_id                  180 non-null    float64
 10  implement_quality        119 non-null    float64
 11  training_score           180 non-null    float64
 12  comm_outreach_strength   119 non-null    float64
 13  regular_whd              180 non-null    float64
 14  women_whd             

### Explore and clean police baseline data
Potention Question 1:  Is there a relationship between officer gender and baseline perceptions of police’s role in CAW cases?

Police baseline data main source for this question.

In [9]:
police_baseline.head()

Unnamed: 0.1,Unnamed: 0,b_uid,gender,b_pol_impt,b_thana_impt,b_wcase,b_effective,b_helpful,b_add_officer,b_add_female,b_female_better,b_sensitivity,uid,attrit,ps_code,population,urban,dist_urban,strat_pca,treatment,group,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd
0,0,2240165,m,0.0,0.0,enough attention,very effective,helpful,more effective,much less effective,female,6.0,,1.0,1001.0,129345,Rural,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0
1,1,2240171,m,1.0,1.0,enough attention,effective,helpful,much more effective,less effective,no difference,7.0,,1.0,1001.0,129345,Rural,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0
2,2,2240167,m,1.0,1.0,too much attention,very effective,helpful,more effective,much less effective,female,6.0,2240139.0,0.0,1001.0,129345,Rural,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0
3,3,2240172,m,0.0,0.0,too much attention,effective,helpful,much more effective,less effective,female,8.0,2240138.0,0.0,1001.0,129345,Rural,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0
4,4,2240169,m,0.0,0.0,too much attention,effective,helpful,much more effective,much less effective,no difference,7.0,2240142.0,0.0,1001.0,129345,Rural,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0


In [10]:
# drop unnamed index column
# keep all other columns for now, unsure what will be relevant for analysis
police_baseline = police_baseline.drop(police_baseline.columns[0], axis=1)
police_baseline.head(1)

Unnamed: 0,b_uid,gender,b_pol_impt,b_thana_impt,b_wcase,b_effective,b_helpful,b_add_officer,b_add_female,b_female_better,b_sensitivity,uid,attrit,ps_code,population,urban,dist_urban,strat_pca,treatment,group,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd
0,2240165,m,0.0,0.0,enough attention,very effective,helpful,more effective,much less effective,female,6.0,,1.0,1001.0,129345,Rural,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0


In [11]:
# check data types of columns
display(police_baseline.info())

# look at describe, see which numerical values are actually binary
display(police_baseline.describe())
display(police_baseline.describe(include='object'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1950 entries, 0 to 1949
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   b_uid                   1950 non-null   int64  
 1   gender                  1950 non-null   object 
 2   b_pol_impt              1918 non-null   float64
 3   b_thana_impt            1924 non-null   float64
 4   b_wcase                 1948 non-null   object 
 5   b_effective             1943 non-null   object 
 6   b_helpful               1947 non-null   object 
 7   b_add_officer           1948 non-null   object 
 8   b_add_female            1949 non-null   object 
 9   b_female_better         1947 non-null   object 
 10  b_sensitivity           1945 non-null   float64
 11  uid                     1139 non-null   float64
 12  attrit                  1950 non-null   float64
 13  ps_code                 1950 non-null   float64
 14  population              1950 non-null   

None

Unnamed: 0,b_uid,b_pol_impt,b_thana_impt,b_sensitivity,uid,attrit,ps_code,population,dist_urban,strat_pca,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd
count,1950.0,1918.0,1924.0,1945.0,1139.0,1950.0,1950.0,1950.0,1950.0,1950.0,1950.0,1295.0,1950.0,1295.0,1950.0,1950.0
mean,2621709.0,0.383212,0.398129,7.03856,2601024.0,0.415897,5824.591795,129263.011795,8.540513,0.000986,5734.871795,7.453282,2.102163,1.795336,0.343077,0.326154
std,309352.0,0.486296,0.48964,2.45181,297854.8,0.493002,3132.057108,84928.087029,4.92969,1.371831,3081.077214,1.374003,1.23843,0.749557,0.474859,0.468925
min,2141761.0,0.0,0.0,1.0,2141733.0,0.0,1001.0,23565.0,1.0,-2.355807,1000.0,1.0,0.0,0.0,0.0,0.0
25%,2356266.0,0.0,0.0,5.0,2353640.0,0.0,3044.0,76376.0,4.0,-1.104892,3000.0,7.0,1.181818,1.2,0.0,0.0
50%,2553966.0,0.0,0.0,7.0,2551635.0,0.0,5089.0,111000.0,8.0,-0.130051,5000.0,8.0,2.166667,2.1,0.0,0.0
75%,2847365.0,1.0,1.0,8.0,2753742.0,1.0,8136.0,151916.0,12.0,0.997372,8000.0,8.0,3.311267,2.19,1.0,1.0
max,3257271.0,1.0,1.0,16.0,3257341.0,1.0,12180.0,545000.0,18.0,3.986043,12000.0,9.0,4.583334,3.33,1.0,1.0


Unnamed: 0,gender,b_wcase,b_effective,b_helpful,b_add_officer,b_add_female,b_female_better,urban,treatment,group
count,1950,1948,1943,1947,1948,1949,1947,1950,1950,1950
unique,2,3,5,5,5,5,3,2,2,3
top,m,too much attention,very effective,very helpful,much more effective,much less effective,female,Rural,Treatment,regular mhd
freq,1713,1328,1210,1211,969,1185,1541,1105,1305,669


In [12]:
# Change data types for necessary columns

# convert ids from numerical to string, make sure no decimals in text: b_uid, uid, ps_code, dist_id
police_baseline[['b_uid', 'uid', 'ps_code', 'dist_id']] = police_baseline[['b_uid', 'uid', 'ps_code', 'dist_id']].astype('Int64').astype('str')

# convert binary indicators to boolean: b_pol_impt, b_thana_impt, attrit, regular_whd, women_whd
police_baseline[['b_pol_impt', 'b_thana_impt', 'attrit', 'regular_whd', 'women_whd']] = police_baseline[['b_pol_impt', 'b_thana_impt', 'attrit', 'regular_whd', 'women_whd']].astype('bool')

# convert columns with a set option of values to category: dist_urban, gender, b_wcase, b_effective, b_helpful, b_add_officer, b_add_female, b_female_better, urban, treatment, group
police_baseline['dist_urban'] = police_baseline['dist_urban'].astype('category')
police_baseline[['gender', 'b_wcase', 'b_effective', 'b_helpful', 'b_add_officer', 'b_add_female', 'b_female_better', 'urban', 'treatment', 'group']] = police_baseline[['gender', 'b_wcase', 'b_effective', 'b_helpful', 'b_add_officer', 'b_add_female', 'b_female_better', 'urban', 'treatment', 'group']].astype('category')


In [13]:
# check data types now
display(police_baseline.info())
display(police_baseline.describe(include='category'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1950 entries, 0 to 1949
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   b_uid                   1950 non-null   object  
 1   gender                  1950 non-null   category
 2   b_pol_impt              1950 non-null   bool    
 3   b_thana_impt            1950 non-null   bool    
 4   b_wcase                 1948 non-null   category
 5   b_effective             1943 non-null   category
 6   b_helpful               1947 non-null   category
 7   b_add_officer           1948 non-null   category
 8   b_add_female            1949 non-null   category
 9   b_female_better         1947 non-null   category
 10  b_sensitivity           1945 non-null   float64 
 11  uid                     1950 non-null   object  
 12  attrit                  1950 non-null   bool    
 13  ps_code                 1950 non-null   object  
 14  population              

None

Unnamed: 0,gender,b_wcase,b_effective,b_helpful,b_add_officer,b_add_female,b_female_better,urban,dist_urban,treatment,group
count,1950,1948,1943,1947,1948,1949,1947,1950,1950.0,1950,1950
unique,2,3,5,5,5,5,3,2,18.0,2,3
top,m,too much attention,very effective,very helpful,much more effective,much less effective,female,Rural,4.0,Treatment,regular mhd
freq,1713,1328,1210,1211,969,1185,1541,1105,249.0,1305,669


In [14]:
# Create dummy indicators for the categorical values
police_baseline_dummies = pd.get_dummies(police_baseline[['b_wcase', 'b_effective', 'b_helpful', 'b_add_officer', 'b_add_female', 'b_female_better', 'urban', 'treatment']], prefix=['b_wcase', 'b_effective', 'b_helpful', 'b_add_officer', 'b_add_female', 'b_female_better', 'urban', 'treatment'], drop_first=True)
police_baseline_dummies.head()

Unnamed: 0,b_wcase_too little attention,b_wcase_too much attention,b_effective_ineffective,b_effective_neither effective nor ineffective,b_effective_very effective,b_effective_very ineffective,b_helpful_neither helpful nor unhelpful,b_helpful_unhelpful,b_helpful_very helpful,b_helpful_very unhelpful,b_add_officer_more effective,b_add_officer_much less effective,b_add_officer_much more effective,b_add_officer_no difference,b_add_female_more effective,b_add_female_much less effective,b_add_female_much more effective,b_add_female_no difference,b_female_better_male,b_female_better_no difference,urban_Urban,treatment_Treatment
0,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,True
2,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True
3,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True
4,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,True,False,True


In [15]:
# concat dummy columns
police_baseline_all = pd.concat([police_baseline, police_baseline_dummies], axis=1)
display(police_baseline_all.head())
display(police_baseline_all.info())

Unnamed: 0,b_uid,gender,b_pol_impt,b_thana_impt,b_wcase,b_effective,b_helpful,b_add_officer,b_add_female,b_female_better,b_sensitivity,uid,attrit,ps_code,population,urban,dist_urban,strat_pca,treatment,group,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd,b_wcase_too little attention,b_wcase_too much attention,b_effective_ineffective,b_effective_neither effective nor ineffective,b_effective_very effective,b_effective_very ineffective,b_helpful_neither helpful nor unhelpful,b_helpful_unhelpful,b_helpful_very helpful,b_helpful_very unhelpful,b_add_officer_more effective,b_add_officer_much less effective,b_add_officer_much more effective,b_add_officer_no difference,b_add_female_more effective,b_add_female_much less effective,b_add_female_much more effective,b_add_female_no difference,b_female_better_male,b_female_better_no difference,urban_Urban,treatment_Treatment
0,2240165,m,False,False,enough attention,very effective,helpful,more effective,much less effective,female,6.0,,True,1001,129345,Rural,1.0,-0.234041,Treatment,women officers,1000,9.0,2.916667,2.2,False,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True
1,2240171,m,True,True,enough attention,effective,helpful,much more effective,less effective,no difference,7.0,,True,1001,129345,Rural,1.0,-0.234041,Treatment,women officers,1000,9.0,2.916667,2.2,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,True
2,2240167,m,True,True,too much attention,very effective,helpful,more effective,much less effective,female,6.0,2240139.0,False,1001,129345,Rural,1.0,-0.234041,Treatment,women officers,1000,9.0,2.916667,2.2,False,True,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True
3,2240172,m,False,False,too much attention,effective,helpful,much more effective,less effective,female,8.0,2240138.0,False,1001,129345,Rural,1.0,-0.234041,Treatment,women officers,1000,9.0,2.916667,2.2,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True
4,2240169,m,False,False,too much attention,effective,helpful,much more effective,much less effective,no difference,7.0,2240142.0,False,1001,129345,Rural,1.0,-0.234041,Treatment,women officers,1000,9.0,2.916667,2.2,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,True,False,True


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1950 entries, 0 to 1949
Data columns (total 48 columns):
 #   Column                                         Non-Null Count  Dtype   
---  ------                                         --------------  -----   
 0   b_uid                                          1950 non-null   object  
 1   gender                                         1950 non-null   category
 2   b_pol_impt                                     1950 non-null   bool    
 3   b_thana_impt                                   1950 non-null   bool    
 4   b_wcase                                        1948 non-null   category
 5   b_effective                                    1943 non-null   category
 6   b_helpful                                      1947 non-null   category
 7   b_add_officer                                  1948 non-null   category
 8   b_add_female                                   1949 non-null   category
 9   b_female_better                          

None

### Explore and clean police full data

Potention Question 2: Does having a WHD impact the officers’ perceptions of police’s role in CAW cases?

Police full data main source for this question.

In [16]:
police_full.head()

Unnamed: 0.1,Unnamed: 0,uid,gender,e_wcase,e_effective,e_false_case,e_helpful,e_add_officer,e_add_female,e_female_better,e_taken_seriously,e_prof_dev,e_work_help,e_pol_impt,e_thana_impt,e_sensitivity,ps_code,population,urban,dist_urban,strat_pca,treatment,group,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd,b_wcase,b_effective,b_pol_impt,b_thana_impt,b_sensitivity,b_helpful,b_female_better,b_add_officer,b_add_female
0,0,2240137,male,too much attention,very effective,common,very helpful,more effective,Less effective,female,disagree,disagree,disagree,1.0,0.0,9.0,1001.0,129345,Rural,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0,2.7,4.5,0.5,0.4,7.3,4.2,2.7,4.5,4.6
1,1,2240133,male,too much attention,very effective,common,very helpful,much more effective,No difference,female,strongly agree,strongly agree,disagree,1.0,1.0,9.0,1001.0,129345,Rural,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0,2.7,4.5,0.5,0.4,7.3,4.2,2.7,4.5,4.6
2,2,2240136,male,too much attention,very effective,uncommon,very helpful,much more effective,Less effective,female,disagree,disagree,strongly disagree,0.0,0.0,12.0,1001.0,129345,Rural,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0,2.7,4.5,0.5,0.4,7.3,4.2,2.7,4.5,4.6
3,3,2240132,female,too much attention,very effective,common,helpful,much more effective,Less effective,female,agree,agree,disagree,1.0,1.0,9.0,1001.0,129345,Rural,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0,2.7,4.5,0.5,0.4,7.3,4.2,2.7,4.5,4.6
4,4,2240134,male,too much attention,very effective,very common,very helpful,much more effective,No difference,female,agree,disagree,disagree,1.0,1.0,7.0,1001.0,129345,Rural,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0,2.7,4.5,0.5,0.4,7.3,4.2,2.7,4.5,4.6


In [17]:
# drop unnamed index column
police_full = police_full.drop(police_full.columns[0], axis=1)
police_full.head(1)

Unnamed: 0,uid,gender,e_wcase,e_effective,e_false_case,e_helpful,e_add_officer,e_add_female,e_female_better,e_taken_seriously,e_prof_dev,e_work_help,e_pol_impt,e_thana_impt,e_sensitivity,ps_code,population,urban,dist_urban,strat_pca,treatment,group,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd,b_wcase,b_effective,b_pol_impt,b_thana_impt,b_sensitivity,b_helpful,b_female_better,b_add_officer,b_add_female
0,2240137,male,too much attention,very effective,common,very helpful,more effective,Less effective,female,disagree,disagree,disagree,1.0,0.0,9.0,1001.0,129345,Rural,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0,2.7,4.5,0.5,0.4,7.3,4.2,2.7,4.5,4.6


In [18]:
# check data types of columns
display(police_full.info())

# look at describe, see which numerical values are actually binary
display(police_full.describe())
display(police_full.describe(include='object'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Data columns (total 37 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uid                     1961 non-null   int64  
 1   gender                  1961 non-null   object 
 2   e_wcase                 1948 non-null   object 
 3   e_effective             1959 non-null   object 
 4   e_false_case            1932 non-null   object 
 5   e_helpful               1958 non-null   object 
 6   e_add_officer           1955 non-null   object 
 7   e_add_female            1956 non-null   object 
 8   e_female_better         1956 non-null   object 
 9   e_taken_seriously       1951 non-null   object 
 10  e_prof_dev              1951 non-null   object 
 11  e_work_help             1954 non-null   object 
 12  e_pol_impt              1961 non-null   float64
 13  e_thana_impt            1961 non-null   float64
 14  e_sensitivity           1961 non-null   

None

Unnamed: 0,uid,e_pol_impt,e_thana_impt,e_sensitivity,ps_code,population,dist_urban,strat_pca,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd,b_wcase,b_effective,b_pol_impt,b_thana_impt,b_sensitivity,b_helpful,b_female_better,b_add_officer,b_add_female
count,1961.0,1961.0,1961.0,1961.0,1961.0,1961.0,1961.0,1961.0,1961.0,1295.0,1961.0,1295.0,1961.0,1961.0,1961.0,1961.0,1961.0,1961.0,1961.0,1961.0,1961.0,1961.0,1961.0
mean,2621999.0,0.388577,0.45079,7.54411,5831.550739,130111.211117,8.565528,0.013613,5741.45844,7.488031,2.098749,1.810286,0.337073,0.328914,2.682977,4.611181,0.378731,0.395461,7.044323,4.619109,2.718035,4.428698,4.52998
std,307310.9,0.487551,0.497699,2.547752,3104.927667,86078.785966,4.890319,1.374626,3054.431327,1.346346,1.232393,0.743555,0.472831,0.469938,0.369369,0.436028,0.374259,0.376009,2.050444,0.417076,0.452414,0.497976,0.527032
min,2141731.0,0.0,0.0,0.0,1001.0,23565.0,1.0,-2.355807,1000.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
25%,2357536.0,0.0,0.0,6.0,3046.0,76376.0,4.0,-1.126372,3000.0,7.0,1.181818,1.2,0.0,0.0,2.5,4.363637,0.0,0.0,6.0,4.333334,2.636364,4.0,4.166666
50%,2554033.0,0.0,0.0,7.0,5090.0,111000.0,8.0,-0.120759,5000.0,8.0,2.166667,2.1,0.0,0.0,2.8,4.714286,0.307692,0.363636,7.0,4.75,2.923077,4.444445,4.636363
75%,2845339.0,1.0,1.0,9.0,8135.0,152000.0,12.0,0.997372,8000.0,8.0,3.311267,2.2,1.0,1.0,3.0,5.0,0.6,0.636364,8.0,5.0,3.0,5.0,5.0
max,3257341.0,1.0,1.0,19.0,12180.0,545000.0,18.0,3.986043,12000.0,9.0,4.583334,3.33,1.0,1.0,3.0,5.0,1.0,1.0,16.0,5.0,3.0,5.0,5.0


Unnamed: 0,gender,e_wcase,e_effective,e_false_case,e_helpful,e_add_officer,e_add_female,e_female_better,e_taken_seriously,e_prof_dev,e_work_help,urban,treatment,group
count,1961,1948,1959,1932,1958,1955,1956,1956,1951,1951,1954,1961,1961,1961
unique,2,3,4,4,4,3,4,3,4,4,4,2,2,3
top,male,too much attention,very effective,uncommon,very helpful,much more effective,Less effective,female,agree,agree,agree,Rural,Treatment,regular mhd
freq,1730,1446,1174,649,1314,1278,1269,1486,1341,1346,870,1098,1306,661


Interesting finding: baseline variables are numeric in full data- endline variables are categorical as baseline variables are in the baseline dataset

Why? What is the conversion? They are not all integers so that tells me they may be aggregated to some level then joined back on (ex: mean at district level joined back onto individual level full data) - perhaps for imputation of missing values

Will explore during EDA - for now, will keep float columns and create version of data that merges on baseline values using UID

In [19]:
# Change data types for necessary columns

# convert ids from numerical to string: uid, ps_code, dist_id
police_full[['uid', 'ps_code', 'dist_id']] = police_full[['uid', 'ps_code', 'dist_id']].astype('Int64').astype('str')

# convert binary indicators to boolean: b_pol_impt, b_thana_impt, e_pol_impt, e_thana_impt, regular_whd, women_whd
police_full[['b_pol_impt', 'b_thana_impt', 'e_pol_impt', 'e_thana_impt', 'regular_whd', 'women_whd']] = police_full[['b_pol_impt', 'b_thana_impt', 'e_pol_impt', 'e_thana_impt', 'regular_whd', 'women_whd']].astype('bool')

# convert columns with a set option of values to category: dist_urban, gender, b_wcase, b_effective, b_helpful, b_add_officer, b_add_female, b_female_better, urban, treatment, group
police_full['dist_urban'] = police_full['dist_urban'].astype('category')
police_full[['gender', 'e_wcase', 'e_effective', 'e_false_case', 'e_taken_seriously', 'e_prof_dev', 'e_work_help', 'e_helpful', 'e_add_officer', 'e_add_female', 'e_female_better', 'urban', 'treatment', 'group']] = police_full[['gender', 'e_wcase', 'e_effective', 'e_false_case', 'e_taken_seriously', 'e_prof_dev', 'e_work_help', 'e_helpful', 'e_add_officer', 'e_add_female', 'e_female_better', 'urban', 'treatment', 'group']] .astype('category')


In [20]:
# check data types now
display(police_full.info())
display(police_full.describe(include='category'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Data columns (total 37 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   uid                     1961 non-null   object  
 1   gender                  1961 non-null   category
 2   e_wcase                 1948 non-null   category
 3   e_effective             1959 non-null   category
 4   e_false_case            1932 non-null   category
 5   e_helpful               1958 non-null   category
 6   e_add_officer           1955 non-null   category
 7   e_add_female            1956 non-null   category
 8   e_female_better         1956 non-null   category
 9   e_taken_seriously       1951 non-null   category
 10  e_prof_dev              1951 non-null   category
 11  e_work_help             1954 non-null   category
 12  e_pol_impt              1961 non-null   bool    
 13  e_thana_impt            1961 non-null   bool    
 14  e_sensitivity           

None

Unnamed: 0,gender,e_wcase,e_effective,e_false_case,e_helpful,e_add_officer,e_add_female,e_female_better,e_taken_seriously,e_prof_dev,e_work_help,urban,dist_urban,treatment,group
count,1961,1948,1959,1932,1958,1955,1956,1956,1951,1951,1954,1961,1961.0,1961,1961
unique,2,3,4,4,4,3,4,3,4,4,4,2,18.0,2,3
top,male,too much attention,very effective,uncommon,very helpful,much more effective,Less effective,female,agree,agree,agree,Rural,4.0,Treatment,regular mhd
freq,1730,1446,1174,649,1314,1278,1269,1486,1341,1346,870,1098,232.0,1306,661


In [21]:
# Create dummy indicators for the categorical values, may be useful for comparisons, creating change from baseline to endline variables, etc
police_full_dummies = pd.get_dummies(police_full[['e_wcase', 'e_effective', 'e_false_case', 'e_taken_seriously', 'e_prof_dev', 'e_work_help', 'e_helpful', 'e_add_officer', 'e_add_female', 'e_female_better', 'urban', 'treatment']], prefix=['e_wcase', 'e_effective', 'e_false_case', 'e_taken_seriously', 'e_prof_dev', 'e_work_help', 'e_helpful', 'e_add_officer', 'e_add_female', 'e_female_better', 'urban', 'treatment'], drop_first=True)
police_full_dummies.head()

Unnamed: 0,e_wcase_too little attention,e_wcase_too much attention,e_effective_ineffective,e_effective_very effective,e_effective_very ineffective,e_false_case_uncommon,e_false_case_very common,e_false_case_very uncommon,e_taken_seriously_disagree,e_taken_seriously_strongly agree,e_taken_seriously_strongly disagree,e_prof_dev_disagree,e_prof_dev_strongly agree,e_prof_dev_strongly disagree,e_work_help_disagree,e_work_help_strongly agree,e_work_help_strongly disagree,e_helpful_unhelpful,e_helpful_very helpful,e_helpful_very unhelpful,e_add_officer_more effective,e_add_officer_much more effective,e_add_female_More Effective,e_add_female_Much more effective,e_add_female_No difference,e_female_better_male,e_female_better_no difference,urban_Urban,treatment_Treatment
0,False,True,False,True,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False,False,False,False,False,False,True
1,False,True,False,True,False,False,False,False,False,True,False,False,True,False,True,False,False,False,True,False,False,True,False,False,True,False,False,False,True
2,False,True,False,True,False,True,False,False,True,False,False,True,False,False,False,False,True,False,True,False,False,True,False,False,False,False,False,False,True
3,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True
4,False,True,False,True,False,False,True,False,False,False,False,True,False,False,True,False,False,False,True,False,False,True,False,False,True,False,False,False,True


In [22]:
# concat dummy columns
police_full_all = pd.concat([police_full, police_full_dummies], axis=1)
display(police_full_all.head())
display(police_full_all.info())

Unnamed: 0,uid,gender,e_wcase,e_effective,e_false_case,e_helpful,e_add_officer,e_add_female,e_female_better,e_taken_seriously,e_prof_dev,e_work_help,e_pol_impt,e_thana_impt,e_sensitivity,ps_code,population,urban,dist_urban,strat_pca,treatment,group,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd,b_wcase,b_effective,b_pol_impt,b_thana_impt,b_sensitivity,b_helpful,b_female_better,b_add_officer,b_add_female,e_wcase_too little attention,e_wcase_too much attention,e_effective_ineffective,e_effective_very effective,e_effective_very ineffective,e_false_case_uncommon,e_false_case_very common,e_false_case_very uncommon,e_taken_seriously_disagree,e_taken_seriously_strongly agree,e_taken_seriously_strongly disagree,e_prof_dev_disagree,e_prof_dev_strongly agree,e_prof_dev_strongly disagree,e_work_help_disagree,e_work_help_strongly agree,e_work_help_strongly disagree,e_helpful_unhelpful,e_helpful_very helpful,e_helpful_very unhelpful,e_add_officer_more effective,e_add_officer_much more effective,e_add_female_More Effective,e_add_female_Much more effective,e_add_female_No difference,e_female_better_male,e_female_better_no difference,urban_Urban,treatment_Treatment
0,2240137,male,too much attention,very effective,common,very helpful,more effective,Less effective,female,disagree,disagree,disagree,True,False,9.0,1001,129345,Rural,1.0,-0.234041,Treatment,women officers,1000,9.0,2.916667,2.2,False,True,2.7,4.5,True,True,7.3,4.2,2.7,4.5,4.6,False,True,False,True,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False,False,False,False,False,False,True
1,2240133,male,too much attention,very effective,common,very helpful,much more effective,No difference,female,strongly agree,strongly agree,disagree,True,True,9.0,1001,129345,Rural,1.0,-0.234041,Treatment,women officers,1000,9.0,2.916667,2.2,False,True,2.7,4.5,True,True,7.3,4.2,2.7,4.5,4.6,False,True,False,True,False,False,False,False,False,True,False,False,True,False,True,False,False,False,True,False,False,True,False,False,True,False,False,False,True
2,2240136,male,too much attention,very effective,uncommon,very helpful,much more effective,Less effective,female,disagree,disagree,strongly disagree,False,False,12.0,1001,129345,Rural,1.0,-0.234041,Treatment,women officers,1000,9.0,2.916667,2.2,False,True,2.7,4.5,True,True,7.3,4.2,2.7,4.5,4.6,False,True,False,True,False,True,False,False,True,False,False,True,False,False,False,False,True,False,True,False,False,True,False,False,False,False,False,False,True
3,2240132,female,too much attention,very effective,common,helpful,much more effective,Less effective,female,agree,agree,disagree,True,True,9.0,1001,129345,Rural,1.0,-0.234041,Treatment,women officers,1000,9.0,2.916667,2.2,False,True,2.7,4.5,True,True,7.3,4.2,2.7,4.5,4.6,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True
4,2240134,male,too much attention,very effective,very common,very helpful,much more effective,No difference,female,agree,disagree,disagree,True,True,7.0,1001,129345,Rural,1.0,-0.234041,Treatment,women officers,1000,9.0,2.916667,2.2,False,True,2.7,4.5,True,True,7.3,4.2,2.7,4.5,4.6,False,True,False,True,False,False,True,False,False,False,False,True,False,False,True,False,False,False,True,False,False,True,False,False,True,False,False,False,True


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Data columns (total 66 columns):
 #   Column                               Non-Null Count  Dtype   
---  ------                               --------------  -----   
 0   uid                                  1961 non-null   object  
 1   gender                               1961 non-null   category
 2   e_wcase                              1948 non-null   category
 3   e_effective                          1959 non-null   category
 4   e_false_case                         1932 non-null   category
 5   e_helpful                            1958 non-null   category
 6   e_add_officer                        1955 non-null   category
 7   e_add_female                         1956 non-null   category
 8   e_female_better                      1956 non-null   category
 9   e_taken_seriously                    1951 non-null   category
 10  e_prof_dev                           1951 non-null   category
 11  e_work_help      

None

### Merge full and baseline data on uid

Get just data with actual baseline data, original baseline values not converted or imputed

In [23]:
# Select and rename columns from baseline data we want to merge
police_baseline_merge_cols = [col for col in police_baseline_all if col.startswith("b_")]
police_baseline_merge_cols.append('uid')
police_baseline_merge_names = ['orig_' + col for col in police_baseline_merge_cols]

#Initialize dataframe
police_baseline_merge_df = pd.DataFrame()

#Create dataframe for merge
police_baseline_merge_df[police_baseline_merge_names] = police_baseline_all[police_baseline_merge_cols]
police_baseline_merge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1950 entries, 0 to 1949
Data columns (total 31 columns):
 #   Column                                              Non-Null Count  Dtype   
---  ------                                              --------------  -----   
 0   orig_b_uid                                          1950 non-null   object  
 1   orig_b_pol_impt                                     1950 non-null   bool    
 2   orig_b_thana_impt                                   1950 non-null   bool    
 3   orig_b_wcase                                        1948 non-null   category
 4   orig_b_effective                                    1943 non-null   category
 5   orig_b_helpful                                      1947 non-null   category
 6   orig_b_add_officer                                  1948 non-null   category
 7   orig_b_add_female                                   1949 non-null   category
 8   orig_b_female_better                                1947 non-null   

In [24]:
# merge full and baseline dataframes
# inner join as we only want data with both baseline and endline observations
print("Full data length:", len(police_full_all))
print("Baseline data length:", len(police_baseline_all))
police_merged_df = pd.merge(police_full_all, police_baseline_merge_df, how='inner', left_on = 'uid', right_on = 'orig_uid')
print("Merged data length:", len(police_merged_df))

Full data length: 1961
Baseline data length: 1950
Merged data length: 1068


In [25]:
# Look at merged data
display(police_merged_df.head())
display(police_merged_df.describe())

Unnamed: 0,uid,gender,e_wcase,e_effective,e_false_case,e_helpful,e_add_officer,e_add_female,e_female_better,e_taken_seriously,e_prof_dev,e_work_help,e_pol_impt,e_thana_impt,e_sensitivity,ps_code,population,urban,dist_urban,strat_pca,treatment,group,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd,b_wcase,b_effective,b_pol_impt,b_thana_impt,b_sensitivity,b_helpful,b_female_better,b_add_officer,b_add_female,e_wcase_too little attention,e_wcase_too much attention,e_effective_ineffective,e_effective_very effective,e_effective_very ineffective,e_false_case_uncommon,e_false_case_very common,e_false_case_very uncommon,e_taken_seriously_disagree,e_taken_seriously_strongly agree,e_taken_seriously_strongly disagree,e_prof_dev_disagree,e_prof_dev_strongly agree,e_prof_dev_strongly disagree,e_work_help_disagree,e_work_help_strongly agree,e_work_help_strongly disagree,e_helpful_unhelpful,e_helpful_very helpful,e_helpful_very unhelpful,e_add_officer_more effective,e_add_officer_much more effective,e_add_female_More Effective,e_add_female_Much more effective,e_add_female_No difference,e_female_better_male,e_female_better_no difference,urban_Urban,treatment_Treatment,orig_b_uid,orig_b_pol_impt,orig_b_thana_impt,orig_b_wcase,orig_b_effective,orig_b_helpful,orig_b_add_officer,orig_b_add_female,orig_b_female_better,orig_b_sensitivity,orig_b_wcase_too little attention,orig_b_wcase_too much attention,orig_b_effective_ineffective,orig_b_effective_neither effective nor ineffective,orig_b_effective_very effective,orig_b_effective_very ineffective,orig_b_helpful_neither helpful nor unhelpful,orig_b_helpful_unhelpful,orig_b_helpful_very helpful,orig_b_helpful_very unhelpful,orig_b_add_officer_more effective,orig_b_add_officer_much less effective,orig_b_add_officer_much more effective,orig_b_add_officer_no difference,orig_b_add_female_more effective,orig_b_add_female_much less effective,orig_b_add_female_much more effective,orig_b_add_female_no difference,orig_b_female_better_male,orig_b_female_better_no difference,orig_uid
0,2141733,male,too much attention,very effective,uncommon,very helpful,much more effective,Less effective,female,strongly agree,agree,agree,True,True,8.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,2.0,5.0,False,False,8.0,4.0,3.0,4.0,5.0,False,True,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,2141762,False,False,enough attention,very effective,helpful,more effective,much less effective,female,8.0,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,2141733
1,2141734,male,too much attention,effective,very uncommon,helpful,much more effective,Less effective,female,agree,strongly agree,agree,True,True,9.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,3.0,5.0,False,False,8.0,5.0,3.0,5.0,5.0,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,2141764,False,False,too much attention,very effective,very helpful,much more effective,much less effective,female,8.0,False,True,False,False,True,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,2141734
2,2141739,male,too much attention,very effective,very uncommon,very helpful,more effective,Less effective,female,agree,agree,agree,False,False,7.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,2.0,4.0,False,False,5.0,4.0,1.0,4.0,5.0,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,2141765,False,False,enough attention,effective,helpful,more effective,much less effective,male,5.0,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,2141739
3,2141742,male,too much attention,very effective,uncommon,helpful,much more effective,No difference,no difference,agree,agree,agree,True,False,5.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,3.0,5.0,False,False,8.0,4.0,1.0,4.0,5.0,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,True,2141766,False,False,too much attention,very effective,helpful,more effective,much less effective,male,8.0,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,2141742
4,2141740,male,enough attention,effective,uncommon,very helpful,much more effective,Less effective,female,strongly agree,strongly agree,strongly agree,False,True,7.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,3.0,5.0,False,False,6.0,4.0,3.0,5.0,4.0,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,True,2141767,False,False,too much attention,very effective,helpful,much more effective,less effective,female,6.0,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,2141740


Unnamed: 0,e_sensitivity,population,strat_pca,implement_quality,training_score,comm_outreach_strength,b_wcase,b_effective,b_sensitivity,b_helpful,b_female_better,b_add_officer,b_add_female,orig_b_sensitivity
count,1068.0,1068.0,1068.0,722.0,1068.0,722.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1064.0
mean,7.487828,133581.329588,0.090484,7.414127,2.10842,1.781371,2.698876,4.627637,7.096953,4.629838,2.734691,4.42603,4.542135,7.099624
std,2.510492,84697.892874,1.375452,1.357762,1.220826,0.75652,0.46882,0.556589,2.456894,0.520231,0.574283,0.62975,0.66588,2.460877
min,0.0,23565.0,-2.355807,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,6.0,80000.0,-1.054139,7.0,1.218532,1.2,2.0,4.0,5.0,4.0,3.0,4.0,4.0,5.0
50%,7.0,112937.0,0.018645,8.0,2.181818,2.1,3.0,5.0,7.0,5.0,3.0,4.0,5.0,7.0
75%,9.0,156626.0,1.067716,8.0,3.25974,2.18,3.0,5.0,8.0,5.0,3.0,5.0,5.0,8.0
max,19.0,545000.0,3.986043,9.0,4.583334,3.33,3.0,5.0,16.0,5.0,3.0,5.0,5.0,16.0


Baseline values are integers in the merged data - supports theory that the non-integer values in full data were due to imputation of missing values

In [26]:
# check which variables have nulls/how many
police_merged_df.isnull().sum()

# implement_quality and comm_outreach_strength both have 346 missing values
# these are station level variables - check station level data

Unnamed: 0,0
uid,0
gender,0
e_wcase,7
e_effective,1
e_false_case,17
e_helpful,1
e_add_officer,4
e_add_female,3
e_female_better,2
e_taken_seriously,5


In [27]:
display(randomization_and_heterogeneity.head())
display(randomization_and_heterogeneity.isnull().sum())
display(randomization_and_heterogeneity['treatment'].value_counts())
display(randomization_and_heterogeneity[randomization_and_heterogeneity['treatment']=='Treatment'].isnull().sum())

Unnamed: 0.1,Unnamed: 0,total_assigned_officers,total_fir_2017,population,urban,dist_urban,strat_pca,treatment,group,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd
0,0,47,412,129345,Rural,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0
1,1,29,218,170000,Rural,1.0,-0.960574,Control,control,1000.0,,1.545454,,0.0,0.0
2,2,75,694,102467,Rural,1.0,1.042663,Control,control,1000.0,,1.090909,,0.0,0.0
3,3,40,144,54000,Rural,1.0,-1.590761,Treatment,regular mhd,1000.0,7.0,3.897297,2.03,1.0,0.0
4,4,37,130,76376,Rural,1.0,-1.570695,Treatment,women officers,1000.0,9.0,4.157658,2.2,0.0,1.0


Unnamed: 0,0
Unnamed: 0,0
total_assigned_officers,0
total_fir_2017,0
population,0
urban,0
dist_urban,0
strat_pca,0
treatment,0
group,0
dist_id,0


Unnamed: 0_level_0,count
treatment,Unnamed: 1_level_1
Treatment,120
Control,60


Unnamed: 0,0
Unnamed: 0,0
total_assigned_officers,0
total_fir_2017,0
population,0
urban,0
dist_urban,0
strat_pca,0
treatment,0
group,0
dist_id,0


In [28]:
display(police_merged_df[police_merged_df['treatment']=='Treatment'].isnull().sum())

# Implementation and Comm outreach values are null for stations in the control group - this makes sense
# There is 1 station in the treatment group with null values for these variables which results in 7 records with missing values in this individual dataset
# Will likely be excluding these variables from any causal inference analysis as it does not exists for both experimental groups

Unnamed: 0,0
uid,0
gender,0
e_wcase,7
e_effective,0
e_false_case,12
e_helpful,1
e_add_officer,2
e_add_female,2
e_female_better,1
e_taken_seriously,3


### Bring in station level data to joined dataset

In [29]:
display(police_station_personnel.head())
display(police_station_personnel.info())

Unnamed: 0.1,Unnamed: 0,e_total_surveyed,e_female_surveyed,e_male_surveyed,ps_code,e_total_staff,e_female_staff,e_total_officers,e_female_officers,e_male_staff,e_male_officers,e_female_sho,b_total_staff,b_female_staff,b_male_staff,b_total_officers,b_female_officers,b_male_officers,b_female_sho,e_total_sampled,e_female_sampled,e_male_sampled,b_total_sampled,b_female_sampled,b_male_sampled,b_total_surveyed,b_female_surveyed,b_male_surveyed,population,urban,dist_urban,strat_pca,treatment,group,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd,e_male_weight,b_male_weight,e_female_weight,b_female_weight
0,0,12.0,1.0,11.0,1001.0,48.0,6.0,11.0,1.0,42.0,10.0,0.0,39.0,3.0,36.0,10.0,0.0,10.0,0.0,12.0,1.0,11.0,12.0,1.0,11.0,10.0,0.0,10.0,129345,Rural,1.0,-0.234041,Treatment,women officers,1000.0,9.0,2.916667,2.2,0.0,1.0,3.818182,3.272727,6.0,3.0
1,1,11.0,0.0,11.0,1002.0,29.0,2.0,6.0,0.0,27.0,6.0,0.0,20.0,1.0,19.0,7.0,1.0,6.0,0.0,11.0,0.0,11.0,12.0,1.0,11.0,11.0,0.0,11.0,170000,Rural,1.0,-0.960574,Control,control,1000.0,,1.545454,,0.0,0.0,2.454546,1.727273,,1.0
2,2,11.0,1.0,10.0,1003.0,79.0,6.0,13.0,2.0,73.0,11.0,0.0,56.0,7.0,49.0,12.0,2.0,10.0,0.0,12.0,2.0,10.0,12.0,2.0,10.0,11.0,2.0,9.0,102467,Rural,1.0,1.042663,Control,control,1000.0,,1.090909,,0.0,0.0,7.3,4.9,3.0,3.5
3,3,10.0,2.0,8.0,1004.0,33.0,3.0,7.0,1.0,30.0,6.0,0.0,22.0,4.0,18.0,4.0,1.0,3.0,0.0,13.0,3.0,10.0,13.0,3.0,10.0,13.0,3.0,10.0,54000,Rural,1.0,-1.590761,Treatment,regular mhd,1000.0,7.0,3.897297,2.03,1.0,0.0,3.0,1.8,1.0,1.333333
4,4,12.0,3.0,9.0,1005.0,32.0,7.0,4.0,2.0,25.0,2.0,1.0,28.0,3.0,25.0,6.0,0.0,6.0,0.0,12.0,3.0,9.0,12.0,2.0,10.0,12.0,2.0,10.0,76376,Rural,1.0,-1.570695,Treatment,women officers,1000.0,9.0,4.157658,2.2,0.0,1.0,2.777778,2.5,2.333333,1.5


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 44 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              180 non-null    int64  
 1   e_total_surveyed        180 non-null    float64
 2   e_female_surveyed       180 non-null    float64
 3   e_male_surveyed         180 non-null    float64
 4   ps_code                 180 non-null    float64
 5   e_total_staff           180 non-null    float64
 6   e_female_staff          180 non-null    float64
 7   e_total_officers        180 non-null    float64
 8   e_female_officers       180 non-null    float64
 9   e_male_staff            180 non-null    float64
 10  e_male_officers         180 non-null    float64
 11  e_female_sho            180 non-null    float64
 12  b_total_staff           180 non-null    float64
 13  b_female_staff          180 non-null    float64
 14  b_male_staff            180 non-null    fl

None

In [30]:
# Convert datatyoes

# IDs to strings
police_station_personnel[['ps_code', 'dist_id']] = police_station_personnel[['ps_code', 'dist_id']].astype('Int64').astype('str')

# binary to boolean
police_station_personnel[['b_female_sho', 'e_female_sho']] = police_station_personnel[['b_female_sho', 'e_female_sho']].astype('bool')


In [31]:
# Select just columns want to bring in (station level variables that are in this dataset and not in the individual level dataset)

station_cols_for_merge = [col for col in police_station_personnel if col.startswith(('b_', 'e_'))]
station_cols_for_merge.append('ps_code')
station_for_merge = police_station_personnel[station_cols_for_merge]

# Take a peek

station_for_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   e_total_surveyed   180 non-null    float64
 1   e_female_surveyed  180 non-null    float64
 2   e_male_surveyed    180 non-null    float64
 3   e_total_staff      180 non-null    float64
 4   e_female_staff     180 non-null    float64
 5   e_total_officers   180 non-null    float64
 6   e_female_officers  180 non-null    float64
 7   e_male_staff       180 non-null    float64
 8   e_male_officers    180 non-null    float64
 9   e_female_sho       180 non-null    bool   
 10  b_total_staff      180 non-null    float64
 11  b_female_staff     180 non-null    float64
 12  b_male_staff       180 non-null    float64
 13  b_total_officers   180 non-null    float64
 14  b_female_officers  180 non-null    float64
 15  b_male_officers    180 non-null    float64
 16  b_female_sho       180 non

In [32]:
# Merge with other dataset on ps_code
# Left merge - want to keep all records of individual level police dataset
print("Length of individual police dataset before merge: ", len(police_merged_df))
police_station_merge = pd.merge(police_merged_df, station_for_merge, how='left', left_on='ps_code', right_on='ps_code')
print("Length of data post-merge: ", len(police_station_merge))

Length of individual police dataset before merge:  1068
Length of data post-merge:  1068


In [33]:

# Take a peek
police_station_merge.head()

Unnamed: 0,uid,gender,e_wcase,e_effective,e_false_case,e_helpful,e_add_officer,e_add_female,e_female_better,e_taken_seriously,e_prof_dev,e_work_help,e_pol_impt,e_thana_impt,e_sensitivity,ps_code,population,urban,dist_urban,strat_pca,treatment,group,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd,b_wcase,b_effective,b_pol_impt,b_thana_impt,b_sensitivity,b_helpful,b_female_better,b_add_officer,b_add_female,e_wcase_too little attention,e_wcase_too much attention,e_effective_ineffective,e_effective_very effective,e_effective_very ineffective,e_false_case_uncommon,e_false_case_very common,e_false_case_very uncommon,e_taken_seriously_disagree,e_taken_seriously_strongly agree,e_taken_seriously_strongly disagree,e_prof_dev_disagree,e_prof_dev_strongly agree,e_prof_dev_strongly disagree,e_work_help_disagree,e_work_help_strongly agree,e_work_help_strongly disagree,e_helpful_unhelpful,e_helpful_very helpful,e_helpful_very unhelpful,e_add_officer_more effective,e_add_officer_much more effective,e_add_female_More Effective,e_add_female_Much more effective,e_add_female_No difference,e_female_better_male,e_female_better_no difference,urban_Urban,treatment_Treatment,orig_b_uid,orig_b_pol_impt,orig_b_thana_impt,orig_b_wcase,orig_b_effective,orig_b_helpful,orig_b_add_officer,orig_b_add_female,orig_b_female_better,orig_b_sensitivity,orig_b_wcase_too little attention,orig_b_wcase_too much attention,orig_b_effective_ineffective,orig_b_effective_neither effective nor ineffective,orig_b_effective_very effective,orig_b_effective_very ineffective,orig_b_helpful_neither helpful nor unhelpful,orig_b_helpful_unhelpful,orig_b_helpful_very helpful,orig_b_helpful_very unhelpful,orig_b_add_officer_more effective,orig_b_add_officer_much less effective,orig_b_add_officer_much more effective,orig_b_add_officer_no difference,orig_b_add_female_more effective,orig_b_add_female_much less effective,orig_b_add_female_much more effective,orig_b_add_female_no difference,orig_b_female_better_male,orig_b_female_better_no difference,orig_uid,e_total_surveyed,e_female_surveyed,e_male_surveyed,e_total_staff,e_female_staff,e_total_officers,e_female_officers,e_male_staff,e_male_officers,e_female_sho,b_total_staff,b_female_staff,b_male_staff,b_total_officers,b_female_officers,b_male_officers,b_female_sho,e_total_sampled,e_female_sampled,e_male_sampled,b_total_sampled,b_female_sampled,b_male_sampled,b_total_surveyed,b_female_surveyed,b_male_surveyed,e_male_weight,b_male_weight,e_female_weight,b_female_weight
0,2141733,male,too much attention,very effective,uncommon,very helpful,much more effective,Less effective,female,strongly agree,agree,agree,True,True,8.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,2.0,5.0,False,False,8.0,4.0,3.0,4.0,5.0,False,True,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,2141762,False,False,enough attention,very effective,helpful,more effective,much less effective,female,8.0,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,2141733,12.0,0.0,12.0,43.0,3.0,10.0,1.0,40.0,9.0,False,24.0,0.0,24.0,7.0,0.0,7.0,False,12.0,0.0,12.0,12.0,0.0,12.0,12.0,0.0,12.0,3.333333,2.0,,
1,2141734,male,too much attention,effective,very uncommon,helpful,much more effective,Less effective,female,agree,strongly agree,agree,True,True,9.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,3.0,5.0,False,False,8.0,5.0,3.0,5.0,5.0,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,2141764,False,False,too much attention,very effective,very helpful,much more effective,much less effective,female,8.0,False,True,False,False,True,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,2141734,12.0,0.0,12.0,43.0,3.0,10.0,1.0,40.0,9.0,False,24.0,0.0,24.0,7.0,0.0,7.0,False,12.0,0.0,12.0,12.0,0.0,12.0,12.0,0.0,12.0,3.333333,2.0,,
2,2141739,male,too much attention,very effective,very uncommon,very helpful,more effective,Less effective,female,agree,agree,agree,False,False,7.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,2.0,4.0,False,False,5.0,4.0,1.0,4.0,5.0,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,2141765,False,False,enough attention,effective,helpful,more effective,much less effective,male,5.0,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,2141739,12.0,0.0,12.0,43.0,3.0,10.0,1.0,40.0,9.0,False,24.0,0.0,24.0,7.0,0.0,7.0,False,12.0,0.0,12.0,12.0,0.0,12.0,12.0,0.0,12.0,3.333333,2.0,,
3,2141742,male,too much attention,very effective,uncommon,helpful,much more effective,No difference,no difference,agree,agree,agree,True,False,5.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,3.0,5.0,False,False,8.0,4.0,1.0,4.0,5.0,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,True,2141766,False,False,too much attention,very effective,helpful,more effective,much less effective,male,8.0,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,2141742,12.0,0.0,12.0,43.0,3.0,10.0,1.0,40.0,9.0,False,24.0,0.0,24.0,7.0,0.0,7.0,False,12.0,0.0,12.0,12.0,0.0,12.0,12.0,0.0,12.0,3.333333,2.0,,
4,2141740,male,enough attention,effective,uncommon,very helpful,much more effective,Less effective,female,strongly agree,strongly agree,strongly agree,False,True,7.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,3.0,5.0,False,False,6.0,4.0,3.0,5.0,4.0,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,True,2141767,False,False,too much attention,very effective,helpful,much more effective,less effective,female,6.0,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,2141740,12.0,0.0,12.0,43.0,3.0,10.0,1.0,40.0,9.0,False,24.0,0.0,24.0,7.0,0.0,7.0,False,12.0,0.0,12.0,12.0,0.0,12.0,12.0,0.0,12.0,3.333333,2.0,,


### Create numeric versions of endline survey variables

In [34]:
# Look at the possible values for these columns
for col in police_station_merge:
  if (police_station_merge[col].dtype== 'category' and col.startswith('e_')):
    display(police_station_merge[col].value_counts())

for col in police_station_merge:
  if (police_station_merge[col].dtype== 'category' and col.startswith('orig_b')):
    display(police_station_merge[col].value_counts())
    display(police_station_merge[col[5:]].value_counts())

Unnamed: 0_level_0,count
e_wcase,Unnamed: 1_level_1
too much attention,783
enough attention,275
too little attention,3


Unnamed: 0_level_0,count
e_effective,Unnamed: 1_level_1
very effective,629
effective,433
ineffective,3
very ineffective,2


Unnamed: 0_level_0,count
e_false_case,Unnamed: 1_level_1
uncommon,364
common,265
very uncommon,214
very common,208


Unnamed: 0_level_0,count
e_helpful,Unnamed: 1_level_1
very helpful,710
helpful,352
unhelpful,3
very unhelpful,2


Unnamed: 0_level_0,count
e_add_officer,Unnamed: 1_level_1
much more effective,684
more effective,376
less effective,4


Unnamed: 0_level_0,count
e_add_female,Unnamed: 1_level_1
Less effective,680
No difference,372
More Effective,13
Much more effective,0


Unnamed: 0_level_0,count
e_female_better,Unnamed: 1_level_1
female,826
no difference,189
male,51


Unnamed: 0_level_0,count
e_taken_seriously,Unnamed: 1_level_1
agree,743
strongly agree,296
disagree,21
strongly disagree,3


Unnamed: 0_level_0,count
e_prof_dev,Unnamed: 1_level_1
agree,744
strongly agree,226
disagree,85
strongly disagree,10


Unnamed: 0_level_0,count
e_work_help,Unnamed: 1_level_1
agree,457
disagree,438
strongly disagree,111
strongly agree,58


Unnamed: 0_level_0,count
orig_b_wcase,Unnamed: 1_level_1
too much attention,750
enough attention,311
too little attention,5


Unnamed: 0_level_0,count
b_wcase,Unnamed: 1_level_1
3.0,751
2.0,311
1.0,5
2.4,1


Unnamed: 0_level_0,count
orig_b_effective,Unnamed: 1_level_1
very effective,695
effective,351
neither effective nor ineffective,11
very ineffective,4
ineffective,2


Unnamed: 0_level_0,count
b_effective,Unnamed: 1_level_1
5.0,695
4.0,351
3.0,11
1.0,4
2.0,2
4.454546,1
4.5,1
4.636363,1
4.625,1
4.1,1


Unnamed: 0_level_0,count
orig_b_helpful,Unnamed: 1_level_1
very helpful,686
helpful,369
neither helpful nor unhelpful,8
unhelpful,2
very unhelpful,1


Unnamed: 0_level_0,count
b_helpful,Unnamed: 1_level_1
5.0,687
4.0,369
3.0,8
2.0,2
1.0,1
4.666666,1


Unnamed: 0_level_0,count
orig_b_add_officer,Unnamed: 1_level_1
much more effective,523
more effective,488
no difference,48
less effective,7
much less effective,2


Unnamed: 0_level_0,count
b_add_officer,Unnamed: 1_level_1
5.0,523
4.0,488
3.0,48
2.0,7
1.0,2


Unnamed: 0_level_0,count
orig_b_add_female,Unnamed: 1_level_1
much less effective,663
less effective,337
no difference,56
more effective,8
much more effective,4


Unnamed: 0_level_0,count
b_add_female,Unnamed: 1_level_1
5.0,663
4.0,337
3.0,56
2.0,8
1.0,4


Unnamed: 0_level_0,count
orig_b_female_better,Unnamed: 1_level_1
female,855
no difference,139
male,72


Unnamed: 0_level_0,count
b_female_better,Unnamed: 1_level_1
3.0,855
2.0,139
1.0,72
2.9,1
2.75,1


In [35]:
# create dictionaries for mapping
wcase = {'not enough attention':1,'too little attention':1, 'enough attention':2, 'too much attention':3}
effective = {'very ineffective':1, 'ineffective':2, 'neither effective nor ineffective':3, 'effective':4, 'very effective':5}
false_case = {'very uncommon':1, 'uncommon':2, 'common':3, 'very common':4}
helpful = {'very unhelpful':1, 'unhelpful':2, 'neither helpful nor unhelpful':3, 'helpful':4, 'very helpful':5}
add_officer = {'much less effective':1, 'less effective':2, 'no difference':3, 'more effective':4, 'much more effective':5}
add_female = {'much less effective':5, 'less effective':4, 'no difference':3, 'more effective':2, 'much more effective':1}
female_better = {'male':1, 'no difference':2, 'female':3}
taken_seriously = {'strongly disagree':1, 'disagree':2, 'agree':3, 'strongly agree':4}
prof_dev = {'strongly disagree':1, 'disagree':2, 'agree':3, 'strongly agree':4}
work_help = {'strongly disagree':1, 'disagree':2, 'agree':3, 'strongly agree':4}

In [36]:
# map to new columns for endline and baseline to ensure consistency
# ensure lowercase for proper mapping
police_station_merge['e_wcase_num'] = police_station_merge['e_wcase'].str.lower().map(wcase, na_action='ignore')
police_station_merge['e_effective_num'] = police_station_merge['e_effective'].str.lower().map(effective, na_action='ignore')
police_station_merge['e_false_case_num'] = police_station_merge['e_false_case'].str.lower().map(false_case, na_action='ignore')
police_station_merge['e_helpful_num'] = police_station_merge['e_helpful'].str.lower().map(helpful, na_action='ignore')
police_station_merge['e_add_officer_num'] = police_station_merge['e_add_officer'].str.lower().map(add_officer, na_action='ignore')
police_station_merge['e_add_female_num'] = police_station_merge['e_add_female'].str.lower().map(add_female, na_action='ignore')
police_station_merge['e_female_better_num'] = police_station_merge['e_female_better'].str.lower().map(female_better, na_action='ignore')
police_station_merge['e_taken_seriously_num'] = police_station_merge['e_taken_seriously'].str.lower().map(taken_seriously, na_action='ignore')
police_station_merge['e_prof_dev_num'] = police_station_merge['e_prof_dev'].str.lower().map(prof_dev, na_action='ignore')
police_station_merge['e_work_help_num'] = police_station_merge['e_work_help'].str.lower().map(work_help, na_action='ignore')

police_station_merge['b_wcase_num'] = police_station_merge['orig_b_wcase'].str.lower().map(wcase, na_action='ignore')
police_station_merge['b_effective_num'] = police_station_merge['orig_b_effective'].str.lower().map(effective, na_action='ignore')
police_station_merge['b_helpful_num'] = police_station_merge['orig_b_helpful'].str.lower().map(helpful, na_action='ignore')
police_station_merge['b_add_officer_num'] = police_station_merge['orig_b_add_officer'].str.lower().map(add_officer, na_action='ignore')
police_station_merge['b_add_female_num'] = police_station_merge['orig_b_add_female'].str.lower().map(add_female, na_action='ignore')
police_station_merge['b_female_better_num'] = police_station_merge['orig_b_female_better'].str.lower().map(female_better, na_action='ignore')


In [37]:
police_station_merge.head()

Unnamed: 0,uid,gender,e_wcase,e_effective,e_false_case,e_helpful,e_add_officer,e_add_female,e_female_better,e_taken_seriously,e_prof_dev,e_work_help,e_pol_impt,e_thana_impt,e_sensitivity,ps_code,population,urban,dist_urban,strat_pca,treatment,group,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd,b_wcase,b_effective,b_pol_impt,b_thana_impt,b_sensitivity,b_helpful,b_female_better,b_add_officer,b_add_female,e_wcase_too little attention,e_wcase_too much attention,e_effective_ineffective,e_effective_very effective,e_effective_very ineffective,e_false_case_uncommon,e_false_case_very common,e_false_case_very uncommon,e_taken_seriously_disagree,e_taken_seriously_strongly agree,e_taken_seriously_strongly disagree,e_prof_dev_disagree,e_prof_dev_strongly agree,e_prof_dev_strongly disagree,e_work_help_disagree,e_work_help_strongly agree,e_work_help_strongly disagree,e_helpful_unhelpful,e_helpful_very helpful,e_helpful_very unhelpful,e_add_officer_more effective,e_add_officer_much more effective,e_add_female_More Effective,e_add_female_Much more effective,e_add_female_No difference,e_female_better_male,e_female_better_no difference,urban_Urban,treatment_Treatment,orig_b_uid,orig_b_pol_impt,orig_b_thana_impt,orig_b_wcase,orig_b_effective,orig_b_helpful,orig_b_add_officer,orig_b_add_female,orig_b_female_better,orig_b_sensitivity,orig_b_wcase_too little attention,orig_b_wcase_too much attention,orig_b_effective_ineffective,orig_b_effective_neither effective nor ineffective,orig_b_effective_very effective,orig_b_effective_very ineffective,orig_b_helpful_neither helpful nor unhelpful,orig_b_helpful_unhelpful,orig_b_helpful_very helpful,orig_b_helpful_very unhelpful,orig_b_add_officer_more effective,orig_b_add_officer_much less effective,orig_b_add_officer_much more effective,orig_b_add_officer_no difference,orig_b_add_female_more effective,orig_b_add_female_much less effective,orig_b_add_female_much more effective,orig_b_add_female_no difference,orig_b_female_better_male,orig_b_female_better_no difference,orig_uid,e_total_surveyed,e_female_surveyed,e_male_surveyed,e_total_staff,e_female_staff,e_total_officers,e_female_officers,e_male_staff,e_male_officers,e_female_sho,b_total_staff,b_female_staff,b_male_staff,b_total_officers,b_female_officers,b_male_officers,b_female_sho,e_total_sampled,e_female_sampled,e_male_sampled,b_total_sampled,b_female_sampled,b_male_sampled,b_total_surveyed,b_female_surveyed,b_male_surveyed,e_male_weight,b_male_weight,e_female_weight,b_female_weight,e_wcase_num,e_effective_num,e_false_case_num,e_helpful_num,e_add_officer_num,e_add_female_num,e_female_better_num,e_taken_seriously_num,e_prof_dev_num,e_work_help_num,b_wcase_num,b_effective_num,b_helpful_num,b_add_officer_num,b_add_female_num,b_female_better_num
0,2141733,male,too much attention,very effective,uncommon,very helpful,much more effective,Less effective,female,strongly agree,agree,agree,True,True,8.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,2.0,5.0,False,False,8.0,4.0,3.0,4.0,5.0,False,True,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,2141762,False,False,enough attention,very effective,helpful,more effective,much less effective,female,8.0,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,2141733,12.0,0.0,12.0,43.0,3.0,10.0,1.0,40.0,9.0,False,24.0,0.0,24.0,7.0,0.0,7.0,False,12.0,0.0,12.0,12.0,0.0,12.0,12.0,0.0,12.0,3.333333,2.0,,,3.0,5.0,2.0,5.0,5.0,4.0,3.0,4.0,3.0,3.0,2.0,5.0,4.0,4,5,3.0
1,2141734,male,too much attention,effective,very uncommon,helpful,much more effective,Less effective,female,agree,strongly agree,agree,True,True,9.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,3.0,5.0,False,False,8.0,5.0,3.0,5.0,5.0,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,2141764,False,False,too much attention,very effective,very helpful,much more effective,much less effective,female,8.0,False,True,False,False,True,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,2141734,12.0,0.0,12.0,43.0,3.0,10.0,1.0,40.0,9.0,False,24.0,0.0,24.0,7.0,0.0,7.0,False,12.0,0.0,12.0,12.0,0.0,12.0,12.0,0.0,12.0,3.333333,2.0,,,3.0,4.0,1.0,4.0,5.0,4.0,3.0,3.0,4.0,3.0,3.0,5.0,5.0,5,5,3.0
2,2141739,male,too much attention,very effective,very uncommon,very helpful,more effective,Less effective,female,agree,agree,agree,False,False,7.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,2.0,4.0,False,False,5.0,4.0,1.0,4.0,5.0,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,2141765,False,False,enough attention,effective,helpful,more effective,much less effective,male,5.0,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,2141739,12.0,0.0,12.0,43.0,3.0,10.0,1.0,40.0,9.0,False,24.0,0.0,24.0,7.0,0.0,7.0,False,12.0,0.0,12.0,12.0,0.0,12.0,12.0,0.0,12.0,3.333333,2.0,,,3.0,5.0,1.0,5.0,4.0,4.0,3.0,3.0,3.0,3.0,2.0,4.0,4.0,4,5,1.0
3,2141742,male,too much attention,very effective,uncommon,helpful,much more effective,No difference,no difference,agree,agree,agree,True,False,5.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,3.0,5.0,False,False,8.0,4.0,1.0,4.0,5.0,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,True,2141766,False,False,too much attention,very effective,helpful,more effective,much less effective,male,8.0,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,2141742,12.0,0.0,12.0,43.0,3.0,10.0,1.0,40.0,9.0,False,24.0,0.0,24.0,7.0,0.0,7.0,False,12.0,0.0,12.0,12.0,0.0,12.0,12.0,0.0,12.0,3.333333,2.0,,,3.0,5.0,2.0,4.0,5.0,3.0,2.0,3.0,3.0,3.0,3.0,5.0,4.0,4,5,1.0
4,2141740,male,enough attention,effective,uncommon,very helpful,much more effective,Less effective,female,strongly agree,strongly agree,strongly agree,False,True,7.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,3.0,5.0,False,False,6.0,4.0,3.0,5.0,4.0,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,True,2141767,False,False,too much attention,very effective,helpful,much more effective,less effective,female,6.0,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,2141740,12.0,0.0,12.0,43.0,3.0,10.0,1.0,40.0,9.0,False,24.0,0.0,24.0,7.0,0.0,7.0,False,12.0,0.0,12.0,12.0,0.0,12.0,12.0,0.0,12.0,3.333333,2.0,,,2.0,4.0,2.0,5.0,5.0,4.0,3.0,4.0,4.0,4.0,3.0,5.0,4.0,5,4,3.0


### Calculate difference endline-baseline

In [38]:
police_station_merge['wcase_change'] = police_station_merge['e_wcase_num'] - police_station_merge['b_wcase_num']
police_station_merge['effective_change'] = police_station_merge['e_effective_num'] - police_station_merge['b_effective_num']
police_station_merge['helpful_change'] = police_station_merge['e_helpful_num'] - police_station_merge['b_helpful_num']
police_station_merge['add_officer_change'] = police_station_merge['e_add_officer_num'] - police_station_merge['b_add_officer_num']
police_station_merge['add_female_change'] = police_station_merge['e_add_female_num'] - police_station_merge['b_add_female_num']
police_station_merge['female_better_change'] = police_station_merge['e_female_better_num'] -police_station_merge['b_female_better_num']

In [39]:
police_station_merge.head()

Unnamed: 0,uid,gender,e_wcase,e_effective,e_false_case,e_helpful,e_add_officer,e_add_female,e_female_better,e_taken_seriously,e_prof_dev,e_work_help,e_pol_impt,e_thana_impt,e_sensitivity,ps_code,population,urban,dist_urban,strat_pca,treatment,group,dist_id,implement_quality,training_score,comm_outreach_strength,regular_whd,women_whd,b_wcase,b_effective,b_pol_impt,b_thana_impt,b_sensitivity,b_helpful,b_female_better,b_add_officer,b_add_female,e_wcase_too little attention,e_wcase_too much attention,e_effective_ineffective,e_effective_very effective,e_effective_very ineffective,e_false_case_uncommon,e_false_case_very common,e_false_case_very uncommon,e_taken_seriously_disagree,e_taken_seriously_strongly agree,e_taken_seriously_strongly disagree,e_prof_dev_disagree,e_prof_dev_strongly agree,e_prof_dev_strongly disagree,e_work_help_disagree,e_work_help_strongly agree,e_work_help_strongly disagree,e_helpful_unhelpful,e_helpful_very helpful,e_helpful_very unhelpful,e_add_officer_more effective,e_add_officer_much more effective,e_add_female_More Effective,e_add_female_Much more effective,e_add_female_No difference,e_female_better_male,e_female_better_no difference,urban_Urban,treatment_Treatment,orig_b_uid,orig_b_pol_impt,orig_b_thana_impt,orig_b_wcase,orig_b_effective,orig_b_helpful,orig_b_add_officer,orig_b_add_female,orig_b_female_better,orig_b_sensitivity,orig_b_wcase_too little attention,orig_b_wcase_too much attention,orig_b_effective_ineffective,orig_b_effective_neither effective nor ineffective,orig_b_effective_very effective,orig_b_effective_very ineffective,orig_b_helpful_neither helpful nor unhelpful,orig_b_helpful_unhelpful,orig_b_helpful_very helpful,orig_b_helpful_very unhelpful,orig_b_add_officer_more effective,orig_b_add_officer_much less effective,orig_b_add_officer_much more effective,orig_b_add_officer_no difference,orig_b_add_female_more effective,orig_b_add_female_much less effective,orig_b_add_female_much more effective,orig_b_add_female_no difference,orig_b_female_better_male,orig_b_female_better_no difference,orig_uid,e_total_surveyed,e_female_surveyed,e_male_surveyed,e_total_staff,e_female_staff,e_total_officers,e_female_officers,e_male_staff,e_male_officers,e_female_sho,b_total_staff,b_female_staff,b_male_staff,b_total_officers,b_female_officers,b_male_officers,b_female_sho,e_total_sampled,e_female_sampled,e_male_sampled,b_total_sampled,b_female_sampled,b_male_sampled,b_total_surveyed,b_female_surveyed,b_male_surveyed,e_male_weight,b_male_weight,e_female_weight,b_female_weight,e_wcase_num,e_effective_num,e_false_case_num,e_helpful_num,e_add_officer_num,e_add_female_num,e_female_better_num,e_taken_seriously_num,e_prof_dev_num,e_work_help_num,b_wcase_num,b_effective_num,b_helpful_num,b_add_officer_num,b_add_female_num,b_female_better_num,wcase_change,effective_change,helpful_change,add_officer_change,add_female_change,female_better_change
0,2141733,male,too much attention,very effective,uncommon,very helpful,much more effective,Less effective,female,strongly agree,agree,agree,True,True,8.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,2.0,5.0,False,False,8.0,4.0,3.0,4.0,5.0,False,True,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,2141762,False,False,enough attention,very effective,helpful,more effective,much less effective,female,8.0,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,2141733,12.0,0.0,12.0,43.0,3.0,10.0,1.0,40.0,9.0,False,24.0,0.0,24.0,7.0,0.0,7.0,False,12.0,0.0,12.0,12.0,0.0,12.0,12.0,0.0,12.0,3.333333,2.0,,,3.0,5.0,2.0,5.0,5.0,4.0,3.0,4.0,3.0,3.0,2.0,5.0,4.0,4,5,3.0,1.0,0.0,1.0,1.0,-1.0,0.0
1,2141734,male,too much attention,effective,very uncommon,helpful,much more effective,Less effective,female,agree,strongly agree,agree,True,True,9.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,3.0,5.0,False,False,8.0,5.0,3.0,5.0,5.0,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,2141764,False,False,too much attention,very effective,very helpful,much more effective,much less effective,female,8.0,False,True,False,False,True,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,2141734,12.0,0.0,12.0,43.0,3.0,10.0,1.0,40.0,9.0,False,24.0,0.0,24.0,7.0,0.0,7.0,False,12.0,0.0,12.0,12.0,0.0,12.0,12.0,0.0,12.0,3.333333,2.0,,,3.0,4.0,1.0,4.0,5.0,4.0,3.0,3.0,4.0,3.0,3.0,5.0,5.0,5,5,3.0,0.0,-1.0,-1.0,0.0,-1.0,0.0
2,2141739,male,too much attention,very effective,very uncommon,very helpful,more effective,Less effective,female,agree,agree,agree,False,False,7.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,2.0,4.0,False,False,5.0,4.0,1.0,4.0,5.0,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,2141765,False,False,enough attention,effective,helpful,more effective,much less effective,male,5.0,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,2141739,12.0,0.0,12.0,43.0,3.0,10.0,1.0,40.0,9.0,False,24.0,0.0,24.0,7.0,0.0,7.0,False,12.0,0.0,12.0,12.0,0.0,12.0,12.0,0.0,12.0,3.333333,2.0,,,3.0,5.0,1.0,5.0,4.0,4.0,3.0,3.0,3.0,3.0,2.0,4.0,4.0,4,5,1.0,1.0,1.0,1.0,0.0,-1.0,2.0
3,2141742,male,too much attention,very effective,uncommon,helpful,much more effective,No difference,no difference,agree,agree,agree,True,False,5.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,3.0,5.0,False,False,8.0,4.0,1.0,4.0,5.0,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,True,2141766,False,False,too much attention,very effective,helpful,more effective,much less effective,male,8.0,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,2141742,12.0,0.0,12.0,43.0,3.0,10.0,1.0,40.0,9.0,False,24.0,0.0,24.0,7.0,0.0,7.0,False,12.0,0.0,12.0,12.0,0.0,12.0,12.0,0.0,12.0,3.333333,2.0,,,3.0,5.0,2.0,4.0,5.0,3.0,2.0,3.0,3.0,3.0,3.0,5.0,4.0,4,5,1.0,0.0,0.0,0.0,1.0,-2.0,1.0
4,2141740,male,enough attention,effective,uncommon,very helpful,much more effective,Less effective,female,strongly agree,strongly agree,strongly agree,False,True,7.0,2011,58432,Rural,2.0,-1.332458,Treatment,women officers,2000,9.0,3.77907,2.16,False,True,3.0,5.0,False,False,6.0,4.0,3.0,5.0,4.0,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,True,2141767,False,False,too much attention,very effective,helpful,much more effective,less effective,female,6.0,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,2141740,12.0,0.0,12.0,43.0,3.0,10.0,1.0,40.0,9.0,False,24.0,0.0,24.0,7.0,0.0,7.0,False,12.0,0.0,12.0,12.0,0.0,12.0,12.0,0.0,12.0,3.333333,2.0,,,2.0,4.0,2.0,5.0,5.0,4.0,3.0,4.0,4.0,4.0,3.0,5.0,4.0,5,4,3.0,-1.0,-1.0,1.0,0.0,0.0,0.0


### For causal analysis, will make a subset of this dataset which picks the change variable desired, treatment, and any variables that may be relevant as to decrease bias and determine a causal relationship between treatment and the chosen outcome variable

In [40]:
# add numerical mapping to baseline data
police_baseline_all['b_wcase_num'] = police_baseline_all['b_wcase'].str.lower().map(wcase, na_action='ignore')
police_baseline_all['b_effective_num'] = police_baseline_all['b_effective'].str.lower().map(effective, na_action='ignore')
police_baseline_all['b_helpful_num'] = police_baseline_all['b_helpful'].str.lower().map(helpful, na_action='ignore')
police_baseline_all['b_add_officer_num'] = police_baseline_all['b_add_officer'].str.lower().map(add_officer, na_action='ignore')
police_baseline_all['b_add_female_num'] = police_baseline_all['b_add_female'].str.lower().map(add_female, na_action='ignore')
police_baseline_all['b_female_better_num'] = police_baseline_all['b_female_better'].str.lower().map(female_better, na_action='ignore')

### Exporting cleaned dataframes into csv
- police_baseline_all
- police_station_merge

In [41]:
police_baseline_all.to_csv("police_baseline_all.csv", index=False)
police_station_merge.to_csv("police_station_merge.csv", index=False)