In [1]:
# import libraries and load the survey export
import pandas as pd
import numpy as np

# the export carries a two-row header; read both rows as a column MultiIndex
df = pd.read_csv('./data/finalized/survey.csv', header=[0, 1])
# flatten each pair of header levels into a single "level0_level1" column name
df.columns = ['_'.join(levels) for levels in df.columns]

print(df.shape)

(56, 303)


In [2]:
# give every survey row a sequential id (0..n-1) so that, once the pivots
# further down repeat rows, responses can still be counted distinctly in tableau
df['response_id'] = np.arange(len(df))
print(df['response_id'].head())

0    0
1    1
2    2
3    3
4    4
Name: response_id, dtype: int64


In [3]:
# reshape from one row per response to one row per response per question
# the first 18 columns plus the response id are carried through unchanged
# as identifiers on every pivoted row
id_vars = df.columns[:18].tolist() + ['response_id']

# every other column whose name contains '_Response' becomes a question row
value_vars = [col for col in df.columns if col not in id_vars and '_Response' in col]

print(df.shape)
df = pd.melt(
    df,
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='question',
    value_name='response',
)
print(df.shape)
print(df.head())

(56, 304)
(8064, 21)
  How many total students are currently enrolled in your school(s)?_Response  \
0                                        501 or more                           
1                                        501 or more                           
2                                        501 or more                           
3                                        501 or more                           
4                                         251 to 500                           

  Location of your school(s) (please check all that apply):_Boston  \
0                                                NaN                 
1                                             Boston                 
2                                             Boston                 
3                                                NaN                 
4                                             Boston                 

  Unnamed: 2_level_0_Non-Boston Urban/Gateway City  \
0                      

In [6]:
# number the distinct questions in order of first appearance; the number is
# attached to every row so the data can be sorted by question later
df_questions = (
    df[['question']]
    .drop_duplicates()
    .assign(question_sort=lambda q: np.arange(len(q)))
)
print(df_questions.head())
print(df.shape)
# default (inner) join on the question text adds the question_sort column
df = pd.merge(df, df_questions, on='question')
print(df.shape)

                                              question  question_sort
0    What is the average annual salary of a teacher...              0
56   What is the average annual salary of a teacher...              1
112  What is the average annual salary of a teacher...              2
168  What is the average annual salary of a teacher...              3
224  What is the average annual salary of a teacher...              4
(8064, 21)
(8064, 22)


In [7]:
# pivot location type fields
# The "check all that apply" answer is spread over one column per option.
# Stack those columns into a single `location_type` column and discard the
# rows whose option cell is NaN, so a response that ticked several options is
# repeated once per ticked option.
value_vars = ['Location of your school(s) (please check all that apply):_Boston', 'Unnamed: 2_level_0_Non-Boston Urban/Gateway City', 'Unnamed: 3_level_0_Suburban', 'Unnamed: 4_level_0_Rural']
id_vars = [x for x in df.columns if x not in value_vars]

print(df.shape)
# remember how many distinct responses exist before NaN rows are discarded
responses_before = df['response_id'].nunique()
df = pd.melt(df, id_vars = id_vars, value_vars = value_vars, value_name = 'location_type')
# the source column name is not needed once the option value has been stacked
del df['variable']
df = df.dropna(subset=['location_type'])
print(df.shape)

# A response with none of the options ticked has only NaN rows after the melt,
# so dropna removes it from every question; report that instead of losing it
# silently.
responses_lost = responses_before - df['response_id'].nunique()
if responses_lost:
    print('WARNING: {} response(s) had no location type selected and were dropped entirely'.format(responses_lost))

(8064, 22)
(9072, 19)


In [8]:
# pivot geographic location fields
# The answer is spread over one column per region. Stack those columns into a
# single `geo_location` column and discard the rows whose region cell is NaN,
# so a response with several regions filled in is repeated once per region.
value_vars = ['Geographic location of your school(s):_Boston', 'Unnamed: 6_level_0_Central MA', 'Unnamed: 7_level_0_Eastern MA', 'Unnamed: 8_level_0_Southeastern MA', 'Unnamed: 9_level_0_Western MA']
id_vars = [x for x in df.columns if x not in value_vars]

print(df.shape)
# remember how many distinct responses exist before NaN rows are discarded
responses_before = df['response_id'].nunique()
df = pd.melt(df, id_vars = id_vars, value_vars = value_vars, value_name = 'geo_location')
# the source column name is not needed once the region value has been stacked
del df['variable']
df = df.dropna(subset=['geo_location'])
print(df.shape)

# A response with no region filled in has only NaN rows after the melt, so
# dropna removes it from every question; report that instead of losing it
# silently.
responses_lost = responses_before - df['response_id'].nunique()
if responses_lost:
    print('WARNING: {} response(s) had no geographic location selected and were dropped entirely'.format(responses_lost))

(9072, 19)
(9936, 15)


In [9]:
# pivot grades served fields
# The "check all that apply" answer is spread over one column per school type.
# Stack those columns into a single `grades_served` column and discard the
# rows whose cell is NaN, so a response that ticked several school types is
# repeated once per ticked type.
value_vars = ['Grades currently served by your school(s) (please check all that apply):_K-12 School', 'Unnamed: 12_level_0_Elementary School', 'Unnamed: 13_level_0_Middle School', 'Unnamed: 14_level_0_Elementary/Middle School', 'Unnamed: 15_level_0_Middle/High School', 'Unnamed: 16_level_0_High School']
id_vars = [x for x in df.columns if x not in value_vars]

print(df.shape)
# remember how many distinct responses exist before NaN rows are discarded
responses_before = df['response_id'].nunique()
df = pd.melt(df, id_vars = id_vars, value_vars = value_vars, value_name = 'grades_served')
# the source column name is not needed once the option value has been stacked
del df['variable']
df = df.dropna(subset=['grades_served'])
print(df.shape)

# A response with no school type ticked has only NaN rows after the melt, so
# dropna removes it from every question; report that instead of losing it
# silently.
responses_lost = responses_before - df['response_id'].nunique()
if responses_lost:
    print('WARNING: {} response(s) had no grades served selected and were dropped entirely'.format(responses_lost))

(9936, 15)
(15840, 10)


In [10]:
# clean question strings
# Remove the '_Response' marker left over from flattening the two-level
# header. regex=False forces a literal substring replacement, so the result
# does not depend on the pandas version's default for `regex` (True before
# pandas 2.0, False from 2.0 onwards).
df['question'] = df['question'].str.replace('_Response', '', regex=False)

In [12]:
# order the rows by the question sort key assigned earlier
df = df.sort_values(by='question_sort')

In [13]:
# write the reshaped survey to disk, leaving the DataFrame index out of the file
df.to_csv(path_or_buf='./data/finalized/survey_finalized.csv', index=False)