In [1]:
# import all the libraries

# Data exploration and analysis tools
import pandas as pd
import seaborn as sns
import numpy as np
from ast import literal_eval
import re as re

In [2]:
# Read the raw San Francisco City Survey export; the relative path is resolved
# against the notebook's working directory.
survey_data = pd.read_csv(filepath_or_buffer='San_Francisco_City_Survey_Data_1996-2017.csv')

In [3]:
# Re-wrap the loaded table as a DataFrame under the name the rest of the
# notebook uses (pd.read_csv already returns a DataFrame).
survey_df = pd.DataFrame(survey_data)

In [4]:
# Keep the full column index around; it is later used to work out which
# columns to discard.
total_columns = survey_df.columns

# Column count vs. number of distinct column names: equal values mean there
# are no duplicated column labels.
print(len(total_columns))
print(total_columns.nunique())

92
92


In [5]:
# Data dictionary for the survey data: column name -> human-readable description.
column_names = dict(
    id='Unique id',
    year='Survey year',
    mode='survey mode',
    language='survey language',
    dlivedsf='Length of SF residence 1996-2009 (Groupings change in 2011)',
    primlang_1='primary language 1',
    primlang_2='primary language',
    primlang_3='primary language',
    primlang_4='primary_language',
    dage='Respondents age group (Age groups change in 2011, 2017)',
    dethnic='Respondents ethnicity',
    mixed_1='mixed race or ethnics',
    mixed_2='mixed race or ethnics',
    mixed_3='mixed race or ethnics',
    mixed_4='mixed race or ethnics',
    deduc='Respondents highest education completed',
    dincome='Household income year prior to survey',
    dhouse='Number of people in household',
    ownrenhm='Own or rent home',
    gender='Respondents sex',
    dsexornt='Respondents sexual orientation',
    zipcode='zipcode',
    district='Supervisorial District',
    movesf='Likelihood of moving away from SF in the next 3 years',
    disablephys='physically disabled',
    disablement='mentally disabled',
)

# Labels for the numeric survey-mode codes (codes start at 1).
mode_dict = dict(enumerate(['phone', 'mail', 'web/phone', 'web/mail'], start=1))

# Labels for the numeric survey-language codes (codes start at 1).
language_dict = dict(enumerate(['English', 'Spanish', 'Chinese', 'Tagalog'], start=1))

# Labels for the numeric length-of-residence codes.
dividedsf_dict = {
    1: 'Less than 2 years',
    2: '3-5 years',
    3: '6-10 years',
    4: '11-20 years',
    5: '21-30 years',
    6: 'more than 30 years',
    7: 'Dont know/No answer',
}

# These labels appear to belong to the `dlivedsf` column, and the other lookups
# in this cell are named `<column>_dict`. Expose the correctly spelled name as
# an alias; the original (misspelled) name is kept so existing references
# continue to work.
dlivedsf_dict = dividedsf_dict

# Labels for the numeric age-group codes; the parenthesised notes record how
# the groupings differ for the 2017 survey.
dage_dict = dict(enumerate([
    '18-24',
    '25-34',
    '35-44',
    '45-54',
    '55-64 (2017 = 55-59)',
    '65+ (2017=60-64)',
    '65+ (2017)',
    'No answer',
], start=1))

# Labels for the numeric ethnicity codes, listed in code order starting at 1.
_dethnic_labels = (
    'Black/African American',
    'Asian or Pacific Islander',
    'Latino/Hispanic',
    'Native American/Indian',
    'White/Caucasian',
    'Other',
    'Mixed Ethnicity',
    'Dont know',
    'Refused/No answer',
    'Pacific Islander',
    'Arab / Middle Eastern /North African ( 2015 Only); Arab,Middle Eastern, South Asian (2017)',
    'Mixed Unspecified',
    'Caribbean (2017)',
)
dethnic_dict = {code: label for code, label in enumerate(_dethnic_labels, start=1)}

# Labels for the numeric highest-education codes (codes start at 1).
deduc_dict = dict(enumerate([
    'Less than high school',
    'High school',
    'Less than 4 years of college',
    '4 or more years of college/Post Graduate',
    'No answer',
], start=1))

# Labels for the numeric household-income codes. Code 9 is the combined
# bracket labelled for 1996-2013; codes 3 and 4 are labelled "2015+".
dincome_dict = dict(zip(
    range(1, 10),
    (
        '$10,000 or less',
        '$10,001 to $25,000',
        '$25,001 to $35,000 (2015+)',
        '$35,001 to $50,000 (2015+)',
        '$50,001 to $100,000',
        '$100,001 to $200,000',
        'Over $200,000',
        'Refused',
        '$25,000-$49,999 (1996-2013)',
    ),
))

# Labels for the numeric household-size codes (codes start at 1).
dhouse_dict = dict(enumerate([
    '1 (just me)',
    '2',
    '3',
    '4',
    '5 or more',
    '6 or more (2015, 2017)',
    'No answer/refused',
], start=1))

# Labels for the numeric gender codes (codes start at 1).
gender_dict = dict(enumerate(['Female', 'Male', 'Other', 'No answer/Refused'], start=1))

# Labels for the numeric "likelihood of moving away from SF" codes.
movesf_dict = dict(enumerate([
    'Very likely',
    'Somewhat likely',
    'Not too likely',
    'Not at all likely',
    'Dont know/No answer',
], start=1))

In [6]:
# Survey bookkeeping fields kept for the analysis.
survey_info = [
    'id',
    'year',
    'mode',
    'language',
]

# Respondent demographic fields kept for the analysis.
demographics = [
    'dlivedsf',
    'dage',
    'dethnic',
    'deduc',
    'dincome',
    'dhouse',
    'gender',
    'zipcode',
    'movesf',
]

# Every column that survives cleaning, bookkeeping fields first.
active_columns = [*survey_info, *demographics]

# Sanity-check the group sizes, then show the combined list.
for column_group in (survey_info, demographics, active_columns):
    print(len(column_group))

print(active_columns)

4
9
13
['id', 'year', 'mode', 'language', 'dlivedsf', 'dage', 'dethnic', 'deduc', 'dincome', 'dhouse', 'gender', 'zipcode', 'movesf']


In [7]:
# Every column of the raw frame that is not in the keep-list, in the frame's
# original column order.
discard_columns = [
    column for column in total_columns
    if column not in active_columns
]


In [8]:
# Show which columns are about to be dropped, as a visual check of the keep-list.
print(discard_columns)

['finweigh', 'general', 'stpvnbrd', 'swcndnbrd', 'cityligh', 'wtrswr', 'water', 'sewer', 'swclnbrd', 'stclnbrd', 'stswc', 'recparsy', 'parkvis', 'athfield', 'recpart', 'parkfa', 'recpract', 'parkgr', 'parklp', 'parkcl', 'libsystm', 'libmai', 'libbra', 'libonlin', 'libbo', 'onlibsvc', 'netlibsvc', 'libsta', 'conmalib', 'connelib', 'trspwlk', 'trsppub', 'trspbike', 'trsptaxi', 'trspdaln', 'trspcpl', 'trspptns', 'trspuber', 'munovr', 'mun12mth', 'muntim', 'muncle', 'munsaf', 'munmgmtcro', 'muncou', 'safdnb', 'safnnb', 'hrd311', 'cont311', 'use311sv', 'info311', 'infowebm', 'svc311', 'svcwebmo', 'phone311', 'web311', 'kids', 'k0to5', 'k6to18', 'kprisch', 'kpubsch', 'knosch', 'kqschl', 'ownrenhm', 'dsexornt', 'primlang_1', 'primlang_2', 'primlang_3', 'primlang_4', 'disablephys', 'disablement', 'district', 'mixed_1', 'mixed_2', 'mixed_3', 'mixed_4', 'issue1', 'issue2', 'issue3']


In [9]:
# Survey years to keep. `year` is stored as int64, so the filter values must be
# integers: pandas treats strings and integers as distinct in `isin`, and
# string years such as '2017' match no rows on current pandas versions.
recent_years = [2015, 2016, 2017, 2018]

# Build the cleaned frame from the raw one without modifying `survey_df`:
# `drop` returns a new frame, which is then restricted to the recent years.
survey_df_clean = (
    survey_df
    .drop(columns=discard_columns)
    .loc[lambda frame: frame['year'].isin(recent_years)]
)

In [10]:
# Summarise dtypes and non-null counts of the filtered frame to spot columns
# with missing values.
survey_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4345 entries, 16699 to 37971
Data columns (total 13 columns):
id          4345 non-null int64
year        4345 non-null int64
mode        4345 non-null float64
language    4345 non-null float64
dhouse      4345 non-null float64
dlivedsf    4345 non-null float64
movesf      4345 non-null float64
dincome     4345 non-null float64
dage        4345 non-null float64
gender      4345 non-null float64
dethnic     4345 non-null float64
deduc       4345 non-null float64
zipcode     2166 non-null float64
dtypes: float64(11), int64(2)
memory usage: 475.2 KB


In [14]:
# Flag rows that have a missing value in at least one column.
has_missing = survey_df_clean.isna().any(axis='columns')

# Rows with any missing value, kept separately for inspection.
survey_df_clean_null = survey_df_clean.loc[has_missing]

# Fully populated rows only; `dropna` returns a new frame, so
# `survey_df_clean` itself is left untouched.
survey_df_clean_value = survey_df_clean.dropna()


In [15]:
# Which survey years do the rows with missing values come from?
survey_df_clean_null['year'].unique()

array([2015])

In [16]:
# Which survey years do the fully populated rows come from?
survey_df_clean_value['year'].unique()

array([2017])

In [17]:
# Number of distinct ZIP codes among the fully populated rows.
survey_df_clean_value['zipcode'].nunique()

31

In [18]:
# I can only use 2017 survey data: per the recorded outputs of the year checks,
# every row with a missing value is from 2015 and every complete row is from 2017.

# Confirm that the complete-rows frame has no nulls left in any column.
survey_df_clean_value.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2166 entries, 16699 to 37971
Data columns (total 13 columns):
id          2166 non-null int64
year        2166 non-null int64
mode        2166 non-null float64
language    2166 non-null float64
dhouse      2166 non-null float64
dlivedsf    2166 non-null float64
movesf      2166 non-null float64
dincome     2166 non-null float64
dage        2166 non-null float64
gender      2166 non-null float64
dethnic     2166 non-null float64
deduc       2166 non-null float64
zipcode     2166 non-null float64
dtypes: float64(11), int64(2)
memory usage: 236.9 KB


In [21]:
# Translate the numeric `mode` codes into their text labels via `mode_dict`.
# Note: the result is a single column (a Series), despite the `_df` suffix;
# codes without an entry in `mode_dict` become NaN.
survey_mode_df = survey_df_clean_value['mode'].map(mode_dict)

In [None]:
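# Export of the cleaned 2017 rows is currently disabled; uncomment the line below to write the CSV.
# NOTE: the target is an absolute, machine-specific path; prefer a path relative to the notebook before re-enabling.
#survey_df_clean_value.to_csv(path_or_buf='/Users/lizchan/ds_foundations/final_project/survey_clean_2017.csv')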