# get_agency_data.py
import json
import sqlite3

import numpy as np
import pandas as pd
from scipy import stats

# Local SQLite database built in the earlier data-collection steps.
sqlite_db = 'miniproject_db.sqlite'
conn = sqlite3.connect(sqlite_db)
def get_mhsa_ids_for_state(focus_state):
    '''
    Return the MHSA ids that have agencies in the given state,
    together with the total number of agencies found.
    '''
    query = """
        SELECT MHSA_idx, count(1) AS number_of_agencies
        FROM agency_hitlist
        WHERE state_abbr = ?
        GROUP BY MHSA_idx;
        """
    grouped_MHSA_ids = pd.read_sql(query, con=conn, params=(focus_state,))
    my_mhsa_ids = list(grouped_MHSA_ids['MHSA_idx'].values)
    return my_mhsa_ids, grouped_MHSA_ids['number_of_agencies'].sum()
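

# Usage sketch (an illustration only; 'CO' is an assumed state abbreviation and
# the call assumes the agency_hitlist table has been populated):
#   mhsa_ids, n_agencies = get_mhsa_ids_for_state('CO')
#   # mhsa_ids -> list of MHSA_idx values, n_agencies -> total agency count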
def get_agency_data(my_mhsa_ids):
    '''
    Go through all the saved .json files and pool the crime data.
    Builds a list of dictionaries, one per MHSA id from the initial shortlist,
    and also calculates the p-value from a t-test comparing total crime counts
    before and after the withdrawn year. Returns the result as a DataFrame.
    '''
    agency_grouped_crime_data = []
    for mhsa_id in my_mhsa_ids:
        # Supporting info for every agency that reports under this MHSA id.
        query = """
            SELECT * FROM agency_data_extraction_sup_info
            WHERE mhsa_id = ?;
            """
        agency_sup_info = pd.read_sql(query, con=conn, params=(int(mhsa_id),))

        agencies = []
        crime_tots = pd.DataFrame()  # fresh data frame for this MHSA id
        for _, agency in agency_sup_info.iterrows():
            ref_from_year = agency.data_from_year
            ref_to_year = agency.data_to_year
            withdrawn_year = agency.withdrawn_year
            filename = agency.filename
            agency_name = agency.agency_name
            agencies.append(agency_name)
            if filename != 'none':
                with open('./data/%s' % filename, 'r') as file:
                    agency_crime = json.load(file)
                agency_crime_df = pd.DataFrame(agency_crime['data'])
                # Total crime per year = sum over all offence columns.
                agency_crime_df['tot_crime'] = (
                    agency_crime_df.drop(['data_year'], axis=1).astype(int).sum(axis=1)
                )
                # Keep only the years inside this agency's reference window.
                df_slice = agency_crime_df[
                    (agency_crime_df['data_year'] >= ref_from_year)
                    & (agency_crime_df['data_year'] <= ref_to_year)
                ]
                crime_tots = pd.concat([crime_tots, df_slice[['data_year', 'tot_crime']]], axis=0)
                # When data for a year is missing we effectively assume a crime
                # count of 0; needs revision later.

        if len(crime_tots) != 0:
            # Collective counts per year; summing across agencies also gets
            # around the problem of empty data entries.
            grouped_crime_tots = crime_tots.groupby('data_year').sum()
            if withdrawn_year <= grouped_crime_tots.index.max():
                group_a = grouped_crime_tots[grouped_crime_tots.index.values < withdrawn_year].tot_crime.values
                group_b = grouped_crime_tots[grouped_crime_tots.index.values >= withdrawn_year].tot_crime.values
                ave_counts_group_a = int(np.mean(group_a)) if group_a.size > 0 else 0
                ave_counts_group_b = int(np.mean(group_b)) if group_b.size > 0 else 0
                # SciPy returns nan if either group has fewer than two observations.
                t_statistic, p_value = stats.ttest_ind(group_a, group_b)
                mydict = {'mhsa_id': mhsa_id,
                          'agencies': agencies,
                          'withdrawn_year': withdrawn_year,
                          'ave_counts_group_a': ave_counts_group_a,
                          'ave_counts_group_b': ave_counts_group_b,
                          'p-value': p_value,
                          'data_year': list(grouped_crime_tots.index.values),
                          'tot_crime': list(grouped_crime_tots.tot_crime.values)}
            else:
                # No data after the withdrawn year, so no before/after comparison.
                mydict = {'mhsa_id': mhsa_id,
                          'agencies': agencies,
                          'withdrawn_year': withdrawn_year,
                          'ave_counts_group_a': None,
                          'ave_counts_group_b': None,
                          'p-value': None,
                          'data_year': [],
                          'tot_crime': []}
            agency_grouped_crime_data.append(mydict)

    agcd = pd.DataFrame(agency_grouped_crime_data)
    return agcd
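

# Minimal usage sketch (an illustration, not part of the original pipeline):
# it assumes miniproject_db.sqlite and the ./data JSON files are in place,
# and that 'CO' is a state abbreviation present in agency_hitlist.
if __name__ == '__main__':
    mhsa_ids, n_agencies = get_mhsa_ids_for_state('CO')
    print('%i agencies across %i MHSA ids' % (n_agencies, len(mhsa_ids)))
    results = get_agency_data(mhsa_ids)
    print(results[['mhsa_id', 'withdrawn_year', 'p-value']].head())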