In [2]:
#setup
import json
import sqlite3
import re

In [3]:
# Open (or create) the SQLite database file and grab a cursor for all statements below.
conn = sqlite3.connect('gsi.sqlite')
cur = conn.cursor()

# The schema is rebuilt from scratch on every run: each table is dropped if it
# exists and then recreated, so re-running this cell starts with empty tables.
#   Projects  - one row per project (name, grant program id, amount, approval year)
#   Retrofits - one row per retrofit name
#   Grants    - lookup table of grant programs
#   Joint     - links a project to a retrofit, with a number for that pair
schema_sql = '''
DROP TABLE IF EXISTS Projects;
DROP TABLE IF EXISTS Retrofits;
DROP TABLE IF EXISTS Joint;
DROP TABLE IF EXISTS Grants;
CREATE TABLE Projects (
    id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
    pname TEXT UNIQUE,
    grant_id INTEGER,
    grant_amount INTEGER,
    approve_yr INTEGER
);
CREATE TABLE Retrofits (
    id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
    rname TEXT UNIQUE
);
CREATE TABLE Grants (
    id INTEGER NOT NULL PRIMARY KEY UNIQUE,
    gname TEXT UNIQUE
);
CREATE TABLE Joint (
    number INTEGER,
    project_id INTEGER,
    retro_id INTEGER,
    UNIQUE (project_id, retro_id)
)'''
cur.executescript(schema_sql)

# Seed the grant-program lookup rows as (id, gname) pairs.
grants = [(0,'SMIP'),(1,'GARP')]
insert_grant_sql = 'INSERT INTO Grants (id, gname) VALUES (?, ?)'
cur.executemany(insert_grant_sql, grants)

# Ask which GeoJSON file to load; an empty answer selects the default file.
fname = input('Enter file name: ')
if not fname:
    fname = 'GSI_Private_Projects_Retrofit.geojson'

# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which differs between systems (GeoJSON files are expected to be
# UTF-8). json.load parses straight from the open file object.
with open(fname, encoding='utf-8') as geojson_file:
    dict_data = json.load(geojson_file)


# Walk every GeoJSON feature and load its properties into the Projects,
# Retrofits and Joint tables, then commit once at the end.
for feature in dict_data['features']:
    project = feature['properties']

    # Records without an approval date are skipped entirely.
    if project['APPROVALDATE'] is None:
        continue

    # Handle missing name fields: prefer NAME, fall back to PROJECTNAME, and
    # report and skip the record when neither is present.
    if project['NAME'] is not None:
        pname = project['NAME']
    elif project['PROJECTNAME'] is not None:
        pname = project['PROJECTNAME']
    else:
        print('Error in Record: ', project['TRACKINGNUMBER'])
        continue

    grant_amount = project['GRANTAMOUNT']

    # The approval year is the first four characters of APPROVALDATE.
    approve_yr = int(project['APPROVALDATE'][:4])

    # Retrofit names are taken from the property keys at position 11 onward.
    # NOTE(review): this relies on the key order of the input file - confirm
    # that the first 11 properties are always the non-retrofit fields.
    retrofit_names = list(project.keys())[11:]

    # A flag value of -1 marks the grant program; the ids used here are the
    # ones seeded into the Grants table (0 = SMIP, 1 = GARP).
    if project['SMIP'] == -1:
        grant_id = 0
    elif project['GARP'] == -1:
        grant_id = 1
    else:
        grant_id = None

    # pname is UNIQUE, so a project whose name already exists is not inserted
    # again; the SELECT then returns the id of the existing row.
    cur.execute(
        '''INSERT OR IGNORE INTO Projects (pname, grant_id, grant_amount, approve_yr)
        VALUES (?, ?, ?, ?)''',
        (pname, grant_id, grant_amount, approve_yr),
    )
    cur.execute('SELECT id FROM Projects WHERE pname = ?', (pname,))
    project_id = cur.fetchone()[0]

    # Register every retrofit name; names already present are ignored.
    cur.executemany(
        'INSERT OR IGNORE INTO Retrofits (rname) VALUES (?)',
        [(name,) for name in retrofit_names],
    )

    for name in retrofit_names:
        number = project[name]
        # A JSON null loads as None, and `None > 0` raises TypeError, so treat
        # a missing value like a non-positive one and skip it.
        if number is None or not (number > 0):
            continue

        cur.execute('SELECT id FROM Retrofits WHERE rname = ?', (name,))
        retro_id = cur.fetchone()[0]

        # Create the many-to-many join row; an existing (project_id, retro_id)
        # pair is replaced with the new number.
        cur.execute(
            '''INSERT OR REPLACE INTO Joint (number, project_id, retro_id)
            VALUES (?, ?, ?)''',
            (number, project_id, retro_id),
        )

conn.commit()


Enter file name: 


In [7]:
# Per grant program: total dollars granted and number of projects funded.
sqlstmt = '''SELECT Grants.gname, SUM(Projects.grant_amount), COUNT(Projects.pname)
FROM Grants JOIN Projects ON Grants.id = Projects.grant_id
GROUP BY Grants.gname'''

# Each result row is (program name, summed amount, project count).
for gname, total_amount, project_count in cur.execute(sqlstmt):
    print('For %s grant: %d dollars allocated to %d projects' % (gname, total_amount, project_count))

For GARP grant: 39753820 dollars allocated to 57 projects
For SMIP grant: 28774888 dollars allocated to 74 projects


In [8]:
# Same query as before, written with single-letter table aliases
# (Grants as A, Projects as B). This notation shows up often, including in
# interviews, so it is worth being comfortable reading it.
sqlstmt = '''SELECT A.gname, SUM(B.grant_amount), COUNT(B.pname)
FROM Grants A JOIN Projects B ON A.id = B.grant_id
GROUP BY A.gname'''

# Every row has exactly the three selected columns, so the row tuple can be
# fed to the format string directly.
summary_line = 'For %s grant: %d dollars allocated to %d projects'
for row in cur.execute(sqlstmt):
    print(summary_line % row)

For GARP grant: 39753820 dollars allocated to 57 projects
For SMIP grant: 28774888 dollars allocated to 74 projects


In [14]:
import pandas as pd

In [16]:
# Look at the Grants table directly to get an idea of its structure.
query = 'SELECT * FROM GRANTS LIMIT 10'
df = pd.read_sql_query(sql=query, con=conn)
# The table turns out to be tiny: only 2 rows and 2 columns.
df.head(10)

Unnamed: 0,id,gname
0,0,SMIP
1,1,GARP


In [41]:
# Look at the Projects table directly to get an idea of its structure.
query = 'SELECT * FROM PROJECTS'
df = pd.read_sql_query(sql=query, con=conn)
# Many rows and columns here, so print a preview followed by summary statistics.
print(df.head(10))
print(df.describe())
df.shape[0]  # row count: 155 rows

   id                                        pname  grant_id  grant_amount  \
0   1                             Weavers Way Coop       NaN           NaN   
1   2                               CATCH Retrofit       NaN           NaN   
2   3                     Pennypack Woods Retrofit       NaN           NaN   
3   4                     Community Legal Services       NaN           NaN   
4   5                  TVPV Stormwater Credit App.       NaN           NaN   
5   6                           2800 Black Lake Pl       NaN           NaN   
6   7  1518 Cambridge Street to 1521 Poplar Street       NaN           NaN   
7   8       Lafayette Redeemer Stormwater Retrofit       NaN           NaN   
8   9                       2150 E Westmoreland St       NaN           NaN   
9  10           St James Episcopal Church Retrofit       NaN           NaN   

   approve_yr  
0        2011  
1        2010  
2        2010  
3        2013  
4        2011  
5        2011  
6        2015  
7        2011

155

In [46]:
# Show only the projects that have a grant amount recorded.
#
# Note: `df.dropna(thresh=2)` is not the tool for this. `thresh=2` keeps every
# row that has at least two non-NA values, so rows missing only grant_id and
# grant_amount survive, and a bare `df.dropna(...)` call has no effect unless
# its result is assigned or displayed.
#
# A boolean mask names the column explicitly; NaN > 0 evaluates to False, so
# rows with a missing grant_amount are excluded along with non-positive ones.
df[df['grant_amount'] > 0]

Unnamed: 0,id,pname,grant_id,grant_amount,approve_yr
13,14,Pennypack Woods Homeowners Association Stormwater,0.0,135500.0,2013
16,17,13410- 13420 Damar Drive,0.0,255000.0,2014
17,18,"GSFS, Green Street Friends School Retrofit",0.0,91080.0,2013
18,19,1148 Wharton Street,0.0,79500.0,2014
19,20,Methodist Home Rain Gardens,0.0,70000.0,2013
...,...,...,...,...,...
147,150,PEER GARP/OVERBROOK PRESBYTERIAN CHURCH STORMW...,1.0,475525.0,2019
148,152,2001 W Lehigh Ave - Site 240,1.0,1123200.0,2019
149,153,2230 Castor Avenue,1.0,1555000.0,2019
152,156,Eastern State Penitentiary - Phase I,0.0,117750.0,2019


# How much money was given out by each grant program?

In [47]:
# How much money was given out by each grant program?
sqlstmt = '''SELECT Grants.gname, SUM(Projects.grant_amount), COUNT(Projects.pname)
FROM Grants JOIN Projects ON Grants.id = Projects.grant_id
GROUP BY Grants.gname'''

# Pull all result rows first, then report one line per grant program.
program_rows = cur.execute(sqlstmt).fetchall()
for program, dollars, n_projects in program_rows:
    print('For %s grant: %d dollars allocated to %d projects' % (program, dollars, n_projects))



For GARP grant: 39753820 dollars allocated to 57 projects
For SMIP grant: 28774888 dollars allocated to 74 projects


In [None]:
# I want to replace grant_id (or add new column) with program name