In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import re
import pandas as pd
import numpy as np
import os
import dill
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from datetime import date

In [2]:
def convert_pdf_to_txt(path):
    """Return the text that pdfminer extracts from the PDF file at ``path``.

    Parameters
    ----------
    path : str
        Location of the PDF file; it is opened in binary mode.

    Returns
    -------
    str
        Everything the ``TextConverter`` wrote while the pages were processed.

    Raises
    ------
    OSError
        If ``path`` cannot be opened. Errors raised by pdfminer propagate
        unchanged; the file, converter and buffer are still closed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            # Empty page set, maxpages=0 and an empty password are the same
            # values the notebook always passed (no page restriction requested).
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the finally-block closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

In [3]:
def pull_top_needs(text):
    """Extract the three bullet items that follow a "three most ...:" heading.

    Parameters
    ----------
    text : str
        Text to search (e.g. the output of ``convert_pdf_to_txt``).

    Returns
    -------
    list of tuple of str
        One 3-tuple per match, as produced by ``re.findall``. The list is
        empty when nothing matches; it is never ``None``. Captured items keep
        any leading space that followed the bullet.
    """
    # Raw strings: the previous plain string relied on invalid escapes such as
    # '\d' and '\.', which newer Python versions warn about. The compiled
    # pattern is unchanged ('\n' is now interpreted by `re` instead of Python).
    item = r'[\n•]+([A-Za-z\(\)\-&,’\. ]+)'
    pattern = r'three most[A-Za-z\d\n\. ]+:' + item + item + item
    return re.findall(pattern, text)

In [4]:
# Build `needs`: key = first two characters of the folder name + the two
# characters that precede the 4-character file extension; value = list of the
# 3-tuples extracted from every statement found for that key.
# NOTE(review): the folder name is assumed to end in a 4-digit year and the
# file name in "<2 chars>.<3-char extension>" -- confirm against the data.
# NOTE(review): absolute, machine-specific path; the rest of the notebook uses
# the relative 'static/...' folder.
needs=dict()
path=r'C:\Users\Gokmen Oz\SeamSocialLabs\static\statements'
for boro_year in os.listdir(path):
    boro=boro_year[:2]
    year=int(boro_year[-4:])
    if year>2016:
        year_dir=os.path.join(path,boro_year)
        for dist in os.listdir(year_dir):
            top3=pull_top_needs(convert_pdf_to_txt(os.path.join(year_dir,dist)))
            # re.findall returns an empty list (never None) when nothing
            # matches, so test truthiness to make the placeholder reachable.
            if not top3:
                top3=[(0,0,0)]
            # setdefault also covers the first statement of a district, where
            # the old placeholder branch would have raised KeyError.
            needs.setdefault(boro+dist[-6:-4],[]).extend(top3)

KeyboardInterrupt: 

In [2]:
# Persist `needs`; the with-block guarantees the file is flushed and closed.
with open('static/needs.pkd','wb') as needs_file:
    dill.dump(needs,needs_file)

NameError: name 'needs' is not defined

In [5]:
# Reload the cached `needs` object; the with-block closes the file handle.
with open('static/needs.pkd','rb') as needs_file:
    needs=dill.load(needs_file)

In [6]:
# Labels used for the 0/1 encoding of the extracted needs; the position of a
# label in this list is its slot in every encoded vector.
dist_needs=['Affordable housing','Neighborhood preservation','Traffic','Air Quality','Business opportunities',
           'Commercial development','Connecting local residents to the jobs that are being created','Crime',
           'Cultural facilities','Emergency response','Health care services','Parks','Police-community relations',
           'resiliency infrastructure','Public health','Quality of life issues','Resiliency','Schools','Senior services',
           'Social services','Street conditions','Street flooding','Transit','Trash removal','Unemployment',
           'Youth and children’s services']
# Round-trip through disk so other code can load the same list; with-blocks
# close the handles that were previously left open.
with open('static/dist_needs.pkd','wb') as dist_needs_file:
    dill.dump(dist_needs,dist_needs_file)
with open('static/dist_needs.pkd','rb') as dist_needs_file:
    dist_needs=dill.load(dist_needs_file)

In [7]:
# Encode every cell of `needs` as a 0/1 list with one slot per `dist_needs`
# label. `needs` must be a DataFrame here (its columns are iterated); the
# conversion from the dict built earlier is not shown in this notebook.
def _encode_needs(entry):
    """Return a 0/1 list aligned with ``dist_needs`` for one cell of ``needs``.

    A slot is 1 when its label is a substring of any string in ``entry``.
    Members that are not strings (e.g. numeric placeholders) never match
    instead of raising ``TypeError``.
    """
    return [
        int(any(isinstance(text, str) and need in text for text in entry))
        for need in dist_needs
    ]

# Build a new frame instead of aliasing `needs` and writing through chained
# indexing: `needs` stays intact and the result does not depend on whether
# pandas hands back a view or a copy. Missing cells are left missing.
needs_encoded=pd.DataFrame(
    {col: needs[col].map(_encode_needs, na_action='ignore') for col in needs.columns},
    columns=needs.columns,
)

In [86]:
# Persist the encoded needs; the with-block flushes and closes the file.
with open('static/needs_encoded.pkd','wb') as needs_encoded_file:
    dill.dump(needs_encoded,needs_encoded_file)

In [8]:
# Reload the cached encoded needs; the with-block closes the file handle.
with open('static/needs_encoded.pkd','rb') as needs_encoded_file:
    needs_encoded=dill.load(needs_encoded_file)

In [9]:
# Load the district geometry data. This file is not produced by this notebook;
# dill, like pickle, can execute code while loading, so only load trusted files.
with open('static/districtgeometries.pkd','rb') as geometries_file:
    districtgeometries=dill.load(geometries_file)

In [10]:
# Wrap the loaded geometry object in a DataFrame so the column operations in
# the following cells ('cd', 'lat', 'lon') can be applied to it.
districtgeometries = pd.DataFrame(data=districtgeometries)

In [11]:
# Swap rows and columns of the encoded needs before they are joined onto the
# geometry frame. NOTE: re-running this cell flips the frame back again.
needs_encoded = needs_encoded.transpose()

In [12]:
# Translate numeric district codes into "<borough><2-digit number>" labels:
# 100*j+i -> d[j] + zero-padded i, for j in 1..5 and i in 1..20 (101 -> 'MN01').
d={1:'MN',2:'BX',3:'BK',4:'QN',5:'SI'}
districtgeometries['cd']=districtgeometries['cd'].astype(str)
# One lookup table applied once to the 'cd' column replaces the previous 100
# whole-frame replace() passes. Codes without an entry are left unchanged.
cd_labels={str(100*j+i):d[j]+"{:02d}".format(i) for j in range(1,6) for i in range(1,21)}
districtgeometries['cd']=districtgeometries['cd'].map(lambda code:cd_labels.get(code,code))

In [13]:
# Collapse to one row per 'cd' label, summing 'lat' and 'lon' within each
# group; every other column is dropped by this aggregation.
# NOTE(review): the next cell calls sum()/len() on each cell, so the cells are
# presumably sequences that 'sum' concatenates -- confirm.
coordinate_aggregation = {'lat': 'sum', 'lon': 'sum'}
districtgeometries = districtgeometries.groupby('cd').agg(coordinate_aggregation)

In [14]:
# Replace each district's sequence of 'lat' / 'lon' values with its arithmetic
# mean (sum divided by count).
for coordinate in ('lat', 'lon'):
    averaged = [sum(points) / len(points) for points in districtgeometries[coordinate]]
    # dtype=object: assigning cell by cell left the column as object dtype, so
    # the same dtype is kept here.
    districtgeometries[coordinate] = pd.Series(averaged, index=districtgeometries.index, dtype=object)


In [15]:
# Standardise both coordinates: subtract the column mean and divide by the
# column standard deviation (pandas' default, i.e. sample std). The four
# statistics stay available as module-level names.
lat_mean, lon_mean = districtgeometries['lat'].mean(), districtgeometries['lon'].mean()
lat_std, lon_std = districtgeometries['lat'].std(), districtgeometries['lon'].std()
districtgeometries['lat'] = districtgeometries['lat'].sub(lat_mean).div(lat_std)
districtgeometries['lon'] = districtgeometries['lon'].sub(lon_mean).div(lon_std)

In [16]:
# Attach the encoded needs to the coordinates by index label (left join, the
# pandas default), then discard every row that has any missing value.
X_train = districtgeometries.join(needs_encoded, how='left').dropna(axis=0, how='any')

In [150]:
# Persist the training frame; the with-block flushes and closes the file.
with open('static/X_train.pkd','wb') as x_train_file:
    dill.dump(X_train,x_train_file)

In [27]:
# Reload the cached training frame; the with-block closes the file handle.
with open('static/X_train.pkd','rb') as x_train_file:
    X_train=dill.load(x_train_file)

In [29]:
# Split the frame by position: every column except the last is a feature,
# the last column alone is the target (kept 2-D by the slice).
feature_frame = X_train.iloc[:, :-1]
target_frame = X_train.iloc[:, -1:]
X, Y = feature_frame.values, target_frame.values

In [30]:
from functools import reduce

def extend(x,y):
    """Concatenate ``x`` and ``y`` into a new list.

    An argument whose type is exactly ``list`` contributes its elements; any
    other value (scalars, tuples, arrays, ...) contributes itself as a single
    element.
    """
    left = x if type(x) == list else [x]
    right = y if type(y) == list else [y]
    return left + right

def convert_from_array_to_list(X):
    """Flatten each row of ``X`` one level into a plain Python list.

    Parameters
    ----------
    X : iterable of iterables
        Rows whose members are either ``list`` objects or other values.

    Returns
    -------
    list of list
        One list per row. Members whose type is exactly ``list`` are spliced
        in element by element; every other member is kept as a single element.
        An empty row yields ``[]`` and a row holding one non-list value yields
        a one-element list (both used to raise ``TypeError``).
    """
    res=[]
    for x in X:
        # Single pass per row instead of reduce() with repeated list
        # concatenation, which copied the growing list at every step.
        flat=[]
        for member in x:
            if type(member)==list:
                flat.extend(member)
            else:
                flat.append(member)
        res.append(flat)
    return res

# Flatten the feature and target arrays into plain nested lists, the form the
# regressor is fitted on below.
X_list, Y_list = (convert_from_array_to_list(rows) for rows in (X, Y))

In [32]:
# Fit a 5-nearest-neighbour regressor on the flattened features and targets.
# (Despite its name, `classifier` holds a KNeighborsRegressor.)
classifier = KNeighborsRegressor(n_neighbors=5).fit(X_list, Y_list)
classifier

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

In [33]:
# Roll the fitted model forward one year at a time, from 2021 to 2024.
# Column '2017' is removed first so that every remaining column is used as a
# feature for the first prediction.
# NOTE: this cell mutates X_train and cannot be re-run as is -- '2017' (and
# the other dropped columns) no longer exist after the first run.
X_train=X_train.drop(['2017'],axis=1)
X_list=convert_from_array_to_list(X_train.values)
for y in range(2021,2025):
    # Predict from the current feature window; the result becomes column str(y).
    Y_new=classifier.predict(X_list)
    # Slide the window: remove the column labelled three years before `y`,
    # then append the prediction as the newest column.
    X_train=X_train.drop([str(y-3)],axis=1)
    X_train[str(y)]=convert_from_array_to_list(Y_new)
    # Rebuild the flattened features for the next iteration.
    X_list=convert_from_array_to_list(X_train.values)
# Display the last prediction (the values stored in column '2024').
Y_new

array([[0.8, 0.2, 0. , ..., 0. , 0. , 0. ],
       [1. , 0. , 0. , ..., 0.2, 0. , 0. ],
       [1. , 0. , 0. , ..., 0.2, 0. , 0. ],
       ...,
       [0.2, 0. , 0.8, ..., 0. , 0. , 0. ],
       [0.2, 0. , 0.8, ..., 0. , 0. , 0. ],
       [0.2, 0. , 0.8, ..., 0. , 0. , 0. ]])

In [23]:
# Replace every entry of column '2024' with the three `dist_needs` labels whose
# positions have the highest values in that entry (ascending order, so the
# top-ranked label comes last).
# The whole column is assigned at once: the previous chained form
# `X_train.loc[cd]['2024'] = ...` may write to a temporary copy of the row, in
# which case the result is silently lost.
# NOTE: run once only -- afterwards the column holds labels, not scores.
X_train['2024']=X_train['2024'].map(
    lambda scores:[dist_needs[i] for i in np.argsort(scores)[-3:]]
)

In [251]:
# Take column '2024' as a plain list and round-trip it through disk; the
# with-blocks close the file handles that were previously left open.
Y_2024=list(X_train['2024'])
with open('static/Y_2024.pkd','wb') as y_2024_file:
    dill.dump(Y_2024,y_2024_file)
with open('static/Y_2024.pkd','rb') as y_2024_file:
    Y_2024=dill.load(y_2024_file)

In [281]:
# Attach a numeric district code to every row of the 2024 result.
# Y_2024 is a plain list at this point, and a list cannot take a `.cd`
# attribute (AttributeError); even on a DataFrame, attribute assignment would
# not create a column. Convert first, then assign a real 'cd' column.
# NOTE(review): the hard-coded codes are presumably in the same order as the
# rows of X_train -- confirm; pandas raises ValueError if the lengths differ.
Y_2024=pd.DataFrame(Y_2024)
Y_2024['cd']=list(range(301,304))+list(range(307,319))+list(range(201,212))+list(range(101,113))+list(range(401,415))+list(range(501,504))

In [59]:
# For every entry of column '2024', keep the positions of its three largest
# values (ascending order).
# NOTE(review): this only ranks scores if the column still holds numeric
# vectors; if it has already been replaced by label lists, strings are sorted.
y_2024 = [np.array(scores).argsort()[-3:] for scores in X_train['2024']]

In [60]:
# Same ranking for the fitted targets: each row of Y holds one vector, whose
# three largest positions are kept (ascending order).
y = [np.array(target_row[0]).argsort()[-3:] for target_row in Y]