In [75]:
# imports
from google.cloud import spanner
import os
from shapely.geometry import Point
from matplotlib import pyplot as plt
from shapely import geometry
import pandas as pd
import gcsfs
import json
from pprint import pprint
import csv

# Spanner handles: identifiers first, then client -> instance -> database
spanner_instance_id = 'tfgen-spanid-20220525101635457'
spanner_database_id = 'metadata'
spanner_client = spanner.Client()
instance = spanner_client.instance(spanner_instance_id)
database = instance.database(spanner_database_id)

# GCS file system handle, used below to open the mapping CSV and GeoJSON files
fs = gcsfs.GCSFileSystem(project='soil-health-card-india')

In [76]:
# state code -> stateid value used to filter the Spanner queries
state_ids = {
  'ga': 30,
  'mh': 27,
  'or': 21,
  'rj': 8,
  'sk': 11,
}

# state code -> number of GeoJSON files (<state>1.geojson ... <state>N.geojson)
num_gjs = {
  'ga': 1,
  'mh': 2,
  'or': 2,
  'rj': 4,
  'sk': 1,
}

# state processed by every cell below; change this to switch state
curr_state = 'ga'

In [77]:
# load the village mapping CSV of the selected state from GCS
mapping_path = f'anthrokrishi-shcs/ShapeFiles/{curr_state}.csv'
with fs.open(mapping_path) as file:
  data = pd.read_csv(file)

# plain list of rows; only the first two columns are used
data = data.values.tolist()

# first column (looked up with CEN_2001 codes later) -> second column (village id)
cen_to_vid = {}
for row in data:
  cen_to_vid[int(row[0])] = int(row[1])

In [None]:
# get shape objects of villages
# Reads every GeoJSON file of the current state, stores one shapely geometry
# per village id in `village_shape`, and draws all village outlines.

filenames = [f'{curr_state}{x}.geojson' for x in range(1, num_gjs[curr_state] + 1)]
village_shape = {}  # village id: village shape

fig, ax = plt.subplots()
for filename in filenames:
  with fs.open(f'anthrokrishi-shcs/ShapeFiles/{filename}') as file:
    village_gj = json.load(file)

  for village in village_gj['features']:
    cen_2001 = int(village['properties']['CEN_2001'])
    # NOTE(review): every village missing from the mapping lands on key 0, so
    # they overwrite each other; key 0 is also the state-total key elsewhere
    # in this notebook. Kept as-is -- confirm this is intended.
    village_id = cen_to_vid.get(cen_2001, 0)
    shape = geometry.shape(village['geometry'])
    village_shape[village_id] = shape

    # A MultiPolygon has no `.exterior`; draw each polygon part on its own.
    # Single geometries have no `.geoms`, so they are wrapped in a list.
    for part in getattr(shape, 'geoms', [shape]):
      if hasattr(part, 'exterior'):  # skip non-polygon parts
        ax.plot(*part.exterior.xy)
plt.show()

In [None]:
# sanity check: number of distinct village ids that received a shape
print(len(village_shape))

In [80]:
# get villageids from spanner
with database.snapshot() as snapshot:
  # execute_sql returns a streamed result set; read it fully while the
  # snapshot context is still open instead of after it has been released.
  data = list(snapshot.execute_sql(
    f"""SELECT villageid, villagename
    FROM villages_view
    WHERE stateid = {state_ids[curr_state]}"""
  ))

# village id -> village name, plus a state-level entry under the reserved key 0
village_names = {village_id: village_name for village_id, village_name in data}
village_names[0] = curr_state
# sort by id so the state-level entry (key 0) comes first
village_names = dict(sorted(village_names.items()))

In [81]:
# number of entries in village_names: the villages plus the state-level entry under key 0
print(len(village_names))

433


In [82]:
# get progress stats for state

def _village_counts(snapshot, table, state_id, extra_condition=None):
  """Return the per-village row counts of `table` for one state.

  Args:
    snapshot: an open Spanner snapshot to run the query on.
    table: name of the table to count rows in (`Cards` or `Cards_info`).
    state_id: value the `stateid` column must equal.
    extra_condition: optional SQL predicate AND-ed onto the state filter.

  Returns:
    The fully materialised list of result rows, one `villageid, count` row
    per village, ordered by villageid.
  """
  condition = f' and {extra_condition}' if extra_condition else ''
  rows = snapshot.execute_sql(
    f"""SELECT villageid, count(*) from {table} where stateid={state_id}{condition}
    group by 1 order by 1"""
  )
  # consume the stream here, while the snapshot is still open
  return list(rows)


def _with_state_total(rows):
  """Map village id -> count, and store the sum of all counts under key 0."""
  counts = {row[0]: row[1] for row in rows}
  counts[0] = sum(row[1] for row in rows)
  return counts


# One multi-use snapshot so that all four counts are read at the same
# timestamp; each result is consumed before the next query is issued.
with database.snapshot(multi_use=True) as snapshot:
  data1 = _village_counts(snapshot, 'Cards', state_ids[curr_state])
  data2 = _village_counts(snapshot, 'Cards', state_ids[curr_state], 'ingested is true')
  data3 = _village_counts(snapshot, 'Cards_info', state_ids[curr_state])
  data4 = _village_counts(snapshot, 'Cards_info', state_ids[curr_state], 'latitude is not null')

card_total = _with_state_total(data1)           # all rows in Cards
card_scraped = _with_state_total(data2)         # Cards rows with ingested = true
card_extracted = _with_state_total(data3)       # all rows in Cards_info
card_geopos_present = _with_state_total(data4)  # Cards_info rows with a latitude

In [83]:
# for vil in sk_v_geopos.keys():
#   if not sk_shape.get(vil):
#     print(sk_v_geopos[vil])

In [84]:
# fetch (villageid, latitude, longitude) for every Cards_info row of the
# current state that has a latitude
with database.snapshot() as snapshot:
  # execute_sql returns a streamed result set; read it fully while the
  # snapshot context is still open instead of after it has been released.
  points = list(snapshot.execute_sql(
    f"""SELECT villageid, latitude, longitude
    FROM Cards_info
    WHERE latitude is not null
    AND stateid = {state_ids[curr_state]}
    ORDER BY 1"""
  ))

In [85]:
# number of Cards_info rows fetched for this state that have a latitude
print(len(points))

1133


In [86]:
# organise: group the fetched coordinates by village id
print(f'count: {len(points)}')

# start with an empty bucket for every known village id (including key 0)
points_village = {village_id: [] for village_id in village_names}

# row layout is (villageid, latitude, longitude); keep everything after the id
for row in points:
  points_village[row[0]].append(row[1:])

print(len(points_village))

count: 1133
433


In [87]:
# Distance thresholds; index 0 means "inside the village shape".
# NOTE(review): presumably kilometres, given the *111 factor below -- confirm.
paddings = [0,2,5,10,20,50]

# in_village[i][village id] -> number of points within paddings[i] of the shape;
# key 0 holds the running total over all villages
in_village = [dict.fromkeys(village_names, 0) for _ in paddings]

for village_id, coords in points_village.items():
  shape = village_shape.get(village_id)
  if not shape:
    # no (non-empty) shape for this village: nothing to compare against
    continue

  for lat, lon in coords:
    # shapely points are (x, y), i.e. (longitude, latitude)
    location = Point(lon, lat)

    if shape.contains(location):
      in_village[0][village_id] += 1
      continue

    # distance is in coordinate units; 111 looks like a degrees -> km factor
    # (approximation) -- TODO confirm
    km_away = location.distance(shape)*111
    # first threshold (skipping index 0) that the point falls under, if any
    bucket = next(
      (idx for idx in range(1, len(paddings)) if km_away < paddings[idx]),
      None,
    )
    if bucket is not None:
      in_village[bucket][village_id] += 1

  # turn the per-bucket counts into cumulative counts and add them to the total
  for idx in range(1, len(paddings)):
    in_village[idx][village_id] += in_village[idx-1][village_id]
    in_village[idx][0] += in_village[idx][village_id]
  in_village[0][0] += in_village[0][village_id]

In [88]:
# Tab-separated progress table, one row per village (key 0 = whole state).
# Columns: village id, village name, total cards, total scraped, total extracted,
# total geopos present, then for every entry of `in_village` (one per padding):
# the cumulative count and that count divided by the geopos-present total.
for v_id in village_names.keys():
  geopos_present = card_geopos_present.get(v_id, 0)
  # Key 0 is always present and is 0 when the state has no geotagged cards;
  # divide by 1 in that case instead of raising ZeroDivisionError.
  denominator = geopos_present or 1

  columns = [
    v_id,
    village_names[v_id],
    card_total.get(v_id, 0),
    card_scraped.get(v_id, 0),
    card_extracted.get(v_id, 0),
    geopos_present,
  ]
  for counts_for_padding in in_village:
    columns.append(counts_for_padding[v_id])
    columns.append(counts_for_padding[v_id] / denominator)

  print('\t'.join(str(column) for column in columns))

0	ga	5620	2905	2905	1133	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
252057	Pernem	40	0	0	0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
252061	Mapusa	48	0	0	0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
252071	Panaji(Mc)	5	0	0	0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
252075	Bicholim	150	0	0	0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
252077	Sanquelim	43	11	11	0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
252079	Valpoi	235	24	24	14	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
252082	Ponda	144	4	4	0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
252084	Mormugao	32	0	0	0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
252087	Margao	127	0	0	0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
252095	Cuncolim	111	0	0	0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
252096	Curchorem Cacora	77	0	0	0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
252097	Quepem	424	21	21	7	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
252099	Sanguem	374	316	316	2	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
626635	Tiracol	0	0	0	0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
626636	Querim	0	0	0	0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0	0	0.0
626637	Paliem	0