<a href="https://colab.research.google.com/github/MarciaFG/skill-flow/blob/main/Flows_test_2000_2002_FOR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Academic Mobility Flows using BigQuery and Firebase**

Author: Marcia R. Ferreira (Complexity Science Hub Vienna & TU Wien)

Date: September 28, 2022

Input: Dimensions database on BigQuery

Output: Transformed CSV files

Other notes: 

# Colab Initialization

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime → "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Sun Jan  8 16:03:02 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    51W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ),\
       " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB"\
       .format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gputil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-py3-none-any.whl size=7409 sha256=d091ac909429cd836a0dbb22370ca5d36ee8cdc04fdce997340e77f593933fdf
  Stored in directory: /root/.cache/pip/wheels/ba/03/bb/7a97840eb54479b328672e15a536e49dc60da200fb21564d53
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Gen RAM Free: 87.8 GB  | Proc size: 93.3 MB
GPU RAM Free: 40536MB | Used: 0MB | Util   0% | Total 40536MB


In [None]:
# Run this to upload files from your computer into the Colab runtime.
# Whatever files.upload() returns is kept in `uploaded` for later cells.
from google.colab import files
uploaded = files.upload() 

In [None]:
# Mount Google Drive at /content/drive so later cells can read files from it.
from google.colab import drive
drive.mount('/content/drive')

# Smoke test: write a small text file into "My Drive" and print it back.
# NOTE(review): this leaves foo.txt behind in the user's Drive.
with open('/content/drive/My Drive/foo.txt', 'w') as f:
  f.write('Hello Google Drive!')
!cat /content/drive/My\ Drive/foo.txt

Mounted at /content/drive
Hello Google Drive!

# Import required libraries

In [3]:
import numpy as np
import requests
import pandas as pd
from tqdm import tqdm
import torch
import nltk
import matplotlib.pyplot as plt
#!pip install pynput
plt.style.use('ggplot')
%matplotlib inline
from google.cloud import bigquery
import humanize

In [4]:
%load_ext google.colab.data_table

In [5]:
# Provide your credentials to the runtime
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


# Download and Load Data from Google BigQuery

### Declare the Cloud project ID which will be used throughout this notebook

In [6]:
# Authentication already happened in the earlier auth.authenticate_user() cell.
# Declare your project name; it is passed to every %%bigquery cell via
# `--project $project_id` and to bigquery.Client(project=project_id).
project_id = "cshdimensionstest"

# Load the extension that provides the %%bigquery cell magic used below.
%load_ext google.cloud.bigquery

# Set up query parameters, e.g. for a specific journal; the test query
# below references this value as @journal_id.
bq_params = {}
bq_params["journal_id"] = "jour.1115214"

In [7]:
# Smoke test: fetch the most recently inserted record of the journal given in bq_params to check that the BigQuery connection works
%%bigquery --params $bq_params --project $project_id 

select distinct 
  journal.id, journal.title, journal.issn, journal.eissn, publisher.name, date_inserted
from `dimensions-ai.data_analytics.publications` 
where  journal.id = @journal_id
and publisher is not null
order by date_inserted desc
limit 1

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,title,issn,eissn,name,date_inserted
0,jour.1115214,Nature Biotechnology,1087-0156,1546-1696,Springer Nature,2023-01-03 19:27:46+00:00


### OK, it works — let's start!

# **1. Extract Dimensions Data from Google BigQuery**
This script extracts test data for Liu.

In [None]:
#@title Hidden cell
%%bigquery --project $project_id 
# Remove any earlier copy of the base table so the create-table cell that follows can be re-run.

drop table if exists `cshdimensionstest.test.basic_2000_2002`

In [None]:
# Constructing the mobility flows for the FOR categorization

%%bigquery --project $project_id 

# Base table: one row per combination of publication, researcher id,
# research organisation and second-level FOR category, for 2000-2002.
# NOTE(review): the three arrays are unnested independently, so every
# researcher on a paper is paired with every organisation and every category
# of that paper. Confirm this is intended rather than per-author affiliations.
create table cshdimensionstest.test.basic_2000_2002 as 
select   p.id,
         p.year,
         p.date,
         researcher_ids,
         research_orgs,
         category_for.name,
         category_for.id as cat_id
from     `dimensions-ai.data_analytics.publications` p
        # the `is not null` filters below discard the NULL rows a left join
        # would add, so these joins keep only publications with all three values
        left join unnest(p.researcher_ids) researcher_ids
        left join unnest(p.research_orgs) research_orgs
        left join unnest(p.category_for.second_level.FULL) category_for
where    researcher_ids is not null
and      research_orgs is not null
and      category_for.name is not null
and      category_for.id is not null
and      year between 2000 and 2002
order by p.id;

In [None]:
#@title Hidden cell
%%bigquery --project $project_id 
# Preview ten rows of the base table.
select * from `cshdimensionstest.test.basic_2000_2002`
limit 10


Unnamed: 0,id,year,date,researcher_ids,research_orgs,name,cat_id
0,pub.1000000033,2002,2002-01,ur.013211012560.00,grid.4643.5,Biomedical Engineering,2837
1,pub.1000000033,2002,2002-01,ur.01300514341.04,grid.4643.5,Materials Engineering,2921
2,pub.1000000033,2002,2002-01,ur.0743476073.71,grid.8982.b,Biomedical Engineering,2837
3,pub.1000000033,2002,2002-01,ur.0743476073.71,grid.4643.5,Materials Engineering,2921
4,pub.1000000033,2002,2002-01,ur.01073301374.46,grid.4643.5,Materials Engineering,2921
5,pub.1000000033,2002,2002-01,ur.0671533174.24,grid.4643.5,Materials Engineering,2921
6,pub.1000000033,2002,2002-01,ur.0671533174.24,grid.4643.5,Biomedical Engineering,2837
7,pub.1000000033,2002,2002-01,ur.01164265741.55,grid.8982.b,Materials Engineering,2921
8,pub.1000000033,2002,2002-01,ur.01300514341.04,grid.8982.b,Biomedical Engineering,2837
9,pub.1000000033,2002,2002-01,ur.0671533174.24,grid.8982.b,Materials Engineering,2921


In [None]:
#@title Hidden cell
%%bigquery --project $project_id 
# Remove any earlier copy of the sequence table so step (1) can be re-run.
drop table if exists `cshdimensionstest.test.sequence_00_02`

In [None]:
%%bigquery --project $project_id 
# step (1)
# Construct the trajectories of researchers: number each researcher's
# distinct publication years consecutively, t = 1 being the earliest year.

create table cshdimensionstest.test.sequence_00_02 as
select
    researcher_ids,
    year,
    dense_rank() over (partition by researcher_ids order by year) as t
from (
    # one row per researcher and year before ranking
    select distinct researcher_ids, year
    from `cshdimensionstest.test.basic_2000_2002`
)
order by researcher_ids, year, t;

In [None]:
#@title Hidden cell
%%bigquery --project $project_id 
# Remove any earlier copy of the affiliation-weight table so step (2) can be re-run.
drop table if exists `cshdimensionstest.test.affweight_00_02`

In [None]:
%%bigquery --project $project_id 
# step (2)
# Affiliation weight per researcher and publication: 1 divided by the number
# of distinct organisations listed, so simultaneous affiliations share one unit.
create table cshdimensionstest.test.affweight_00_02 as
select
    researcher_ids,
    id,
    1.0 / count(distinct research_orgs) as aff_weight
from `cshdimensionstest.test.basic_2000_2002`
group by researcher_ids, id
order by researcher_ids, id;

In [None]:
#@title Hidden cell
%%bigquery --project $project_id 
# Remove any earlier copy of the merged table so step (3) can be re-run.
drop table if exists `cshdimensionstest.test.psequence_weight_00_02`

In [None]:
%%bigquery --project $project_id 

# step (3)
# Attach the period index t (step 1) and the affiliation weight (step 2)
# to every row of the base table.
# consider using a subquery to combine these 3 steps

create table cshdimensionstest.test.psequence_weight_00_02 as
select
    pubs.researcher_ids,
    pubs.id,
    pubs.name,
    pubs.cat_id,
    pubs.year,
    pubs.research_orgs,
    seq.t,
    wts.aff_weight
from `cshdimensionstest.test.basic_2000_2002` as pubs
inner join `cshdimensionstest.test.sequence_00_02` as seq
    on  pubs.researcher_ids = seq.researcher_ids
    and pubs.year = seq.year
inner join `cshdimensionstest.test.affweight_00_02` as wts
    on  wts.researcher_ids = pubs.researcher_ids
    and wts.id = pubs.id
order by seq.researcher_ids, seq.year, seq.t;

In [8]:
#@title Hidden cell
%%bigquery --project $project_id 
# Preview ten rows of the merged table (period index t and aff_weight attached).
select * from `cshdimensionstest.test.psequence_weight_00_02` limit 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,researcher_ids,id,name,cat_id,year,research_orgs,t,aff_weight
0,ur.015273035727.44,pub.1074658796,Medical Microbiology,3114,2000,grid.507311.1,1,1.0
1,ur.015273042456.93,pub.1064458334,Interdisciplinary Engineering,2953,2002,grid.418677.b,1,0.5
2,ur.015273042456.93,pub.1064458334,Interdisciplinary Engineering,2953,2002,grid.462844.8,1,0.5
3,ur.015273043321.18,pub.1068145884,Interdisciplinary Engineering,2953,2000,grid.43169.39,1,0.25
4,ur.015273043321.18,pub.1068145884,Interdisciplinary Engineering,2953,2000,grid.12981.33,1,0.25
5,ur.015273043321.18,pub.1068145884,Interdisciplinary Engineering,2953,2000,grid.24516.34,1,0.25
6,ur.015273043321.18,pub.1068145884,Interdisciplinary Engineering,2953,2000,grid.177174.3,1,0.25
7,ur.015273043321.18,pub.1004192170,Interdisciplinary Engineering,2953,2001,grid.43169.39,2,0.5
8,ur.015273043321.18,pub.1062069183,Interdisciplinary Engineering,2953,2001,grid.43169.39,2,1.0
9,ur.015273043321.18,pub.1004192170,Interdisciplinary Engineering,2953,2001,grid.16890.36,2,0.5


In [None]:
#@title Hidden cell
%%bigquery --project $project_id 
# Remove any earlier copy of the origin/destination table so step (4) can be re-run.
drop table if exists `cshdimensionstest.test.origin_institution_00_02`

In [None]:
%%bigquery --project $project_id 

# step (4)
# Label every row as origin or destination: rows of a researcher's first
# period (t = 1) are 'origin', rows of all other periods are 'destination'.
create table cshdimensionstest.test.origin_institution_00_02 as
select
    *,
    if(t = 1, 'origin', 'destination') as od
from `cshdimensionstest.test.psequence_weight_00_02`
order by researcher_ids, year, t;

In [None]:
#@title Hidden cell
%%bigquery --project $project_id 
# Remove any earlier copy of the first-publication table so step (5) can be re-run.
drop table if exists `cshdimensionstest.test.first_pub_00_02`

In [None]:
%%bigquery --project $project_id 

# step (5)
# First publication(s) of each researcher: the rows of period t = 1, laid out
# like a flow row whose origin side is the placeholder period "0".
# t1 is stored as the string "0" here; step (7) casts it to an integer.
create table cshdimensionstest.test.first_pub_00_02 as
select distinct
    researcher_ids,
    id            as pub2,
    cat_id        as field1,
    cat_id        as field2,
    research_orgs as unit2,
    "0"           as t1,
    year          as p1,
    t             as t2,
    year          as p2,
    aff_weight,
    'started in'  as mobility_type
from `cshdimensionstest.test.origin_institution_00_02`
where t = 1;

In [None]:
#@title Hidden cell
%%bigquery --project $project_id 
# Remove any earlier copy of the flows table so step (6) can be re-run.
drop table if exists `cshdimensionstest.test.flows_00_02`

In [None]:
%%bigquery --project $project_id 

# step (6)
# Construct the flows at the institutional level: self-join the weighted
# sequence table so that each row of a researcher's period t (origin side)
# is paired with each of the same researcher's rows in period t + 1.
# The weight kept is the one of the destination-side row.
create table cshdimensionstest.test.flows_00_02 as
select
    src.researcher_ids,
    src.id            as pub1,
    src.cat_id        as field1,
    dst.id            as pub2,
    dst.cat_id        as field2,
    src.research_orgs as unit1,
    dst.research_orgs as unit2,
    src.t             as t1,
    src.year          as p1,
    dst.t             as t2,
    dst.year          as p2,
    dst.aff_weight
from `cshdimensionstest.test.psequence_weight_00_02` as src
inner join `cshdimensionstest.test.psequence_weight_00_02` as dst
    on  src.researcher_ids = dst.researcher_ids
    and dst.t = src.t + 1      # consecutive periods only
order by src.researcher_ids, src.t, dst.t;

In [None]:
#@title Hidden cell
%%bigquery --project $project_id 
# Remove any earlier copy of the combined table so step (7) can be re-run.
drop table if exists `cshdimensionstest.test.flows_with_start_00_02`

In [None]:
%%bigquery --project $project_id

# step (7)
# Stack the period-to-period flows and the 'started in' first-publication
# rows into one table. First-publication rows have no origin, so their
# origin publication and unit are filled with the placeholder 'void'.

create table cshdimensionstest.test.flows_with_start_00_02 as
select
    researcher_ids,
    pub1,
    pub2,
    field1,
    field2,
    unit1,
    unit2,
    cast(t1 as int) as t1,
    cast(p1 as int) as p1,
    t2,
    p2,
    aff_weight,
    case
        when unit1 = unit2  then 'stayed in'
        when unit1 != unit2 then 'moved to'
        else 'error'    # neither comparison is true, e.g. a unit is NULL
    end as mobility_type
from `cshdimensionstest.test.flows_00_02`
union all
select
    researcher_ids,
    'void' as pub1,
    pub2,
    field1,
    field2,
    'void' as unit1,
    unit2,
    cast(t1 as int) as t1,
    cast(p1 as int) as p1,
    t2,
    p2,
    aff_weight,
    mobility_type
from `cshdimensionstest.test.first_pub_00_02`
order by researcher_ids, t2;

In [None]:
#@title Hidden cell
%%bigquery --project $project_id 
# Sanity check: list rows whose origin unit is NULL (first-publication rows carry 'void' instead).
select * from `cshdimensionstest.test.flows_with_start_00_02` where unit1 is null limit 10
# Good: the query returned no rows, so there are no NULL origin units in the data.

In [None]:
#@title Hidden cell
%%bigquery --project $project_id

# Preview the first ten flow rows ordered by researcher and origin/destination year.
select * from `cshdimensionstest.test.flows_with_start_00_02` 
order by researcher_ids, p1, p2
limit 10 

Unnamed: 0,researcher_ids,pub1,pub2,field1,field2,unit1,unit2,t1,p1,t2,p2,aff_weight,mobility_type
0,ur.01000000143.58,void,pub.1055163401,2581,2581,void,grid.17091.3e,0,2000,1,2000,1.0,started in
1,ur.01000000143.58,void,pub.1007920533,2581,2581,void,grid.17091.3e,0,2000,1,2000,0.5,started in
2,ur.01000000143.58,void,pub.1007920533,2581,2581,void,grid.417570.0,0,2000,1,2000,0.5,started in
3,ur.01000000143.58,pub.1007920533,pub.1053189419,2581,2581,grid.417570.0,grid.17089.37,1,2000,2,2001,0.333333,moved to
4,ur.01000000143.58,pub.1007920533,pub.1053189419,2581,2581,grid.17091.3e,grid.17089.37,1,2000,2,2001,0.333333,moved to
5,ur.01000000143.58,pub.1007920533,pub.1053189419,2581,2581,grid.17091.3e,grid.17091.3e,1,2000,2,2001,0.333333,stayed in
6,ur.01000000143.58,pub.1007920533,pub.1053189419,2581,2581,grid.417570.0,grid.31501.36,1,2000,2,2001,0.333333,moved to
7,ur.01000000143.58,pub.1055163401,pub.1053189419,2581,2581,grid.17091.3e,grid.17089.37,1,2000,2,2001,0.333333,moved to
8,ur.01000000143.58,pub.1055163401,pub.1053189419,2581,2581,grid.17091.3e,grid.31501.36,1,2000,2,2001,0.333333,moved to
9,ur.01000000143.58,pub.1007920533,pub.1053189419,2581,2581,grid.17091.3e,grid.31501.36,1,2000,2,2001,0.333333,moved to


In [None]:
%%bigquery --project $project_id

create table cshdimensionstest.test.aggregated_moved_to_00_02 as
select  unit1 as geoid_o
      , unit2 as geoid_d
      , field1 as catid_o
      , field2 as catid_d
      , p1 as date_o
      , p2 as date_d
      , '2000-2002' as date_range
      , sum(aff_weight) as weighted_flows
      , count(researcher_ids) as flows
from `cshdimensionstest.test.flows_with_start_00_02` 
where mobility_type = 'moved to'
group by unit1,unit2, field1,field2,  p1,p2

In [None]:
%%bigquery --project $project_id
# Preview ten aggregated 'moved to' edges, ordered by origin organisation, years and origin field.
select * from `cshdimensionstest.test.aggregated_moved_to_00_02`
order by geoid_o, date_o, date_d, catid_o
limit 10

Unnamed: 0,geoid_o,geoid_d,catid_o,catid_d,date_o,date_d,date_range,weighted_flows,flows
0,grid.1001.0,grid.28312.3a,2330,2921,2000,2001,2000-2002,0.333333,1
1,grid.1001.0,grid.11355.33,2330,2409,2000,2001,2000-2002,3.166667,8
2,grid.1001.0,grid.6612.3,2330,2389,2000,2001,2000-2002,0.5,1
3,grid.1001.0,grid.4991.5,2330,2330,2000,2001,2000-2002,1.666667,3
4,grid.1001.0,grid.1003.2,2330,2344,2000,2001,2000-2002,0.5,1
5,grid.1001.0,grid.1005.4,2330,2353,2000,2001,2000-2002,0.5,1
6,grid.1001.0,grid.483427.e,2330,2409,2000,2001,2000-2002,0.666667,2
7,grid.1001.0,grid.425004.7,2330,2409,2000,2001,2000-2002,0.5,1
8,grid.1001.0,grid.1022.1,2330,2953,2000,2001,2000-2002,2.0,4
9,grid.1001.0,grid.37172.30,2330,2921,2000,2001,2000-2002,0.333333,1


In [None]:
from google.cloud import bigquery

# Download the aggregated 'moved to' edges into a pandas DataFrame
# and show the first ten rows.
sql = """
  SELECT *
  FROM `cshdimensionstest.test.aggregated_moved_to_00_02` 
  order by geoid_o, date_o, date_d, catid_o
"""
client = bigquery.Client(project=project_id)
edges_job = client.query(sql)
movedto_edges = edges_job.to_dataframe()
movedto_edges.head(10)

Unnamed: 0,geoid_o,geoid_d,catid_o,catid_d,date_o,date_d,date_range,weighted_flows,flows
0,grid.1001.0,grid.5292.c,2330,2344,2000,2001,2000-2002,0.333333,1
1,grid.1001.0,grid.97008.36,2330,2953,2000,2001,2000-2002,0.833333,2
2,grid.1001.0,grid.34421.30,2330,2401,2000,2001,2000-2002,1.833333,3
3,grid.1001.0,grid.49100.3c,2330,2867,2000,2001,2000-2002,0.5,2
4,grid.1001.0,grid.418228.5,2330,2389,2000,2001,2000-2002,0.25,1
5,grid.1001.0,grid.1019.9,2330,2867,2000,2001,2000-2002,0.333333,1
6,grid.1001.0,grid.11762.33,2330,2330,2000,2001,2000-2002,0.333333,1
7,grid.1001.0,grid.251924.9,2330,2933,2000,2001,2000-2002,0.5,1
8,grid.1001.0,grid.5596.f,2330,2867,2000,2001,2000-2002,0.5,1
9,grid.1001.0,grid.9619.7,2330,2330,2000,2001,2000-2002,2.5,5


In [None]:
# Write the edge list to a CSV file in the runtime's working directory and
# hand it to the browser as a download.
# NOTE: to_csv is called with its defaults, so the DataFrame index is written
# as an extra first column.
from google.colab import files
movedto_edges.to_csv('movedto_edges.csv')
files.download('movedto_edges.csv')

Mounted at /content/gdrive


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Compute Indicators**

**For each institution id and year we compute the following basic indicators:**


1.   Institution id
2.   Year
3. pcp
4. workforce
5. net mobility
6. avg academic age
7. total author inflow
8. total author outflow

In [None]:
%%bigquery --project $project_id

# calculating outflows for each institution
# `create or replace` keeps this cell re-runnable: there is no drop-table cell
# for this table, so a plain `create table` fails on a second run.
create or replace table cshdimensionstest.test.total_outflows_00_02 as
  select  geoid_o   # sending institution
        , catid_o   # sending field
        , date_d    # year of the move: the year of the destination (later) publication
        , date_range
        , sum(flows) as t_outflows   
        , sum(weighted_flows)  as t_weightedOutflows 
  from `cshdimensionstest.test.aggregated_moved_to_00_02`
  group by geoid_o, catid_o, date_d, date_range

In [None]:
%%bigquery --project $project_id

# calculating inflows for each institution
# `create or replace` keeps this cell re-runnable: there is no drop-table cell
# for this table, so a plain `create table` fails on a second run.
create or replace table cshdimensionstest.test.total_inflows_00_02 as
  select  geoid_d    # receiving institution
        , catid_d    # receiving field
        , date_d     # year of the move: the year of the destination (later) publication
        , date_range
        , sum(flows) as inflows   
        , sum(weighted_flows)  as weightedInflows
  from `cshdimensionstest.test.aggregated_moved_to_00_02`
  group by geoid_d, catid_d, date_d, date_range


In [11]:
from google.cloud import bigquery

# Download the per-institution inflow and outflow tables into pandas.
# A single client serves both queries.
client = bigquery.Client(project=project_id)

# Inflows: one row per receiving institution, field and year.
sql = """
  SELECT *
  FROM `cshdimensionstest.test.total_inflows_00_02` 
"""
inflows = client.query(sql).to_dataframe()

# Outflows: one row per sending institution, field and year.
sql = """
  SELECT *
  FROM `cshdimensionstest.test.total_outflows_00_02` 
"""
outflows = client.query(sql).to_dataframe()

In [None]:
#@title Hidden Cell
# Preview the inflows ordered by receiving institution, field and year.
inflows.sort_values(["geoid_d", "catid_d", "date_d"]).head(10)

Unnamed: 0,geoid_d,catid_d,date_d,date_range,inflows,weightedInflows
4666,grid.1001.0,2330,2001,2000-2002,208,107.833333
4759,grid.1001.0,2330,2002,2000-2002,234,162.666667
7252,grid.1001.0,2344,2001,2000-2002,537,272.583333
9142,grid.1001.0,2344,2002,2000-2002,419,241.666667
12230,grid.1001.0,2353,2001,2000-2002,96,63.5
11395,grid.1001.0,2353,2002,2000-2002,29,10.0
16908,grid.1001.0,2358,2001,2000-2002,212,150.5
16441,grid.1001.0,2358,2002,2000-2002,380,159.916667
18109,grid.1001.0,2366,2002,2000-2002,43,37.0
20890,grid.1001.0,2377,2001,2000-2002,19262,2502.594061


In [None]:
#@title Hidden Cell
# Preview the outflows ordered by sending institution, field and year.
outflows.sort_values(["geoid_o", "catid_o", "date_d"]).head(10)

Unnamed: 0,geoid_o,catid_o,date_d,date_range,t_outflows,t_weightedOutflows
4761,grid.1001.0,2330,2001,2000-2002,501,243.75
3223,grid.1001.0,2330,2002,2000-2002,311,154.666667
9245,grid.1001.0,2344,2001,2000-2002,498,243.5
9032,grid.1001.0,2344,2002,2000-2002,684,354.423077
11374,grid.1001.0,2353,2001,2000-2002,37,18.5
12302,grid.1001.0,2353,2002,2000-2002,187,75.50641
15729,grid.1001.0,2358,2001,2000-2002,346,181.166667
14254,grid.1001.0,2358,2002,2000-2002,275,113.044872
17733,grid.1001.0,2366,2001,2000-2002,48,20.333333
17388,grid.1001.0,2366,2002,2000-2002,23,9.5


In [12]:
# Merge the inflows and outflows dataframes into one indicator table.
# The outer merge keeps institution/field/year combinations that appear on
# only one side (only inflows or only outflows).
result = pd.merge(inflows
                  , outflows
                  , how="outer"
                  , left_on=["geoid_d", "catid_d", "date_d"]
                  , right_on=["geoid_o", "catid_o", "date_d"]
                  ).reset_index(drop = True)


def diff(a, b):
    """Return b - a. Not used in this cell; kept so existing callers keep working."""
    return b - a


# A combination missing on one side has no recorded moves there, i.e. zero
# flows. Without this the subtraction below yields NaN instead of a net value.
flow_cols = ['inflows', 'weightedInflows', 't_outflows', 't_weightedOutflows']
result[flow_cols] = result[flow_cols].fillna(0)

# date_range exists in both inputs, so the merge suffixes it with _x/_y;
# fall back to the outflow side where the inflow side is missing.
result['date_range_x'] = result['date_range_x'].fillna(result['date_range_y'])

result["net_mobility"] = result['inflows'] - result['t_outflows']
result["weighted_net_mobility"] = result['weightedInflows'] - result['t_weightedOutflows']

# The rename key used to be ' t_ouflows' (leading space, misspelt), so the
# column was never renamed; it is now exported as 'outflows'.
flow_ind = result.rename(columns = {'date_d': 'MoveYear'
                         , 't_outflows': 'outflows'
                         , 't_weightedOutflows': 'weightedOutflows'
                         , 'date_range_x':'Range'
                         , 'net_mobility':'NetFlows'
                         , 'weighted_net_mobility': 'WeightedNetFlows'}) \
                         [[  'geoid_d', 'catid_d', 'inflows', 'weightedInflows'
                           , 'geoid_o', 'catid_o', 'outflows', 'weightedOutflows'
                           , 'NetFlows', 'WeightedNetFlows', 'MoveYear', 'Range']]

# store dataset directly into GBQ and DRIVE
# store in drive
flow_ind.to_csv('2023_01_08_flows_output.csv', encoding = 'utf-8-sig') 

# store in GBQ
# import pandas_gbq

# table_id = 'test.2023_01_08_flows_output'
# pandas_gbq.to_gbq(flow_ind, table_id, project_id=project_id)

# Last expression of the cell so the preview is actually displayed.
flow_ind.sort_values(["geoid_o", "catid_o", "MoveYear"]).head(10)


# **Improvements to the code:**


*   Load data directly into a Google Cloud Storage bucket as CSV
*   Optimize and simplify queries



1. **Make a hello world program**
1. **Connect resources to each other:**
 e.g., can I print the GBQ data in a website (print=show any table) for instance?
1. **Other considerations**
* how to run queries fast enough (users should not have delays)
* what does the interface look like
* how to put all calculations in one query?
* how to connect the web interface to google bigquery?
* what if multiple users use it? performance?


# **3. Set up APACHE CASSANDRA** 

Before you can continue, you need to set up a project:


In [None]:
# Install the cassandra-driver package and check that it can be imported.
!pip install cassandra-driver
import cassandra

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cassandra-driver
  Downloading cassandra_driver-3.25.0-cp37-cp37m-manylinux1_x86_64.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 4.7 MB/s 
[?25hCollecting geomet<0.3,>=0.1
  Downloading geomet-0.2.1.post1-py3-none-any.whl (18 kB)
Installing collected packages: geomet, cassandra-driver
Successfully installed cassandra-driver-3.25.0 geomet-0.2.1.post1


### Creating a connection

In [None]:
!mkdir python_project
!cd python_project
!touch connect_database.py

In [None]:
# List the working directory to confirm that connect_database.py and python_project exist.
!ls 

connect_database.py  drive  python_project  sample_data


In [None]:
# Copy the following connection code into the connect_database.py file:
import os
from getpass import getpass

from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

# Location of the secure-connect bundle on the mounted Google Drive.
cloud_config= {
         'secure_connect_bundle': '/content/drive/My Drive/secure-connect-skill-flow.zip'
}

# SECURITY: the client id and secret used to be hard-coded in this cell.
# Anything committed with the notebook must be treated as leaked: revoke that
# token and issue a new one. Credentials are now read from the environment,
# with an interactive prompt as fallback (the prompt needs a notebook or
# terminal; set the environment variables when running as a script).
astra_client_id = os.environ.get('ASTRA_CLIENT_ID') or getpass('Astra client id: ')
astra_client_secret = os.environ.get('ASTRA_CLIENT_SECRET') or getpass('Astra client secret: ')

auth_provider = PlainTextAuthProvider(astra_client_id, astra_client_secret)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session = cluster.connect()

# Connectivity check: read the server's release version from system.local.
row = session.execute("select release_version from system.local").one()
if row:
      print(row[0])
else:
      print("An error occurred.")

ERROR:cassandra.connection:Closing connection <LibevConnection(140555243238096) 165fae4e-ef55-40b9-aece-d0fe90662c4d-europe-west1.db.astra.datastax.com:29042:4d87acac-9a17-4d9f-bc9e-fabb707432d7> due to protocol error: Error from server: code=000a [Protocol error] message="Beta version of the protocol used (5/v5-beta), but USE_BETA flag is unset"


4.0.0.6816


In [None]:
# Run connect_database.py with the runtime's Python interpreter.
# NOTE(review): the file was only created with `touch` earlier in this notebook;
# unless the connection code was pasted into it by hand, this runs an empty script.
!python ./connect_database.py

# **4. Set up Mobility Webtool and Connection to CSH Server**