<a href="https://colab.research.google.com/github/digital-science/dimensions-api-lab/blob/master/3-workshops/2019-04-Technical-University-of-Denmark/10-Industry.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open Dimensions API Lab In Google Colab"/></a>

## Using GRID and Dimensions together to identify Industry collaboration


Because Dimensions uses external identifiers for institutions, you can integrate external data from GRID.ac with Dimensions queries.

In this example we download the GRID dataset and identify all institutions whose type is `Company`. The resulting list of company GRID IDs is then used to find the publications that a given institution has co-authored with industry.

In [None]:
import pandas as pd
from dimcli.shortcuts import dslquery_json as dslquery
import zipfile as zf
import io
import requests

## First download the GRID dataset, and extract the types.csv file
You can get the GRID dataset from https://grid.ac/downloads.
Once unzipped, put the `types.csv` file in your notebook directory...

In [None]:
#grid_types = pd.read_csv("types.csv")

## You can also download the latest release directly from Figshare...


In [None]:
# Fetch the article listing of the Figshare collection and order it
# by publication date, most recent first.
figshare_listing = requests.get(
    "https://api.figshare.com/v2/collections/3812929/articles"
).json()
grid_versions = pd.DataFrame(figshare_listing).sort_values(
    'published_date', ascending=False
)

grid_versions.head(2)

In [None]:
# grid_versions is sorted newest-first, so select the first row by POSITION.
# (.loc[1] selects the row whose index *label* is 1, which is not the newest
# release once the frame has been re-sorted.)
latest_release = grid_versions.iloc[0]

release_response = requests.get(latest_release['url_public_api'])
release_response.raise_for_status()  # fail loudly instead of parsing an error payload
# NOTE(review): assumes the first file attached to the release is the zip archive — confirm
grid_download_url = release_response.json()['files'][0]['download_url']

archive_response = requests.get(grid_download_url)
archive_response.raise_for_status()

# The archive is held in memory; only types.csv is read out of it.
with zf.ZipFile(io.BytesIO(archive_response.content)) as thezip:
    # Open the member in its own context so the handle is closed after parsing
    with thezip.open('full_tables/types.csv') as types_file:
        grid_types = pd.read_csv(types_file)

In [None]:
# Preview the first 40 rows of the GRID types table loaded above
grid_types.head(40)

In [None]:
# Keep only the rows whose GRID type is exactly 'Company'
company_grids = grid_types[grid_types['type'].eq('Company')]

In [None]:
# Preview the first rows of the company-only subset
company_grids.head()

In [None]:
# Distinct GRID IDs of all company-type institutions, as a plain list
unique_company_ids = company_grids['grid_id'].unique()
grids = list(unique_company_ids)
print(len(grids))

In [None]:
# GRID ID of the institution whose industry collaborations are analysed.
# publicationsfromgrid() reads this notebook-level variable when building its query.
institution = 'grid.147455.6'

In [None]:
def publicationsfromgrid(grids,limit=1000,skip=0,research_org=None):
    """Build the DSL query for publications shared with a set of organisations.

    The query matches publications that list `research_org` AND at least one
    of the organisations in `grids` among their research organisations.

    Args:
        grids: iterable of GRID IDs to match (the collaborating organisations).
        limit: maximum number of records the query asks for.
        skip: number of records to skip (used for paging).
        research_org: GRID ID of the institution of interest. When omitted,
            the notebook-level variable `institution` is used, which is what
            the function did before this parameter existed.

    Returns:
        The query text. It is also printed so the reader can see what is sent.
    """
    if research_org is None:
        # Backward-compatible default: fall back to the notebook global
        research_org = institution

    # Each ID must be double-quoted inside the DSL list literal
    quoted_ids = ",".join('"{}"'.format(g) for g in grids)

    searchstring = """
    search publications
       where
        research_orgs.id = "{}"
        and research_orgs.id in [{}]
        and year > "2007"
    return publications[id+doi+times_cited+year+author_affiliations] limit {} skip {}
    """.format(research_org,quoted_ids,limit,skip)
    print (searchstring)
    return searchstring

In [None]:
def dslsearchpublications(grids, limit=1000):
    """Fetch every page of publications for one set of GRID IDs.

    Runs the query built by publicationsfromgrid() repeatedly, advancing
    `skip` by one page each time, until a page comes back with fewer than
    `limit` records.

    Args:
        grids: GRID IDs passed through to publicationsfromgrid().
        limit: page size requested per API call (default 1000, the value
            previously hard-coded here).

    Returns:
        A list with the publication records of all pages, in the order received.

    Raises:
        ValueError: if `limit` is smaller than 1 (paging could never terminate).
    """
    if limit < 1:
        raise ValueError("limit must be at least 1")

    skip = 0
    total_pubs = []
    while True:
        query = publicationsfromgrid(grids, limit=limit, skip=skip)
        # A response without a 'publications' key is treated as an empty page
        pubs = dslquery(query).get('publications', [])
        total_pubs += pubs
        if len(pubs) < limit:
            # Short (or empty) page: nothing left to fetch
            break
        skip += limit

    return total_pubs

In [None]:
def publicationsfromgridlist(grids, chunk_size=499):
    """Search publications for a long list of GRID IDs, one chunk at a time.

    The ID list is split into slices of `chunk_size` and each slice is
    searched with dslsearchpublications(). A publication affiliated with
    organisations that fall into different slices is returned by more than
    one search, so records are de-duplicated on their 'id' before being
    returned; otherwise counts and citation sums would be inflated.

    Args:
        grids: list of GRID IDs (must support slicing and len()).
        chunk_size: number of IDs per query (default 499, the value
            previously hard-coded here).
            # NOTE(review): presumably chosen to stay under an API limit on
            # the size of an `in [...]` filter — confirm.

    Returns:
        A list of unique publication records, in first-seen order.
    """
    pubs = []
    seen_ids = set()

    for start in range(0, len(grids), chunk_size):
        ids = grids[start:start + chunk_size]
        for pub in dslsearchpublications(ids):
            pub_id = pub.get('id')
            if pub_id is not None:
                if pub_id in seen_ids:
                    # Already collected from an earlier chunk
                    continue
                seen_ids.add(pub_id)
            pubs.append(pub)
        # Running total of unique publications, shown as progress
        print(len(pubs))

    return pubs

In [None]:
# Run the chunked search over all company GRID IDs collected in `grids`
industry_pubs = publicationsfromgridlist(grids)

In [None]:
# Number of publication records returned by the search
len(industry_pubs)

# Citations from Industry Collaboration

In [None]:
# Total citations per publication year.
# Only `year` and `times_cited` are selected before summing: the records also
# hold text and list columns (id, doi, author_affiliations), and summing the
# whole frame only worked while pandas silently dropped non-numeric columns.
pd.DataFrame(industry_pubs)[['year', 'times_cited']].groupby(['year']).sum().plot(kind='bar')

In [None]:
# Publications per year, counted through their non-null DOI values
(
    pd.DataFrame(industry_pubs)[['doi', 'year']]
    .groupby(['year'])
    .count()
    .plot(kind='bar')
)

In [None]:
# Flatten the publications into one record per (publication, company affiliation).

# Membership is tested once per affiliation; a set avoids rescanning the
# `grids` list every time.
company_grid_ids = set(grids)

industry_aff = [
    dict(
        year=p['year'],
        pubid=p['id'],
        grid=aff['id'],
        # .get() so an affiliation without a name yields None instead of a KeyError
        inst=aff.get('name'),
    )
    for p in industry_pubs
    # The author list is the first element of author_affiliations; use an
    # empty author list when the field is missing, None or an empty list
    # (plain [0] indexing raised IndexError on an empty list).
    for auth in (p.get('author_affiliations') or [[]])[0]
    for aff in auth.get('affiliations', [])
    if aff.get('id', '') in company_grid_ids
]



In [None]:
# One row per distinct (year, publication, company) combination:
# several authors from the same company collapse into a single row.
idf = (
    pd.DataFrame(industry_aff)
    .drop_duplicates()
)

In [None]:
# Rank the collaborating companies by their number of rows in idf
(
    idf[['inst', 'grid', 'pubid']]
    .groupby(['inst', 'grid'])
    .count()
    .reset_index()
    .sort_values(by='pubid', ascending=False)
)

---
# Want to learn more?

Check out the [Dimensions API Lab](https://digital-science.github.io/dimensions-api-lab/) website, which contains many tutorials and reusable Jupyter notebooks for scholarly data analytics. 