# Metric Scoping

Reference data from [here](https://data.europa.eu/euodp/data/dataset/cordisref-data).

- [ ] Number of citations
- [ ] Types of publication outputs
- [ ] Access rights
- [ ] Did another collaboration happen afterwards?


- [ ] Public private balance
- [ ] Country diversity
- [ ] Org diversity

## Preamble

In [None]:
%run notebook_preamble.ipy

# Use the fully qualified option key: the bare pattern 'max_columns' can match
# more than one registered pandas option, which makes set_option raise.
pd.set_option('display.max_columns', 99)

In [None]:
import seaborn as sns
from itertools import chain
from collections import Counter, defaultdict
import pickle
import ast
from datetime import datetime
from skbio.diversity.alpha import shannon

from eu_funding.visualization.visualize import pdf_cdf
import seaborn as sns

sns.set_context('notebook')
# from src.visualization.visualize import pdf_cdf

## Data

### Projects

Each funding programme has a projects dataset, which contains research projects funded by EC programmes. The fields are:
- `rcn`
- `id`
- `acronym`
- `status`
- `programme`
- `topics`
- `frameworkProgramme`
- `title`
- `startDate`
- `endDate`
- `projectUrl`
- `objective`
- `totalCost`
- `ecMaxContribution`
- `call`
- `fundingScheme`
- `coordinator`
- `coordinatorCountry`
- `participants`
- `participantCountries`
- `subjects`

### H2020

In [None]:
h2020_projects_df = pd.read_csv(
    os.path.join(cordis_h2020_path, 'cordis-h2020projects.csv'),
    sep=';',
    encoding='iso-8859-1',
    parse_dates=['startDate', 'endDate'],
    infer_datetime_format=True,
    decimal=','
)
h2020_orgs_df = pd.read_csv(
    os.path.join(cordis_h2020_path, 'cordis-h2020organizations.csv'),
    sep=';',
    encoding='iso-8859-1',
    decimal=',',
)
h2020_reports_df = pd.read_csv(
    os.path.join(cordis_h2020_path, 'cordis-h2020reports.csv'),
)
h2020_pi_df = pd.read_excel(
    os.path.join(cordis_h2020_path, 'cordis-h2020-erc-pi.xlsx'),
    sheet_name='PI'
)
h2020_fellows_r1_df = pd.read_excel(
    os.path.join(cordis_h2020_path, 'cordis-h2020-msca-fellows.xls'),
    sheet_name='Report 1',
    header=3
).set_index('Project Number')
h2020_fellows_r2_df = pd.read_excel(
    os.path.join(cordis_h2020_path, 'cordis-h2020-msca-fellows.xls'),
    sheet_name='Report 2'
).set_index('projectId')
h2020_fellows_df = h2020_fellows_r1_df.join(h2020_fellows_r2_df, how='outer').reset_index().rename(columns={'index': 'projectId'})

In [None]:
h2020_projects_df.head(1)

In [None]:
h2020_projects_df.shape

In [None]:
h2020_reports_df.head(1)

In [None]:
h2020_reports_df.shape

In [None]:
h2020_orgs_df.head(1)

In [None]:
h2020_orgs_df.shape

In [None]:
h2020_pi_df.head(1)

In [None]:
h2020_fellows_df.head(1)

### FP7

In [None]:
fp7_projects_df = pd.read_csv(
    os.path.join(cordis_fp7_path, 'cordis-fp7projects.csv'),
    sep=';',
    encoding='iso-8859-1',
    parse_dates=['startDate', 'endDate'],
    infer_datetime_format=True,
    decimal=','
)
fp7_orgs_df = pd.read_csv(
    os.path.join(cordis_fp7_path, 'cordis-fp7organizations.csv'),
    sep=';',
    encoding='iso-8859-1',
    decimal=',',
)
fp7_reports_df = pd.read_csv(
    os.path.join(cordis_fp7_path, 'cordis-fp7reports.csv'),
).drop('Unnamed: 16', axis=1)

In [None]:
fp7_projects_df.head(1)

In [None]:
fp7_orgs_df.head(1)

In [None]:
fp7_projects_df[fp7_projects_df['id'] == 240271]

### Combining Programme Datasets

In [None]:
cordis_projects_df = pd.concat([h2020_projects_df, fp7_projects_df])
cordis_reports_df = pd.concat([h2020_reports_df, fp7_reports_df])
cordis_orgs_df = pd.concat([h2020_orgs_df, fp7_orgs_df])

In [None]:
def all_particpants(coordinator, participants):
    """Merge a coordinator into a ';'-separated participants string.

    Parameters
    ----------
    coordinator : str or null
        Single entry (e.g. an organisation name or a country code).
    participants : str or null
        ';'-separated entries.

    Returns
    -------
    str or null
        ``participants`` with ``coordinator`` appended when it is not already
        one of the entries. If either argument is null the other one is
        returned unchanged (so the result is null only when both are null).
    """
    if pd.isnull(participants):
        return coordinator
    if pd.isnull(coordinator):
        # Nothing to add; also avoids `nan in str`, which raises TypeError.
        return participants
    # Compare against whole entries rather than substrings, otherwise a
    # coordinator such as "CNR" would be treated as present in "CNRS;...".
    listed = {entry.strip() for entry in participants.split(';')}
    if coordinator.strip() in listed:
        return participants
    return participants + ';' + coordinator

cordis_projects_df['allCountries'] = cordis_projects_df.apply(
    lambda x: all_particpants(x['coordinatorCountry'], x['participantCountries']),
    axis=1
).fillna('NA')
cordis_projects_df['countries_count'] = [len(s.split(';')) for s in cordis_projects_df['allCountries']]

cordis_projects_df['allParticipants'] = cordis_projects_df.apply(
    lambda x: all_particpants(x['coordinator'], x['participants']),
    axis=1
)
cordis_projects_df['participant_count'] = [len(s.split(';')) for s in cordis_projects_df['allParticipants']]

### Reference Data

#### Activity Types

In [None]:
cordis_activity_ref_df = pd.read_csv(
    os.path.join(cordis_ref_path, 'cordisref-organizationActivityType.csv'),
    sep=';'
)

In [None]:
cordis_activity_ref_df

#### Countries

In [None]:
cordis_countries_ref_df = pd.read_csv(
    os.path.join(cordis_ref_path, 'cordisref-countries.csv'),
    sep=';'
)

In [None]:
cordis_countries_ref_df.head(10)

#### Topics

In [None]:
cordis_topics_ref_df = pd.read_csv(
    os.path.join(cordis_ref_path, 'cordisref-H2020topics.csv'),
    sep=';'
)

In [None]:
cordis_topics_ref_df.head()

#### H2020 Programmes

In [None]:
cordis_h2020_programmes_ref_df = pd.read_csv(
    os.path.join(cordis_ref_path, 'cordisref-H2020programmes.csv'),
#     sep=';'
)

In [None]:
cordis_h2020_programmes_ref_df.head()

#### Funding Schemes

In [None]:
cordis_funding_schemes_ref_df = pd.read_csv(
    os.path.join(cordis_ref_path, 'cordisref-projectFundingSchemeCategory.csv'),
    sep=';'
)

In [None]:
cordis_funding_schemes_ref_df.head()

#### SIC Codes

In [None]:
cordis_sic_codes_ref_df = pd.read_csv(
    os.path.join(cordis_ref_path, 'cordisref-sicCode.csv'),
    sep=';'
)

In [None]:
cordis_sic_codes_ref_df[cordis_sic_codes_ref_df['language'] == 'en'].head()

### OpenAIRE

In [None]:
list_cols = ['ec_project_codes', 'categories', 'children',
             'field_names', 'institutes', 'reference', 'authors_parsed', 
            'container-title'
            ]

openaire_publications_df = pd.read_csv(
    os.path.join(inter_data_path, 'openaire_publications_20192302.csv'),
    converters={k: ast.literal_eval for k in list_cols},
#     chunksize=5000
)


In [None]:
openaire_publications_df.head()

In [None]:
openaire_publications_df['pub_id'] = openaire_publications_df.index

In [None]:
openaire_publications_df['n_projects'] = [len(s) for s in openaire_publications_df['ec_project_codes']]
openaire_publications_df['project_contribution'] = 1 / openaire_publications_df['n_projects']

In [None]:
def parse_date_parts(date_dict):
    """Convert a ``{'date-parts': [[year, month, day]]}`` dict to a datetime.

    Missing month and/or day components default to 1.

    Parameters
    ----------
    date_dict : dict or any
        Mapping with a ``'date-parts'`` key holding a list whose first element
        is the list of date components. Anything that is not a dict (e.g. NaN)
        is treated as missing.

    Returns
    -------
    datetime.datetime or None
        ``None`` when the input is missing, has no ``'date-parts'``, the parts
        list is empty, or the year is ``None``.
    """
    if not isinstance(date_dict, dict):
        return None
    date_parts = date_dict.get('date-parts')
    if not date_parts or not date_parts[0] or date_parts[0][0] is None:
        return None
    # Copy before padding so the caller's nested list is left untouched.
    parts = list(date_parts[0])
    parts.extend([1] * (3 - len(parts)))
    return datetime(*parts)

In [None]:
openaire_publications_df['date'] = pd.to_datetime(openaire_publications_df['date'])
openaire_publications_df['published-online'].fillna({}, inplace=True)
openaire_publications_df['published-print'].fillna({}, inplace=True)
openaire_publications_df['created'].fillna({}, inplace=True)

openaire_publications_df['published-online'] = [ast.literal_eval(a) if type(a) == str else np.nan 
                        for a in openaire_publications_df['published-online']]
openaire_publications_df['published_online_dt'] = openaire_publications_df['published-online'].apply(lambda x: parse_date_parts(x))
openaire_publications_df['published-print'] = [ast.literal_eval(a) if type(a) == str else np.nan 
                       for a in openaire_publications_df['published-print']]
openaire_publications_df['published_print_dt'] = openaire_publications_df['published-print'].apply(lambda x: parse_date_parts(x))
openaire_publications_df['created'] = [ast.literal_eval(a) if type(a) == str else np.nan 
               for a in openaire_publications_df['created']]
openaire_publications_df['created_dt'] = openaire_publications_df['created'].apply(lambda x: parse_date_parts(x))

openaire_publications_df['date_all'] = openaire_publications_df['date'].fillna(
    openaire_publications_df['published_online_dt']
).fillna(openaire_publications_df['published_print_dt']).fillna(
    openaire_publications_df['created_dt']
)

collected_date = datetime(2019, 2, 1)
openaire_publications_df['age'] = collected_date - openaire_publications_df['date_all']
openaire_publications_df['age_years'] = openaire_publications_df['age'].dt.days / 365.25

openaire_publications_df['authors_list'] = openaire_publications_df['authors'].str.replace('; ;',';').str.split(';')
openaire_publications_df['n_authors_oa'] = [len(a) if type(a) == list else np.nan for a in openaire_publications_df['authors_list']]
openaire_publications_df['n_authors_mag'] = [len(a) if len(a) > 0 else np.nan for a in openaire_publications_df['authors_parsed']]
openaire_publications_df['n_authors'] = openaire_publications_df['n_authors_oa'].fillna(openaire_publications_df['n_authors_mag'])
openaire_publications_df.drop(['n_authors_oa', 'n_authors_mag'], axis=1, inplace=True)

In [None]:
def clean_authors(authors):
    """Drop placeholder entries from an author list and normalise the names.

    Each name is stripped of surrounding whitespace (the list comes from
    splitting on ';', which leaves padding behind), placeholder entries such
    as ``'et al.'`` are removed case-insensitively, and full stops are removed
    from the remaining names.

    Parameters
    ----------
    authors : list of str or any
        Author names; anything that is not a list is treated as missing.

    Returns
    -------
    list of str or float
        The cleaned names, or ``np.nan`` when the input is not a list or no
        real author is left.
    """
    if not isinstance(authors, list):
        return np.nan
    placeholders = {'', 'et al.', '#n/d', 'et al'}
    cleaned = []
    for author in authors:
        name = author.strip()
        if name.lower() in placeholders:
            continue
        cleaned.append(name.replace('.', ''))
    return cleaned if cleaned else np.nan
          
openaire_publications_df['authors_list'] = [clean_authors(a) for a in openaire_publications_df['authors_list']]

### OpenAIRE Abstracts

In [None]:
openaire_abstracts_df = pd.read_csv(os.path.join(inter_data_path, 'openaire_publication_abstracts_20192302.csv'))

In [None]:
openaire_abstracts_df.head()

In [None]:
openaire_publications_df = openaire_publications_df.merge(
    openaire_abstracts_df,
    left_on='index',
    right_on='index',
    how='left',
    suffixes=('', 'abs')
)

In [None]:
openaire_publications_df.drop('abstractabs', axis=1, inplace=True)

#### Labelling by subject

In [None]:
from eu_funding.utils.nlp_utils import remove_markup, normalise_digits, lemmatize, bigram, stringify_docs
import spacy
from gensim.models.phrases import Phraser, Phrases
from sklearn.externals import joblib

nlp = spacy.load('en')
nlp.remove_pipe('parser')
nlp.remove_pipe('ner')

vocab = spacy.vocab.Vocab().from_disk(os.path.join(model_path, 'gtr_discipline_vocab'))
nlp.vocab = vocab
bigrammer = Phraser(Phrases())
bigrammer = bigrammer.load(os.path.join(model_path, 'gtr_discipline_bigrammer.pkl'))

subject_clf_pipe = joblib.load(os.path.join(model_path, 'gtr_discipline_lvl9_lr_20190222.pkl'))

##### OpenAIRE

In [None]:
# Fill BOTH parts before concatenating: str + NaN is NaN, so a missing title
# would otherwise discard an abstract that is present.
title_abstract = (openaire_publications_df['abstract'].fillna('') + ' '
                  + openaire_publications_df['title_mag'].fillna(''))
# Only rows where neither abstract nor title carries text get the placeholder
# (it is used downstream to blank out the predicted subject).
openaire_publications_df['title_abstract'] = title_abstract.mask(
    title_abstract.str.strip() == '', 'title and abstract missing'
)

In [None]:
abstracts = [remove_markup(a) for a in openaire_publications_df['title_abstract']]
abstracts = [normalise_digits(a) for a in abstracts]
abstracts = lemmatize(abstracts, nlp)
abstracts = bigram(abstracts, phraser=bigrammer)
abstracts_str = list(stringify_docs(abstracts))

In [None]:
openaire_publications_df['subject_clf'] = subject_clf_pipe.predict(abstracts_str)
# Blank the prediction for rows that only carry the placeholder text. A single
# .loc assignment is used because chained indexing (df[col][mask] = ...) may
# write to a temporary copy instead of the frame.
openaire_publications_df.loc[
    openaire_publications_df['title_abstract'] == 'title and abstract missing',
    'subject_clf'
] = np.nan

##### CORDIS

In [None]:
cordis_abstracts = [remove_markup(a) for a in cordis_projects_df['objective']]
cordis_abstracts = [normalise_digits(a) for a in cordis_abstracts]
cordis_abstracts = lemmatize(cordis_abstracts, nlp)
cordis_abstracts = bigram(cordis_abstracts, phraser=bigrammer)
cordis_abstracts_str = list(stringify_docs(cordis_abstracts))

In [None]:
cordis_projects_df['subject_clf'] = subject_clf_pipe.predict(cordis_abstracts_str)

#### Expand OpenAIRE Publications

In [None]:
expanded_records = []
for record in openaire_publications_df.to_dict(orient='records'):
    ec_project_codes = record.pop('ec_project_codes')
    for ec_code in ec_project_codes:
        new_record = record.copy()
        new_record['ec_project_code'] = int(ec_code)
        expanded_records.append(new_record)

In [None]:
oa_pubs_df = pd.DataFrame().from_records(expanded_records)

### OpenAIRE Children

In [None]:
# Flatten the 'children' of every expanded publication row into one record per
# (publication row, child), tagged with the row's project code, the
# publication 'index' and the row label of oa_pubs_df ('oai').
child_records = []
for i, record in zip(oa_pubs_df.index, oa_pubs_df.to_dict(orient='records')):
    for child in record['children']:
        # Build a NEW dict per row: the expanded rows of one publication share
        # the same child dict objects (the expansion copies records shallowly),
        # so mutating them in place would overwrite the project code / oai of
        # records that were appended earlier.
        child_records.append({
            **child,
            'ec_project_code': record['ec_project_code'],
            'index': record['index'],
            'oai': i,
        })

oa_child_df = pd.DataFrame.from_records(child_records)

In [None]:
oa_child_df.drop_duplicates(['name', 'ec_project_code', 'collectedfrom', 'dateofacceptance'], inplace=True)

In [None]:
oa_child_access = pd.get_dummies(oa_child_df['access'])
oa_child_pubtype = pd.get_dummies(oa_child_df['type'])

### Citations

#### Paper Ages

In [None]:
openaire_publications_df['age_years_round'] = np.round(openaire_publications_df['age_years'])
openaire_publications_df = openaire_publications_df[openaire_publications_df['age_years_round'] < 12]

In [None]:
# Floor non-positive ages at 0.1 years so that age can be used as a divisor
# (negative ages were previously set to 0 and then 0 to 0.1; this is the same
# result in one step). The column is assigned as a whole because chained
# indexing (df[col][mask] = ...) may write to a temporary copy.
openaire_publications_df['age_years'] = openaire_publications_df['age_years'].mask(
    openaire_publications_df['age_years'] <= 0, 0.1
)

In [None]:
plt.plot(openaire_publications_df.groupby('age_years_round')['citations'].mean())

#### N Authors

In [None]:
fig, ax = plt.subplots()
ax.plot(openaire_publications_df.groupby('n_authors')['citations'].median())
ax.set_ylim((0, 200))
ax.set_xlim((1, 100))

#### Subject Normalised Citations

In [None]:
openaire_publications_df.groupby('subject_clf')['citations'].median()

In [None]:
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(
    openaire_publications_df['citations'],
    openaire_publications_df['subject_clf'], 
    ax=ax, color='C0', orient="h",
    flierprops={'alpha': 0.05}
)
ax.set_xscale('log')
# ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right')
ax.set_xlim((1, 11000))
ax.set_ylabel('Subject')
ax.set_xlabel('N Citations')
plt.tight_layout()
plt.show()

In [None]:
openaire_publications_df['citations_normed'] = np.log10((openaire_publications_df['citations'] + 1) / 
                          (openaire_publications_df['n_authors'] * openaire_publications_df['age_years']))

# openaire_publications_df['citations_normed'] = np.log10((openaire_publications_df['citations'] + 1) / 
#                           (openaire_publications_df['age_years']))


citations_normed_mean = openaire_publications_df.groupby('subject_clf')['citations_normed'].transform(np.median)
openaire_publications_df['citations_normed'] = (
    (openaire_publications_df['citations_normed'] / citations_normed_mean) - 1
)

fig, axs = plt.subplots(nrows=2, figsize=(10, 5), gridspec_kw={'height_ratios': [.2, .8]})

axs[0].hist(openaire_publications_df['citations_normed'], bins=100, color='C0')

sns.boxplot(
    openaire_publications_df['citations_normed'],
    openaire_publications_df['subject_clf'], 
    ax=axs[1], color='C0', orient="h",
    flierprops={'alpha': 0.05},
)

# axs[0].set_ylabel('Freqency')
axs[1].set_xlabel('Normalised Citations')
# The horizontal boxplot has one row per subject_clf value on the y-axis.
axs[1].set_ylabel('Subject')
plt.tight_layout()
plt.show()

In [None]:
oa_pubs_df['pub_year'] = oa_pubs_df['date_all'].dt.year

In [None]:
oa_pubs_df['citations_year_normed'] = (
    oa_pubs_df['citations'] / 
    oa_pubs_df.groupby(['pub_year'])['citations'].transform(np.mean)
)

In [None]:
# oa_pubs_df is expanded from the publications frame before its ages are
# floored, so floor non-positive ages at 0.1 years here as well; otherwise the
# ratio below is infinite or negative and log10 returns inf / NaN.
oa_age_years = oa_pubs_df['age_years'].mask(oa_pubs_df['age_years'] <= 0, 0.1)
oa_pubs_df['citations_normed'] = np.log10((oa_pubs_df['citations_year_normed'] + 1) / 
                          (oa_pubs_df['n_authors'] * oa_age_years))

# oa_pubs_df['citations_normed'] = np.log10((oa_pubs_df['citations'] + 1) / 
#                           (oa_pubs_df['age_years']))


citations_normed_mean = oa_pubs_df.groupby('subject_clf')['citations_normed'].transform(np.median)
oa_pubs_df['citations_normed'] = (
    (oa_pubs_df['citations_normed'] / citations_normed_mean) - 1
)

In [None]:
cordis_projects_df = cordis_projects_df.merge(
    oa_pubs_df.groupby('ec_project_code')[['citations_year_normed', 'age_years', 'n_authors']].sum().reset_index(),
    left_on='id',
    right_on='ec_project_code',
    how='left',
    suffixes=('', '_sum')
)

In [None]:
cordis_projects_df.head()

In [None]:
# Per-project normalised citations. The denominator must come from the
# per-project sums merged into cordis_projects_df ('n_authors', 'age_years');
# using the publication-level frame would align rows by an unrelated index.
# NOTE(review): 'project_contribution_mean' is not merged into
# cordis_projects_df in any cell visible before this one - confirm the
# project_publication_means merge has been run first.
cordis_projects_df['citations_normed'] = (
    (cordis_projects_df['citations_year_normed'] + 1)
    * cordis_projects_df['project_contribution_mean']
    / (cordis_projects_df['n_authors'] * cordis_projects_df['age_years'])
)

In [None]:
cordis_projects_df['citations_normed'] = np.log10(
    cordis_projects_df['citations_normed']
    / cordis_projects_df.groupby('subject_clf')['citations_normed'].transform(np.median)
)

In [None]:
fig, axs = plt.subplots(nrows=2, figsize=(10, 5), gridspec_kw={'height_ratios': [.2, .8]})

axs[0].hist(cordis_projects_df['citations_normed'], bins=100, color='C0')

sns.boxplot(
    cordis_projects_df['citations_normed'],
    cordis_projects_df['subject_clf'], 
    ax=axs[1], color='C0', orient="h",
#     flierprops={'alpha': 0.05},
)

# axs[0].set_ylabel('Freqency')
axs[1].set_xlabel('Normalised Citations')
# The horizontal boxplot has one row per subject_clf value on the y-axis.
axs[1].set_ylabel('Subject')
plt.tight_layout()
plt.show()

In [None]:
plt.hist(cordis_projects_df['citations_normed'], bins=100)
plt.show()

#### N Publications

#### Publication Type

#### Effect of organisation type

In [None]:
cordis_org_types_ohe = pd.get_dummies(cordis_orgs_df['activityType'])
cordis_orgs_df = cordis_orgs_df.join(cordis_org_types_ohe)

org_types = ['HES', 'OTH', 'PRC', 'PUB', 'REC']

In [None]:
cordis_project_org_counts = cordis_orgs_df.groupby('projectID')[org_types].sum().reset_index()
cordis_project_has_org = cordis_project_org_counts[org_types] > 0
cordis_project_has_org.rename(columns={k: f'has_{k}' for k in cordis_project_has_org.columns}, inplace=True)
cordis_project_org_counts = cordis_project_org_counts.join(cordis_project_has_org)

In [None]:
cordis_projects_df = cordis_projects_df.merge(
    cordis_project_org_counts, 
    left_on='id', right_on='projectID', how='left', suffixes=('', '_orgs')
)
cordis_projects_df.drop('projectID', axis=1, inplace=True)

In [None]:
oa_n_pubs_per_project = oa_pubs_df.groupby(
    'ec_project_code')['index'].count().reset_index().rename(columns={'index': 'n_publications'})
cordis_projects_df = cordis_projects_df.merge(
    oa_n_pubs_per_project, 
    left_on='id', right_on='ec_project_code', how='left', suffixes=('', '_orgs')
)
cordis_projects_df.drop('ec_project_code', axis=1, inplace=True)

cordis_projects_df['has_publications'] = cordis_projects_df['n_publications'] > 0

In [None]:
# Share of projects with at least one publication, among the projects that
# include each organisation type.
has_publications = []
for org_type in org_types:
    # Select the flagged projects with an explicit boolean mask instead of
    # indexing the grouped means with the integer 1, which only works through
    # 1 == True label matching / positional fallback. Projects with no
    # organisation data (NaN after the left merge) count as not flagged.
    has_type = cordis_projects_df[f'has_{org_type}'].fillna(False).astype(bool)
    has_publications.append(cordis_projects_df.loc[has_type, 'has_publications'].mean())

fig, ax = plt.subplots()
ax.bar(org_types, has_publications)
ax.set_xlabel('Project Has Institution Type')
ax.set_ylabel('Publication Coverage')
plt.show()

In [None]:
# Per-project totals of citations and fractional publication contributions.
# Several columns must be selected with a list; tuple-style selection on a
# GroupBy (['a', 'b']) is no longer supported by pandas.
project_publication_sums = oa_pubs_df.groupby(
    'ec_project_code')[['citations', 'project_contribution']].sum()
project_publication_sums.columns = [f'{c}_sum' for c in project_publication_sums.columns]
project_publication_sums.reset_index(inplace=True)

In [None]:
# Per-project means of citations and fractional publication contributions.
# Several columns must be selected with a list; tuple-style selection on a
# GroupBy (['a', 'b']) is no longer supported by pandas.
project_publication_means = oa_pubs_df.groupby(
    'ec_project_code')[['citations', 'project_contribution']].mean()
project_publication_means.columns = [f'{c}_mean' for c in project_publication_means.columns]
project_publication_means.reset_index(inplace=True)

In [None]:
((cordis_projects_df['has_HES'] & cordis_projects_df['has_PRC']).sum() / 
(cordis_projects_df['has_HES'] + cordis_projects_df['has_PRC']).sum())

In [None]:
cordis_projects_df[org_types].T.dot(cordis_projects_df[org_types])

## Interim Data Export

In [None]:
cordis_projects_df.columns

In [None]:
cordis_projects_df[['id', 'subject_clf', 'HES', 'OTH',
       'PRC', 'PUB', 'REC', 'has_HES', 'has_OTH', 'has_PRC', 'has_PUB',
       'has_REC', 'n_publications', 'has_publications', 'citations_mean',
       'project_contribution_mean', 'citations_sum',
       'project_contribution_sum']].to_csv(
    os.path.join(inter_data_path, 'cordis_projects_enrichments_20190226.csv'),
    index=False
)

In [None]:
# Export the publication-level enrichments. Each column is listed once: a
# repeated label ('age' was listed twice) writes a duplicated column to the CSV.
openaire_publications_df[['index', 'subject_clf', 'age', 'age_years', 'authors_list', 'n_authors', 'date_all',
                          'age_years_round', 'citations_normed', 'title_abstract']].to_csv(
    os.path.join(inter_data_path, 'openaire_publications_enrichment_20190226.csv'),
    index=False,
    chunksize=5000
)

### Linking Fields

- Make cooccurrence matrix for each year
- For each year, make cooccurrence matrix cumulative
- For each paper, query the year and get the added knowledge (paper + year) / year

In [None]:
openaire_publications_df['publication_year'] = openaire_publications_df['date_all'].dt.year

In [None]:
import networkx as nx

In [None]:
from gensim.corpora import Dictionary
from itertools import combinations

In [None]:
field_dictionary = Dictionary(openaire_publications_df['field_names'])

In [None]:
openaire_publications_df['field_ids'] = [
    field_dictionary.doc2idx(fn) for fn in openaire_publications_df['field_names']
]

In [None]:
from networkx import adjacency_matrix

In [None]:
years = openaire_publications_df['publication_year'].sort_values().unique()

In [None]:
# One undirected co-occurrence graph of field ids per publication year; the
# edge weight is the number of papers of that year containing both fields.
co_graphs = {}

# groupby always yields a Series of field-id lists per year (even when a year
# has a single paper) and skips rows without a publication year.
for year, fields in openaire_publications_df.groupby('publication_year')['field_ids']:
    g_year = nx.Graph()
    # Add every field as a node first so all yearly graphs share the same node
    # order, and therefore the same adjacency-matrix layout.
    g_year.add_nodes_from(field_dictionary.iterkeys())
    # Sort each pair: the graph is undirected, so (a, b) and (b, a) must be
    # counted together, otherwise the later add_edge call overwrites the
    # weight written by the earlier one.
    field_counter = Counter(
        tuple(sorted(pair))
        for paper_fields in fields
        for pair in combinations(paper_fields, 2)
    )
    for (source, target), count in field_counter.items():
        g_year.add_edge(source, target, weight=count)
    co_graphs[year] = g_year

# Keep a fresh 0..n-1 row index, as the later cells expect.
openaire_publications_df.reset_index(drop=True, inplace=True)

In [None]:
# Running sum of the yearly adjacency matrices: the entry for year Y holds the
# co-occurrence counts of all years up to and including Y.
cumulative_adjacency_matrices = {}
running_adj = None
# Accumulate over the sorted years instead of looking up `year - 1`, which
# would silently restart the sum after any year without a graph.
for year in sorted(co_graphs):
    adj = adjacency_matrix(co_graphs[year])
    running_adj = adj if running_adj is None else running_adj + adj
    cumulative_adjacency_matrices[year] = running_adj

In [None]:
cumulative_adjacency_matrices[2013][900, 900]

In [None]:
# For every paper, score each pair of its fields against the cumulative
# co-occurrence matrix of the previous year and average the scores.
end = None  # set to an int to process only the first `end` papers
knowledge_combination_enhancement = []

for year, fields in zip(openaire_publications_df['publication_year'][:end], openaire_publications_df['field_ids'][:end]):
    # A score needs last year's matrix and at least one pair of fields
    # (a single field has no pairs, so the mean would be taken over nothing).
    if (year - 1 in cumulative_adjacency_matrices) and (len(fields) > 1):
        adj_matrix = cumulative_adjacency_matrices[year - 1]
        added_contributions = []
        for field_a, field_b in combinations(fields, 2):
            previous_link_weight = adj_matrix[field_a, field_b]
            if previous_link_weight == 0:
                # NOTE(review): a brand-new link scores 1, which is lower than
                # any existing link ((w + 1) / w > 1) - confirm this is intended.
                added_contributions.append(1)
            else:
                new_link_weight = previous_link_weight + 1
                added_contributions.append(new_link_weight / previous_link_weight)
        knowledge_combination_enhancement.append(np.mean(added_contributions))
    else:
        knowledge_combination_enhancement.append(np.nan)

In [None]:
openaire_publications_df['k_factor'] = knowledge_combination_enhancement

In [None]:
openaire_publications_df['k_factor_yearnorm'] = (
    openaire_publications_df['k_factor'] / 
    openaire_publications_df.groupby('publication_year')['k_factor'].transform(np.mean))

In [None]:
openaire_publications_df['k_factor_subj_year_norm'] = (
    openaire_publications_df['k_factor'] / 
    openaire_publications_df.groupby(['publication_year', 'subject_clf'])['k_factor'].transform(np.median))

In [None]:
for title in openaire_publications_df[['k_factor_subj_year_norm', 'title_mag', 'field_names']].sort_values(
    'k_factor_subj_year_norm', ascending=True)['title_mag'].iloc[:20]:
    print(title)

In [None]:
fig, ax = plt.subplots()
ax.hexbin(
    openaire_publications_df['citations_normed'],
    openaire_publications_df['k_factor_subj_year_norm'],
#     alpha=0.05,
    bins='log'
)
ax.set_xlabel('Normalised Citations')
ax.set_ylabel('Normalised K Factor')

plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(
    openaire_publications_df['k_factor_yearnorm'],
    openaire_publications_df['subject_clf'], 
    ax=ax, color='C0', orient="h",
    flierprops={'alpha': 0.05}
)
# ax.set_xscale('log')
# ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right')
# ax.set_xlim((1, 11000))
ax.set_xlabel('K Factor')
ax.set_ylabel('Subject')
plt.tight_layout()
plt.show()

In [None]:
fig, axs = plt.subplots(nrows=2, figsize=(10, 5), gridspec_kw={'height_ratios': [.2, .8]})

axs[0].hist(openaire_publications_df['k_factor_subj_year_norm'], bins=100, color='C0')

sns.boxplot(
    openaire_publications_df['k_factor_subj_year_norm'],
    openaire_publications_df['subject_clf'], 
    ax=axs[1], color='C0', orient="h",
    flierprops={'alpha': 0.05},
)

# axs[0].set_ylabel('Freqency')
axs[1].set_xlabel('Normalised K Factor')
axs[1].set_ylabel('Subject')
plt.tight_layout()
plt.show()

### Similarity

In [None]:
cordis_projects_df['objective'][:5]

In [None]:
cordis_projects_df.head()

In [None]:
cordis_projects_df.reset_index(inplace=True)
cordis_projects_df.rename(columns={'index': 'index_'}, inplace=True)
cordis_projects_df['index_'] = cordis_projects_df['index_'].astype(str) + '_proj'

In [None]:
cordis_reports_df.reset_index(inplace=True)
cordis_reports_df.rename(columns={'index': 'index_'}, inplace=True)
cordis_reports_df['index_'] = cordis_reports_df['index_'].astype(str) + '_report'

In [None]:
openaire_publications_df['index_'] = openaire_publications_df['index']
openaire_publications_df['index_'] = openaire_publications_df['index_'].astype(str) + '_pub'

In [None]:
cordis_reports_df['full_text'] = (cordis_reports_df['summary'].fillna('') + ' '
                                  + cordis_reports_df['workPerformed'].fillna('') + ' ' 
                                  + cordis_reports_df['article'].fillna(''))

In [None]:
cordis_report_text = [remove_markup(a) for a in cordis_reports_df['full_text']]
cordis_report_text = [normalise_digits(a) for a in cordis_report_text]
cordis_report_text = lemmatize(cordis_report_text, nlp)
cordis_report_text = bigram(cordis_report_text, phraser=bigrammer)
cordis_report_text_str = list(stringify_docs(cordis_report_text))

In [None]:
from nesta.packages.nlp_utils.preprocess import clean_and_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
n_docs = len(abstracts) + len(cordis_abstracts) + len(cordis_report_text)

In [None]:
docs = [TaggedDocument(d, [i]) for i, d in enumerate(chain(*[abstracts, cordis_abstracts, cordis_report_text]))]

In [None]:
d2v = Doc2Vec(documents=docs, size=200)

In [None]:
cordis_reports_df['projectID'].value_counts()[:5]

In [None]:
cordis_reports_df.set_index('projectID', inplace=True)

In [None]:
# An empty subscript is a syntax error; preview the re-indexed frame instead.
cordis_reports_df.head()

In [None]:
from scipy.spatial.distance import cosine

In [None]:
type(cordis_reports_df['parsed_abstract'].loc[287600])

In [None]:
# Mean cosine distance between each project's abstract vector and the vectors
# of its reports (first 20 projects only); NaN when a project has no usable
# report text. Exactly one value is appended per project.
mean_dists = []
# Membership has to be tested against the index (projectID): `in` on a
# DataFrame checks the column labels.
report_project_ids = set(cordis_reports_df.index)
for pid, abstract in zip(cordis_projects_df['id'][:20], cordis_projects_df['parsed_abstract'][:20]):
    if pid not in report_project_ids:
        mean_dists.append(np.nan)
        continue
    abstract_vector = d2v.infer_vector(abstract)
    report_abstracts = cordis_reports_df.loc[pid, 'parsed_abstract']
    # .loc returns a Series when the project has several reports and the bare
    # cell value when it has exactly one.
    if isinstance(report_abstracts, pd.Series):
        report_docs = list(report_abstracts)
    else:
        report_docs = [report_abstracts]
    # assumes each parsed abstract is a list of tokens - TODO confirm
    dists = [
        cosine(abstract_vector, d2v.infer_vector(doc))
        for doc in report_docs
        if isinstance(doc, list) and len(doc) > 0
    ]
    mean_dists.append(np.mean(dists) if dists else np.nan)

In [None]:
mean_dists

In [None]:
cordis_reports_df['parsed_abstract'] = cordis_report_text

In [None]:
d2v.infer_vector(abstracts[100])

### N Publications

In [None]:
cordis_projects_df.head()

In [None]:
end = 30
fig, ax = plt.subplots()
cordis_proj_participant_count_group = cordis_projects_df.groupby('participant_count')
n_particpant_pub_med = cordis_proj_participant_count_group['project_contribution_sum'].median()[:end]
n_particpant_pub_lq = cordis_proj_participant_count_group['project_contribution_sum'].quantile(0.25)[:end]
n_particpant_pub_uq = cordis_proj_participant_count_group['project_contribution_sum'].quantile(0.75)[:end]
ax.plot(n_particpant_pub_med, linewidth=3, color='C7', label='Median')
ax.plot(n_particpant_pub_lq, color='C3', label='Lower Quartile')
ax.plot(n_particpant_pub_uq, color='C2', label='Upper Quartile')
ax.set_xlabel('N Participant Institutions')
ax.set_ylabel('Publication Contributions')
ax.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
n_publications = oa_cr_pubs_df.groupby('ec_project_code')['doi'].count()
n_publication_children = oa_child_df.groupby('ec_project_code')['access'].count()
n_citations = oa_cr_pubs_df.groupby('ec_project_code')['is-referenced-by-count'].sum()

### Access Type

In [None]:
oa_child_df_access_types = pd.get_dummies(oa_child_df['access'])

In [None]:
oa_child_df = oa_child_df.join(pd.get_dummies(oa_child_df['access']))

In [None]:
# Number of child records per access type for every project. Several columns
# must be selected with a list; tuple-style selection on a GroupBy is no
# longer supported by pandas.
proj_access_counts = oa_child_df.groupby(
    'ec_project_code'
)[['Closed Access', 'Embargo', 'Open Access', 'Restricted', 'UNKNOWN']].sum()

In [None]:
pubs_sum = proj_access_counts.sum(axis=1)

In [None]:
# calculate the normalised distribution of access types for each project
access_types = ['Closed Access', 'Embargo', 'Open Access', 'UNKNOWN', 'Restricted']

for a_t in access_types:
    proj_access_counts[a_t] = proj_access_counts[a_t] / pubs_sum

In [None]:
cordis_orgs_df['activityType'].fillna('OTH', inplace=True)

In [None]:
cordis_activity_types_ohe = pd.get_dummies(cordis_orgs_df['activityType'])

In [None]:
cordis_activity_types_ohe['projectID'] = cordis_orgs_df['projectID']

In [None]:
cordis_proj_activity_dist = cordis_activity_types_ohe.groupby('projectID').sum()
cordis_proj_total_orgs = cordis_proj_activity_dist.sum(axis=1)
for a_t in cordis_activity_ref_df['Code']:
    cordis_proj_activity_dist[a_t] = cordis_proj_activity_dist[a_t] / cordis_proj_total_orgs

In [None]:
access_org_type_df = cordis_proj_activity_dist.merge(
    proj_access_counts, 
    left_index=True, 
    right_index=True,
    how='inner'
)

In [None]:
# Grid of hexbin plots: one row per organisation activity type, one column per
# access type, each comparing the per-project shares of the two.
activity_codes = list(cordis_activity_ref_df['Code'])
fig, ax = plt.subplots(nrows=len(activity_codes), ncols=len(access_types), figsize=(16, 12))

for i, activity in enumerate(activity_codes):
    for j, access in enumerate(access_types):
        ax[i][j].hexbin(
            access_org_type_df[access],
            access_org_type_df[activity],
            bins='log',
            mincnt=1
        )
        # Label each axis with the variable actually plotted on it:
        # x is the access-type share, y is the activity-type share.
        ax[i][j].set_xlabel(access)
        ax[i][j].set_ylabel(activity)
plt.tight_layout()
plt.show()

In [None]:
cordis_activity_ref_df

In [None]:
oa_child_df.head()

In [None]:
n_nondupes = oa_child_df.drop_duplicates(['name', 'access']).shape[0]

In [None]:
(oa_child_df.drop_duplicates(['name', 'access'])['access'].value_counts() / n_nondupes)

In [None]:
fig, ax = plt.subplots()

(oa_child_df.drop_duplicates(['name', 'access'])['access'].value_counts() / n_nondupes).plot(kind='barh', color='C0', ax=ax)
ax.invert_yaxis()
ax.set_xlabel('Frequency')
ax.set_ylabel('Access Type')
plt.tight_layout()
plt.show()

In [None]:
oa_child_access_ohe = pd.get_dummies(oa_child_df.drop_duplicates(['name', 'access', 'type'])['access'])

In [None]:
oa_child_access_ohe['ec_project_code'] = oa_child_df['ec_project_code']

In [None]:
oa_child_project_access = oa_child_access_ohe.groupby('ec_project_code').sum()

In [None]:
oa_child_project_access.reset_index(inplace=True)

In [None]:
oa_child_type_ohe = pd.get_dummies(oa_child_df.drop_duplicates(['name', 'type'])['type'])

In [None]:
oa_child_type_ohe['ec_project_code'] = oa_child_df.drop_duplicates(['name', 'type'])['ec_project_code']

In [None]:
oa_child_type_sum = oa_child_type_ohe.groupby('ec_project_code').sum()

In [None]:
oa_child_type_sum.reset_index(inplace=True)

In [None]:
oa_child_type_sum = oa_child_type_sum.merge(oa_child_project_access, left_on='ec_project_code', right_on='ec_project_code', suffixes=('', '_'))

In [None]:
oa_child_type_sum.to_csv(os.path.join(inter_data_path, 'cordis_oa_child_enrichment_20190226.csv'))

In [None]:
openaire_publications_df.head()

In [None]:
oa_child_type_ohe = pd.get_dummies(oa_child_df['type'])
oa_child_df = pd.concat([oa_child_df, oa_child_type_ohe], axis=1)

In [None]:
oa_child_df = oa_child_df.merge(
    openaire_publications_df[['index', 'subject_clf']],
    left_on='oai',
    right_on='index',
    how='left'
)

In [None]:
type_subject_pivot = oa_child_df.drop_duplicates(['name', 'type']).pivot_table(values='oai', index='subject_clf', columns='type', aggfunc='count')

In [None]:
type_subject_pivot_norm = type_subject_pivot.divide(type_subject_pivot.sum(axis=1), axis=0)

In [None]:
fig, ax = plt.subplots(figsize=(13, 4))
sns.heatmap((type_subject_pivot_norm * 100).fillna(0), annot=True, fmt='.2f', ax=ax, cbar=False, cmap='viridis')
ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right')
ax.set_xlabel('Output Type')
ax.set_ylabel('Subject')
# plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots()
type_subject_pivot.plot.bar(stacked=True, ax=ax, cmap='tab20')
ax.legend(loc='center left', bbox_to_anchor= (1.1, 0.5), ncol=2, 
            borderaxespad=0, frameon=False)


#### Access Distributions

In [None]:
for activity in cordis_activity_ref_df['Code']:
    access_org_type_df['has_{}'.format(activity)] =  access_org_type_df[activity] > 0

In [None]:
for access in access_types:
    access_org_type_df['has_{}'.format(access)] =  access_org_type_df[access] > 0

In [None]:
access_types_known = [a for a in access_types if a != 'UNKNOWN']

# One row per known access type rather than a fixed four rows.
# squeeze=False always returns a 2-D array of axes; ravel() restores the flat
# `ax` sequence even when there is only a single row.
fig, axes_grid = plt.subplots(nrows=len(access_types_known), figsize=(6, 8), squeeze=False)
ax = axes_grid.ravel()

for i, access in enumerate(access_types_known):
    pdf_cdf(
        ax[i],
        access_org_type_df[access],
        bins=20
    )
    ax[i].set_xlabel(f'% of {access} Outputs')
plt.tight_layout()
plt.show()

In [None]:
# Histograms of the 'has_Open Access' flag for three subsets of projects:
# has_PRC set, has_PRC unset, and has_PRC unset while has_PUB is set.
has_open_access = access_org_type_df['has_Open Access']
with_prc = access_org_type_df['has_PRC'] == True
without_prc = access_org_type_df['has_PRC'] == False
with_pub = access_org_type_df['has_PUB'] == True

fig, ax = plt.subplots(ncols=3, figsize=(16,4))
project_subsets = (with_prc, without_prc, without_prc & with_pub)
for axis, subset in zip(ax, project_subsets):
    axis.hist(has_open_access[subset])
plt.show()

In [None]:
access_org_type_df[
    (access_org_type_df['has_PRC'] == True) & 
    (access_org_type_df['has_HES'] == False)]['has_Closed Access'].mean()

In [None]:
access_org_type_df.groupby('has_PRC')[['{}'.format(a) for a in access_types]].mean()

### Publication Type

In [None]:
# Outputs with a missing type are treated as 'Article'. The filled column is
# assigned back explicitly: calling fillna(inplace=True) on a selected column
# is a chained in-place operation, which recent pandas deprecates and which
# does not update the DataFrame under copy-on-write.
oa_child_df['type'] = oa_child_df['type'].fillna('Article')

In [None]:
cordis_project_codes = sorted(cordis_projects_df['id'])

In [None]:
# Flag outputs whose EC project code appears among the CORDIS project ids.
# Series.isin does the membership test in one vectorised pass instead of
# scanning the id list once per row.
oa_child_df['in_cordis'] = oa_child_df['ec_project_code'].isin(cordis_project_codes)

In [None]:
fig, ax = plt.subplots()

oa_child_df['type'].value_counts().plot(kind='barh', color='C0', ax=ax)
ax.invert_yaxis()
ax.set_xscale('log')
ax.set_xlabel('Frequency')
ax.set_ylabel('Publication Type')

In [None]:
type_access_pivot = oa_child_df.pivot_table(values='ec_project_code', index='type', columns='access', aggfunc='count')
type_access_pivot = type_access_pivot.sort_values('Open Access', ascending=False)
type_access_pivot_norm = type_access_pivot.divide(type_access_pivot.sum(axis=1), axis=0) * 100
# type_access_pivot.fillna(0, inplace=True)

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    type_access_pivot_norm[['Open Access', 'Closed Access', 'Restricted', 'Embargo', 'UNKNOWN']],
    cmap='viridis',
    ax=ax,
    annot=True,
    fmt='.1f',
    cbar_kws={'label': '% with Access Type'}
)
ax.set_xlabel('Access Type')
ax.set_ylabel('Publication Type')
ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right')
plt.show()

### Author Counts

In [None]:
from eu_funding.utils.misc_utils import Groupby

In [None]:
def count_authors(authors_list):
    """Return the number of distinct author strings in ``authors_list``.

    Every entry that is not null (according to ``pd.isnull``) is split on
    ';' and the resulting pieces are pooled into a single set. Pieces are
    compared exactly as they appear, without any whitespace stripping.
    """
    distinct_authors = set()
    for entry in authors_list:
        if pd.isnull(entry):
            continue
        distinct_authors.update(entry.split(';'))
    return len(distinct_authors)

In [None]:
groupby = Groupby(oa_cr_pubs_df['ec_project_code'])
author_counts = groupby.apply(function_=count_authors, array=oa_cr_pubs_df['authors'], broadcast=True)
oa_cr_pubs_df['author_counts'] = author_counts
project_author_counts = oa_cr_pubs_df.groupby('ec_project_code')['author_counts'].max()

#### Pub Type by Discipline

In [None]:
pub_types = oa_child_df['type'].unique()

In [None]:
oa_child_df_pub_types = pd.get_dummies(oa_child_df['type'])
oa_child_df_pub_types['ec_project_code'] = oa_child_df['ec_project_code']

In [None]:
proj_pub_type_counts = oa_child_df_pub_types.groupby(
    'ec_project_code'
)[pub_types].sum()

In [None]:
pubs_sum = proj_access_counts.sum(axis=1)

In [None]:
# Share of each publication type among a project's outputs.
for p_t in pub_types:
    normed = proj_pub_type_counts[p_t] / pubs_sum
    # Infinite ratios (a non-zero count divided by a zero total) are set to 0.
    # The replacement is done on the intermediate Series and assigned once,
    # avoiding the chained `df[col][mask] = 0` form, which may write to a
    # temporary copy instead of the DataFrame.
    proj_pub_type_counts[f'{p_t}_normed'] = normed.replace([np.inf, -np.inf], 0)

In [None]:
proj_pub_type_counts.reset_index(inplace=True)

In [None]:
with open(os.path.join(model_path, 'gtr_text_models.p'), 'rb') as f:
    gtr_discipline_model = pickle.load(f)

In [None]:
abstract_vecs = gtr_discipline_model[0].transform(cordis_projects_df['objective'])

In [None]:
discipline_labels = gtr_discipline_model[1].predict(abstract_vecs)

In [None]:
category_name_lookup = {6:'biological_sciences',
                        3:'physics',
                        0:'engineering_technology',
                        1:'environmental_sciences',
                        2:'social_sciences',
                        4:'arts_humanities',
                        5:'mathematics_computing',
                        7: 'medical_sciences'}

In [None]:
cordis_projects_disciplines_df = pd.DataFrame(discipline_labels).rename(columns=category_name_lookup)

In [None]:
cordis_projects_df.reset_index(inplace=True)
cordis_projects_df.drop('index', inplace=True, axis=1)

In [None]:
cordis_project_publications_df = cordis_projects_df.join(
    cordis_projects_disciplines_df).merge(
    proj_pub_type_counts, 
    left_on='id', 
    right_on='ec_project_code',
    how='inner'
)

In [None]:
cordis_project_publications_df

Groupby discipline. Calculate sum of each output type. Put in table.

In [None]:
# For every discipline, total each publication type over the projects that
# carry that discipline label.
discipline_pub_type_counts = []

for discipline in category_name_lookup.values():
    # Select labelled projects explicitly (label == 1) rather than taking the
    # second group of a groupby by position, which picks the wrong row or
    # fails when a discipline column contains only one distinct value.
    # Assumes the discipline columns hold 0/1 indicator labels - TODO confirm.
    in_discipline = cordis_project_publications_df[discipline] == 1
    x = cordis_project_publications_df.loc[in_discipline, pub_types].sum().to_dict()
    x['discipline'] = discipline
    discipline_pub_type_counts.append(x)

In [None]:
disci_pub_type_df = pd.DataFrame().from_records(discipline_pub_type_counts).set_index('discipline')

In [None]:
fig, ax = plt.subplots()
disci_pub_type_df.plot(kind='barh', stacked=True, ax=ax, colormap='tab20')
ax.legend(loc='right', bbox_to_anchor=(2.5, .5), ncol=2, 
            borderaxespad=0, frameon=False)
ax.set_xlabel('Frequency')
plt.show()

In [None]:
cordis_project_publications_df.head()

### Lexical Diversity

### Collaborations

In [None]:
authors = oa_pubs_df['authors'].str.split(';')

In [None]:
# Frequency of each individual author string across all publications whose
# author field was split into a list (missing values are skipped).
c_authors = Counter(chain.from_iterable(a for a in authors if isinstance(a, list)))

In [None]:
# Hashable (tuple) form of each publication's author list, skipping entries
# that are not lists.
authors_h = [tuple(a) for a in authors if isinstance(a, list)]

In [None]:
c_authors = Counter(authors_h)

In [None]:
c_authors.most_common(1000)

### Extra Stuff

In [None]:
project_counts = [len(s) for s in openaire_publications_df['ec_project_codes']]
project_counts_c = Counter(project_counts)
n_codes, n_codes_count = zip(*project_counts_c.items())

In [None]:
fig, ax = plt.subplots(figsize=(8,4))
ax.bar(n_codes, n_codes_count)
ax.set_xlabel('N EC Projects')
ax.set_ylabel('Frequency')
plt.show()

Around 50% of publications have no EC project associated. However, over 120,000 of them have at least one. Over 10,000 are attributed to two projects, while a smaller handful are associated with 3 or more.

In [None]:
def flatten(nested):
    """Return a single list holding the items of every iterable in ``nested``.

    Only one level of nesting is removed. ``chain.from_iterable`` consumes
    ``nested`` lazily instead of unpacking it into call arguments.
    """
    return list(chain.from_iterable(nested))

In [None]:
# Frequency of each child publication type across all publications.
publication_types = flatten(openaire_publications_df['child_types'])
c_types = Counter(publication_types)
pub_type, pub_type_count = zip(*c_types.items())

fig, ax = plt.subplots(figsize=(8,4))
ax.barh(pub_type, pub_type_count)
# barh draws the categories along the y axis and the counts along the x axis.
ax.set_xlabel('Frequency')
ax.set_ylabel('Publication Type')
plt.show()

In [None]:
# Frequency of each child access type across all publications.
# NOTE(review): this rebinds `access_types`, which earlier cells iterate over
# as column labels; re-running those cells after this one would see the
# flattened list of values instead.
access_types = flatten(openaire_publications_df['child_access'])
c_access = Counter(access_types)
access_type, access_type_count = zip(*c_access.items())

fig, ax = plt.subplots(figsize=(8,4))
ax.barh(access_type, access_type_count)
# barh draws the categories along the y axis and the counts along the x axis.
ax.set_xlabel('Frequency')
ax.set_ylabel('Child Access Type')
plt.show()

In [None]:
# Frequency of each 'bestaccessright' value across publications.
accessright_counts = openaire_publications_df['bestaccessright'].value_counts()
fig, ax = plt.subplots(figsize=(8,4))
ax.barh(accessright_counts.index, accessright_counts.values)
# barh draws the categories along the y axis and the counts along the x axis.
ax.set_xlabel('Frequency')
ax.set_ylabel('Publication Access Type')
plt.show()

In [None]:
# Frequency of each 'publication_type' value across publications.
# NOTE(review): the variable name is reused from the access-right cell even
# though it now holds publication-type counts; kept unchanged in case other
# cells read it.
accessright_counts = openaire_publications_df['publication_type'].value_counts()
fig, ax = plt.subplots(figsize=(8,4))
ax.barh(accessright_counts.index, accessright_counts.values)
# barh draws the categories along the y axis and the counts along the x axis;
# the plotted column is the publication type, not the access right.
ax.set_xlabel('Frequency')
ax.set_ylabel('Publication Type')
plt.show()

In [None]:
# Frequency of each publication language.
langauge_counts = openaire_publications_df['language'].value_counts()
fig, ax = plt.subplots(figsize=(8,8))
ax.barh(langauge_counts.index, langauge_counts.values)
# barh draws the categories along the y axis and the counts along the x axis.
ax.set_xlabel('Frequency')
ax.set_ylabel('Language')
plt.show()

In [None]:
category_c = Counter(flatten(openaire_publications_df['categories']))

In [None]:
category_c.most_common(40)

### CrossRef Works

In [None]:
# Pool the JSON content of every file in crossref_works_path whose name
# contains '.txt' into one flat list of records.
crossref_works_records = []

txt_names = [name for name in os.listdir(crossref_works_path) if '.txt' in name]
for name in txt_names:
    works_file = os.path.join(crossref_works_path, name)
    with open(works_file, 'r') as f:
        crossref_works_records.extend(json.load(f))

In [None]:
crossref_works_records = [c for c in crossref_works_records if type(c) == dict]

In [None]:
crossref_works_df = pd.DataFrame().from_records(crossref_works_records)

In [None]:
def parse_date_parts(date_dict):
    """Build a ``datetime`` from the first entry of a 'date-parts' field.

    ``date_dict`` is expected to be a dict whose 'date-parts' value is a list
    of ``[year, month, day]``-style lists. Only the first inner list is used.
    Missing trailing components are filled with 1, so ``[2018]`` becomes
    1 January 2018.

    Returns ``None`` when ``date_dict`` is not a dict (for example a NaN from
    a record without the field), when 'date-parts' is absent or empty, or
    when the leading component is ``None``. The input is never modified.
    """
    if not isinstance(date_dict, dict):
        return None
    date_parts = date_dict.get('date-parts')
    if not date_parts:
        return None
    first = date_parts[0]
    if not first or first[0] is None:
        return None
    # Pad a copy so the caller's record is left untouched.
    padded = list(first) + [1] * (3 - len(first))
    return datetime(*padded)

In [None]:
crossref_works_df['issued_dt'] = crossref_works_df['issued'].apply(lambda x: parse_date_parts(x))

In [None]:
collected_date = datetime(2019, 2, 1)

In [None]:
crossref_works_df['age'] = collected_date - crossref_works_df['issued_dt']

In [None]:
crossref_works_df['age_years'] = crossref_works_df['age'].dt.days / 365.25

In [None]:
crossref_works_df = crossref_works_df[crossref_works_df['age_years'] <= 20]

In [None]:
crossref_works_df['type'].value_counts()

In [None]:
plt.hist(crossref_works_df['age_years'], bins=100)
plt.show()

In [None]:
# Citation count scaled by the age of each work, shown as a histogram.
# NOTE(review): despite the name, the count is divided by the *square* of the
# age in years - confirm that this is intended. A work with an age of zero
# makes this a division by zero.
ref_per_year = crossref_works_df['is-referenced-by-count'] / np.power(crossref_works_df['age_years'], 2)
# ref_per_year_log = np.log10(ref_per_year)
# ref_per_year_log = ref_per_year_log[~np.isinf(ref_per_year_log)]

plt.hist(ref_per_year, bins=1000)
# plt.xlim((0, 50))
plt.show()

In [None]:
plt.hexbin(
    crossref_works_df['age_years'],
    np.log(crossref_works_df['is-referenced-by-count']),
#     bins='log',
    gridsize=40,
)

In [None]:
crossref_works_df.head()

In [None]:
crossref_works_df['is-referenced-by-count'].describe()

In [None]:
# Distribution of raw citation counts.
plt.hist(crossref_works_df['is-referenced-by-count'], bins=100)
# Call show(); the bare attribute reference `plt.show` did nothing.
plt.show()

### Lexical Diversity

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
import ast

In [None]:
cv = CountVectorizer(dtype=np.int32)

In [None]:
from skbio.diversity.alpha import shannon

In [None]:
sum(pd.isnull(openaire_publications_df['language']))

In [None]:
# For every CORDIS project: gather its publications' abstracts, keep those in
# the project's most common language, and compute the Shannon entropy and
# total token count of the pooled text. The three lists get one entry per
# project, in the order of cordis_projects_df['id'].
entropies = []
lengths = []
n_papers = []

for pid in cordis_projects_df['id']:
    # Publications whose EC project codes mention this project id.
    mentions_project = [str(pid) in s for s in openaire_publications_df['ec_project_codes']]
    papers = openaire_publications_df[mentions_project]
    papers = papers[~pd.isnull(papers['abstract'])]

    # Restrict to the most frequent language when at least one is recorded;
    # otherwise keep all papers (what the former bare `except:` fell back to).
    language_counts = papers['language'].value_counts()
    if len(language_counts) > 0:
        papers = papers[papers['language'] == language_counts.index[0]]
    paper_count = len(papers)
    n_papers.append(paper_count)

    if paper_count > 0:
        # Join the abstract texts themselves: iterating over the DataFrame
        # directly yields its column names, not its rows.
        # Assumes abstracts are strings - TODO confirm.
        mega_abstract = [' '.join(papers['abstract'])]
        article_bow = cv.fit_transform(mega_abstract)
        # Single pooled document -> one row of term counts.
        x = np.asarray(article_bow.todense()).ravel()
        x = x[x != 0]
        entropies.append(shannon(x))
        lengths.append(x.sum())
    else:
        entropies.append(0)
        lengths.append(0)


In [None]:
papers

In [None]:
fig, ax = plt.subplots()
ax.scatter(n_papers, entropies, alpha=0.05)
# ax.set_xscale('log')

In [None]:
def func(x, a, b, c):
    """Evaluate the exponential model ``a * exp(-b * x) + c`` at ``x``."""
    decay = np.exp(-b * x)
    return a * decay + c

In [None]:
from scipy.optimize import curve_fit

In [None]:
import operator

In [None]:
# Sort the (n_papers, entropy) pairs by paper count, largest first.
# NOTE(review): this rebinds `n_papers` and `entropies` to sorted tuples, so
# they no longer follow the order of cordis_projects_df['id'] (and `lengths`
# is left unsorted). Anything that later stores them per project will be
# misaligned unless it runs before this cell.
n_papers, entropies = zip(*sorted(zip(n_papers, entropies),
  key=operator.itemgetter(0), reverse=True))

In [None]:
# Drop infinite entropies before fitting.
mask = ~np.isinf(entropies)
n_papers_mask = np.array(n_papers)[mask]
# `entropies` may be a tuple or list here; convert it to an array first,
# because plain sequences cannot be indexed with a boolean mask.
entropies_mask = np.array(entropies)[mask]

In [None]:
popt, pcov = curve_fit(func, n_papers_mask, entropies_mask)

In [None]:
popt

In [None]:
plt.scatter(n_papers_mask, entropies_mask)
plt.plot(np.array(n_papers), func(np.array(n_papers), *popt), 'r-',
         label='fit: a=%5.3f, b=%5.3f, c=%5.3f' % tuple(popt))


In [None]:
dist = func(np.array(n_papers), *popt)

In [None]:
plt.scatter(np.log(n_papers), np.log(1 / (entropies - dist)))

In [None]:
# Inspect the publications linked to one example project.
pid = 202008
papers = openaire_publications_df[[str(pid) in s
                                   for s in openaire_publications_df['ec_project_codes']]]
# Keep only rows that have an abstract, as the entropy loop does. Masking with
# ~pd.isnull(papers) on the whole frame works cell by cell and drops no rows.
papers = papers[~pd.isnull(papers['abstract'])]

In [None]:
papers

In [None]:
papers['language'].value_counts().index[0]

In [None]:
cordis_projects_df[cordis_projects_df['id'] == pid]

In [None]:
# Store the per-project abstract statistics on the projects table.
# NOTE(review): these assignments are positional, so the lists must still be
# in the order of cordis_projects_df['id']. The earlier sorting cell rebinds
# `n_papers` and `entropies` to tuples sorted by paper count; if it has been
# run, these columns will not line up with their projects or with `lengths`.
cordis_projects_df['oa_abstract_entropy_mean'] = entropies
cordis_projects_df['oa_n_papers'] = n_papers
cordis_projects_df['oa_abstract_lengths'] = lengths

In [None]:
from datetime import datetime

In [None]:
closed_projects = cordis_projects_df[cordis_projects_df['endDate'] < datetime.today()]

In [None]:
plt.hist(entropies, bins=50)
plt.show()

In [None]:
from eu_funding.data.parse import concat_entities

In [None]:
cordis_projects_df

In [None]:
cordis_topics_ref_df.head()

In [None]:
cordis_projects_df.topics.value_counts()