This notebook looks at all unique PRO items from a given timeframe, whether or not they have a DOI.

# Load Data

In [1]:
import json, csv
import sys
import requests
import pandas as pd
import re
import cufflinks as cf
import numpy as np

import plotly.offline
# Put cufflinks into offline plotting mode for this session (presumably so
# .iplot() renders inline without a plotly account -- confirm).
cf.go_offline()
# NOTE(review): this passes offline=False to the cufflinks config right after
# go_offline() was called; the two look contradictory -- confirm which mode is
# actually intended.
cf.set_config_file(offline=False, world_readable=True)

In [2]:
# Build the working DataFrame straight from the saved CSV export
# (sufficient when no new DOIs have to be added).
sample = pd.read_csv(
    '2017_PRO_for_Unpaywall_Complete.csv',
    encoding='ISO-8859-1',
)

In [3]:
# (rows, columns) of the table as loaded, before duplicates are dropped
sample.shape

(1680, 18)

In [4]:
# Keep one row (the first encountered) per GroupID_Final so that every
# item appears exactly once in the table.
sample = sample.drop_duplicates(subset=['GroupID_Final'], keep='first')

In [5]:
# (rows, columns) after dropping rows with a repeated GroupID_Final
sample.shape

(1680, 18)

In [6]:
# Number of rows in the GroupID_Final column, i.e. the count of unique items;
# serves as the denominator for the percentage calculations.
total = sample['GroupID_Final'].size
print(total)

1680


# Articles Free to Read

In [13]:
# Number of items for each distinct Free_to_Read value
sample['Free_to_Read'].value_counts()

Unknown    739
True       554
False      387
Name: Free_to_Read, dtype: int64

In [15]:
# Per-year item counts for each Free_to_Read status, expressed as a
# percentage of `total` (all unique items, not just those of that year).
free_flag = sample['Free_to_Read']
open_dois = sample.loc[free_flag == 'True', 'Year'].value_counts() / total * 100
closed_dois = sample.loc[free_flag == 'False', 'Year'].value_counts() / total * 100
unknown_dois = sample.loc[free_flag == 'Unknown', 'Year'].value_counts() / total * 100

# One row per status, one column per year; plotted as stacked bars.
df1 = pd.DataFrame([open_dois, closed_dois, unknown_dois])
df1.index = ['True', 'False', 'Unknown']
df1.iplot(
    kind='bar',
    barmode='stack',
    title='Percentage of Articles that are Free to Read',
)

# Type of URL

In [16]:
# Number of items for each distinct Open_Type value
sample['Open_Type'].value_counts()

None             1126
Non-KUSW-link     517
KUSW               37
Name: Open_Type, dtype: int64

In [17]:
# Number of items per URL type (Open_Type), one bar per category.
#
# pandas >= 2.0 treats the literal CSV text "None" as a missing value when
# reading with default settings, so comparing against the string 'None' would
# match nothing and the 'None' bar would silently come out empty. Mapping
# missing values back to the 'None' label keeps the count correct on both old
# and new pandas (on the recorded run the three categories already sum to the
# full row count, so nothing changes there).
open_type = sample['Open_Type'].fillna('None')

# Each of these is a one-entry Series: {category label: count}.
kusw = open_type[open_type == 'KUSW'].value_counts()
non_kusw = open_type[open_type == 'Non-KUSW-link'].value_counts()
no_info = open_type[open_type == 'None'].value_counts()

# One row and one column per category; only the diagonal is populated,
# so the stacked bar chart shows a single bar segment per category.
df2 = pd.DataFrame([kusw, non_kusw, no_info])
df2.index = ['KUSW', 'Non-KUSW-link', 'None']
df2.iplot(kind='bar', barmode='stack', title='Types of URLs')

# Publisher Policies

In [18]:
# Item counts per publisher sharing policy (Rights_to_share), all items.
rights = sample['Rights_to_share']
afd = rights[rights == 'AFD'].value_counts()
unknown = rights[rights == 'Unclear/Unknown'].value_counts()
published = rights[rights == 'Published'].value_counts()
no_share = rights[rights == 'May not share'].value_counts()

# Rows are ordered for display: AFD, Published, May not share, Unclear/Unknown.
df3 = pd.DataFrame([afd, published, no_share, unknown])
df3.index = ['AFD', 'Published', 'May not share', 'Unclear/Unknown']
df3.iplot(
    kind='bar',
    barmode='stack',
    title='Publisher Policies for All Articles',
)

# Publisher Policies for Open Items

In [19]:
# Rows whose Free_to_Read value is the string 'True' (items that are free to read).
open_dois = sample.loc[sample['Free_to_Read'] == 'True']

In [20]:
# Item counts per publisher sharing policy, restricted to the rows in
# `open_dois` (the free-to-read subset).
open_rights = open_dois['Rights_to_share']
afd = open_rights[open_rights == 'AFD'].value_counts()
unknown = open_rights[open_rights == 'Unclear/Unknown'].value_counts()
published = open_rights[open_rights == 'Published'].value_counts()
no_share = open_rights[open_rights == 'May not share'].value_counts()

# Rows are ordered for display: AFD, Published, May not share, Unclear/Unknown.
df3 = pd.DataFrame([afd, published, no_share, unknown])
df3.index = ['AFD', 'Published', 'May not share', 'Unclear/Unknown']
df3.iplot(
    kind='bar',
    barmode='stack',
    title='Publisher Policies for Articles that are Free to Read',
)