In [294]:
import pandas as pd
import numpy as np
import urllib.request as urlreq
from scipy.optimize import curve_fit
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True) #enables plotly offline plotting
#Change your username and set the path for your own plot.ly API key
py.tools.set_credentials_file(username='gianlucahmd', api_key=open('/Users/Gianluca/Desktop/plotly_key.txt', "r").read())

# Defining basic functions

In [295]:
#Set the path for your own Science Direct API key
apikey = str(open("Desktop/SDapikey.txt", "r").read())
#My personal path for exporting/importing files
path = "Documents/data_stuff/AI_research/"

#performs query on Science Direct's API and parses the .json response
def get_sd(query, subject, year):
    query = query.replace(" ", "+")
    url = ("http://api.elsevier.com/content/search/scopus?query=" + query +
            "&subj=" + subject +
            "&date=" + str(year) +
            "&apiKey=" + apikey)
    return pd.read_json( urlreq.urlopen(url).read() )

#Returns the amount of publications in a specified year, for a certain query and subject
def pub_number(query, subject, year):
    return int(get_sd(query, subject, year).loc["opensearch:totalResults","search-results"])

#Returns an array with the timeline of publications for a certain query and subject
#ranging in a timeframe from "start" to "end"
def pub_timeline(query, subject, start, end):
    timeline = []
    for year in range(start, end):
        timeline.append(pub_number(query, subject, year))
    return timeline

# List of SD API subject keys

* AGRI -	Agricultural and Biological Sciences 
* ARTS -	Arts and Humanities 
* BIOC -	Biochemistry, Genetics and Molecular Biology 
* BUSI -	Business, Management and Accounting 
* CENG -	Chemical Engineering 
* CHEM -	Chemistry 
* COMP -	Computer Science 
* DECI -	Decision Sciences 
* DENT -	Dentistry 
* EART -	Earth and Planetary Sciences 
* ECON -	Economics, Econometrics and Finance 
* ENER -	Energy 
* ENGI -	Engineering 
* ENVI -	Environmental Science 
* HEAL -	Health Professions 
* IMMU -	Immunology and Microbiology 
* MATE -	Materials Science 
* MATH -	Mathematics 
* MEDI -	Medicine 
* NEUR -	Neuroscience 
* NURS -	Nursing 
* PHAR -	Pharmacology, Toxicology and Pharmaceutics 
* PHYS -	Physics and Astronomy 
* PSYC -	Psychology 
* SOCI -	Social Sciences 
* VETE -	Veterinary 
* MULT -	Multidisciplinary 

# Create dataframe with publications for each subject

In [4]:
pub_subjects = pd.DataFrame()
#subjects codes
subjects = ["AGRI","ARTS","BIOC","BUSI","CENG","CHEM","COMP","DECI","DENT",
"EART","ECON","ENER","ENGI","ENVI","HEAL","IMMU","MATE","MATH",
"MEDI","NEUR","NURS","PHAR","PHYS","PSYC","SOCI","VETE","MULT"]

for subject in subjects:
    pub_subjects.loc[subject, "publications"] = pub_number("Artificial Intelligence", subject, "1950-2017")

In [5]:
pub_subjects["topics"] = ["Agricultural and Biological Sciences ","Arts and Humanities ",
    "Biochemistry, Genetics and Molecular Biology ",
    "Business, Management and Accounting ","Chemical Engineering ",
    "Chemistry ","Computer Science ","Decision Sciences ","Dentistry ",
    "Earth and Planetary Sciences ","Economics, Econometrics and Finance ",
    "Energy ","Engineering ","Environmental Science ","Health Professions ",
    "Immunology and Microbiology ","Materials Science ","Mathematics ","Medicine ",
    "Neuroscience ","Nursing ","Pharmacology, Toxicology and Pharmaceutics ",
    "Physics and Astronomy ","Psychology ","Social Sciences ","Veterinary ",
    "Multidisciplinary"]

In [6]:
pub_subjects["publications"] = pub_subjects["publications"].astype(int)

## Export/Import dataframe

In [266]:
#pub_subjects.to_csv(path + "pub_subjects.csv")
pub_subjects = pd.read_csv(path + "pub_subjects.csv")
pub_subjects_sorted = pub_subjects.sort_values("publications", ascending = True)

# Plot amount of publications for different subjects
Additional styling is done on plot.ly's web interface

In [270]:
x_print = pub_subjects_sorted["publications"].head(24)
y_print = pub_subjects_sorted["topics"].head(24)

data = [go.Bar(
    x = x_print,
    y = y_print,
    text = str(y_print),
    orientation = "h",
    marker = dict(
        color = "rgb(2, 184, 117)") 
        #looks like plot.ly just updated custom colors as a pro feature. 
        #If you want to save your plot in the cloud, remove this line and use basic colors.
        )
       ]
layout = go.Layout(
    title = "Number of publications on AI applied to different subjects",
    font = dict(family = "Droid Serif"),
    annotations=[dict(
            x=xi,
            y=yi,
            text=str(xi),
            xanchor='left',
            yanchor='center',
            showarrow=False,
            font = dict(size = 12),
        ) for xi, yi in zip(x_print, y_print)
            ],
    xaxis = dict(
        title = "publications",
        anchor = "free",
        position = 1,
        showgrid = False,
        showticklabels = False,
    )
    )
fig = go.Figure(data = data, layout = layout)
iplot(fig)

# Create timeline dataframe

In [140]:
#List of topics I chose
topics = ["Artificial Intelligence", "Deep Learning", "Support Vector Machine", "Neural Networks",
          "Clustering", "Data Mining", "Speech recognition", "Image recognition", "Recommender System"]

In [12]:
timelines = pd.DataFrame(index = pd.Series(range(1950, 2017)))
for topic in topics:
    timelines[topic] = pub_timeline(topic + " Artificial Intelligence", "", 1950, 2017)

## Export/Import dataframe

In [257]:
#timelines.to_csv(path + "timelines.csv")
timelines = pd.read_csv(path + "timelines.csv", index_col = 0)

In [None]:
#Since 2016 isn't finished, it's misleading because it suggests a decrease in publications
timelines.drop(2016, inplace = True)

In [274]:
#Creates a dataframe with the cumulative amount of publications available for each year
timelines_cumul = timelines.cumsum(axis = 0)

# Plotting timeline and timeline_cumul dataframe

In [282]:
data = []
for topic in topics:
    data.append(go.Scatter(
        x = timelines.index,
        y = timelines[topic],
        name = topic
    ))
layout = go.Layout(
    title = "Evolution of scientific publications on different AI topics",
    font = dict(family = "Droid Serif"),
    yaxis = dict(title = "Number of publications"),
    xaxis = dict(title = "Year")
    )

fig = go.Figure(data = data, layout = layout)
iplot(fig)

In [264]:
data = []
for topic in topics:
    data.append(go.Scatter(
        x = timelines_cumul.index,
        y = timelines_cumul[topic],
        name = topic
    ))
layout = go.Layout(
    title = "Cumulative amount of scientific publications on different AI topics",
    font = dict(family = "Droid Serif"),
    yaxis = dict(title = "Number of publications"),
    xaxis = dict(title = "Year")
    )

fig = go.Figure(data = data, layout = layout)
iplot(fig)

# Fitting of timeline and timeline_cumul with logistic function

To evaluate the trend of research we'll try to see the fit with a logistic function, being often used to illustrate the progress of the diffusion of an innovation through its life cycle [(Wikipedia)](https://en.wikipedia.org/wiki/Logistic_function#In_economics_and_sociology:_diffusion_of_innovations)

In [251]:
x = preprocessing.minmax_scale(timelines.loc[1980:, :].index.values)
y = preprocessing.minmax_scale(timelines.loc[1980:, "Artificial Intelligence"].values)

def log_func(x, M, k, x_0):
    return M / (1 + np.exp(-k * (x-x_0)))

popt, pcov = curve_fit(log_func, x, y)


Data with input dtype int64 was converted to float64.


Data with input dtype int64 was converted to float64.



In [291]:
data = [
    go.Scatter(
        x = np.linspace(0,1.5)*(2015-1980) + 1980,
        y = log_func(np.linspace(0, 1.5), *popt) * max(timelines.loc[1980:,"Artificial Intelligence"].values),
        name = "Logistic function fitting",
        line =dict(color = "rgb(2, 184, 117)")
        ),
    go.Scatter(
        x = x * (2015-1980) + 1980,
        y = y* max(timelines.loc[1980:,"Artificial Intelligence"].values),
        name = "AI research normalized",
        line = dict(color = "rgb(0, 0, 0)")
    )
]

layout = go.Layout(
    title = "AI research trends VS logistic function fitting",
    font = dict(family = "Droid Serif"),
    yaxis = dict(
    title = "Number of publications per year"
    ),
    xaxis = dict(
    title = "Year"
    )
)

fig = go.Figure(data = data, layout = layout)
iplot(fig)

In [244]:
x_cumul = preprocessing.minmax_scale(timelines_cumul.loc[1980:,:].index.values)
y_cumul = preprocessing.minmax_scale(timelines_cumul.loc[1980:,"Artificial Intelligence"].values)

popt_cumul, pcov_cumul = curve_fit(log_func, x_cumul, y_cumul)


Data with input dtype int64 was converted to float64.


Data with input dtype int64 was converted to float64.



In [288]:
data = [
    go.Scatter(
        x = np.linspace(0,1.8)*(2015-1980) + 1980,
        y = log_func(np.linspace(0,1.8), *popt_cumul)*max(timelines_cumul.loc[1980:,"Artificial Intelligence"].values),
        name = "Logistic function fitting",
        line = dict(color = "rgb(2, 184, 117)")
        ),
    go.Scatter(
        x = x_cumul*(2015-1980) + 1980,
        y = y_cumul*max(timelines_cumul.loc[1980:,"Artificial Intelligence"].values),
        name = "AI research normalized",
        line = dict(color = "rgb(0, 0, 0)")
    )
]

layout = go.Layout(
    title = "Cumulated AI research VS logistic function fitting",
    font = dict(family = "Droid Serif"),
    yaxis = dict(
    title = "Cumulated Number of publications"
    ),
    xaxis = dict(
    title = "Year"
    )
)

fig = go.Figure(data = data, layout = layout)
iplot(fig)

# Get most active nations

Returns a dataframe with the amount of publications per country

In [3]:
nations_count = pd.DataFrame()

query = "Artificial+Intelligence"
nations = []
n_results = pub_number(query, "", "1950-2017")

for start in range(0,n_results,25):
    try:
        url = ("http://api.elsevier.com/content/search/scopus?apiKey=" + apikey + 
            "&query=Artificial+Intelligence+" + query +
            "&subj=ener" +
            "&start=" + str(start))
        df = pd.read_json(url)
        results = df.loc["entry", "search-results"]
    except urlreq.HTTPError:
        print("Stopped at " + str(start))
        break

    for i in range(0, len(results)):
        if "affiliation" in results[i]:
            nations.append(results[i]["affiliation"][0]["affiliation-country"])

for nation in pd.Series(nations).unique():
    nations_count = nations_count.append({
            "nation" : nation,
            "publications" : nations.count(nation)
        }, ignore_index = True)
    
nations_count = nations_count.groupby("nation").sum()
nations_count.fillna(0,inplace = True)

Stopped at 5000


## Export/Import dataframe

In [5]:
#nations_count.to_csv(path + "nations_count.csv")
nations_count.read_csv(path + "nations_count.csv")

In [275]:
#Get 10 most active countries
nations_count.sort_values("publications", ascending = False).head(10)

Unnamed: 0_level_0,publications
nation,Unnamed: 1_level_1
China,783.0
India,685.0
United States,436.0
Iran,403.0
Malaysia,167.0
Italy,158.0
Spain,132.0
United Kingdom,122.0
Canada,111.0
Japan,105.0


# Plot a map of AI research activity

In [292]:
#for column in nations_count.columns.values:
data = [go.Scattergeo(
        locationmode = "country names",
        locations = nations_count.index,
        marker = dict(
            size = nations_count["publications"]/nations_count["publications"].sum()*1000,
            color = "rgb(2, 184, 117)",
            line = dict(width = 0)
        )
    )]

layout = go.Layout(
    title = "World research on AI",
    font = dict(family = "Droid Serif"),
    geo = dict(
        resolution = 50,
        showframe = False,
        showcoastlines = True,
        showland = True,
        landcolor = "rgb(229, 229, 229)",
        countrycolor = "rgb(255, 255, 255)" ,
        coastlinecolor = "rgb(255, 255, 255)",
        projection = dict(
            type = 'Mercator'
        ),
        domain = dict(
            x = [ 0, 1 ],
            y = [ 0, 1 ]
        )
    ),
)

fig = go.Figure(layout=layout, data=data)
iplot(fig)