## 1. Imports

This loads `biblionet`, a custom module I wrote for processing Web of Science data.

In [106]:
import collections as c
import json
import os

import numpy as np
from sklearn.decomposition import NMF

import biblionet as bn

## 2. Process Raw Web of Science Data

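This processes the data and the metadata, and it does standard data cleaning. You can skip this section and load the pre-processed data in the next section, if you want. Note, however, that Section 4 reads the `content` variable created here, so this section must be run before the citation lists can be built.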

In [92]:
# Point to the folder that holds the raw data.
# The data is tab separated WOS files, stored utf-8, with 500 records per file.
# Set the TFSC_DATA_DIR environment variable to use a different folder;
# when it is not set, the original local folder is used.
path = os.environ.get(
    "TFSC_DATA_DIR",
    "C:/Users/LocalAdmin/Operations/Programming/data/TFSC",
)
# load the data
corpus = bn.create_corpus(path)
# create the metadata from the same folder
metadata = bn.create_metadata(path)

In [93]:
# Annotate the corpus with cleaned and processed text, one field per pass.
# The passes create the new fields F1, F2 and F3; each pass takes the
# result of the previous one as its input.
cites = bn.annotate("F1", corpus)
orgs = bn.annotate("F2", cites)
content = bn.annotate("F3", orgs)

In [94]:
# Build the indexing vocabulary: the top 1000 words of the cleaned content field (F3).
content_dict = bn.make_index(1000, "F3", content)
# Index the F3 field of the corpus against that vocabulary.
index = bn.index_corpus("F3", content_dict, content)

In [95]:
# Number of articles in the index.
n = len(index)

In [102]:
# Save the index to disk so that it can be reloaded in the next section.
with open('TFSCcontent.json', 'w') as out_file:
    json.dump(index, out_file)

## 3. Create the Matrix

Now actually create the matrix. This is necessary for statistical processing. The data in the index is a dictionary and not in an appropriate format for statistical analysis.

In [103]:
# Reload the saved index; this is needed if the previous section was skipped.
# Change the file name below if the file is not in the working directory.
with open('TFSCcontent.json', 'r') as in_file:
    index = json.load(in_file)

In [98]:
# Build the article-by-word matrix used for the decomposition.
# Each row is one article and each column is one word of the index.
# A cell is set to 1 when the word is indexed for the article; this records
# presence, not a count, so a repeated word still gives 1.
# The number of rows is taken from the index itself, so this cell also works
# when the index was loaded from file and Section 2 was skipped.
n_words = 1000  # must match the vocabulary size passed to bn.make_index
X = np.zeros((len(index), n_words))

# now fill the matrix; rows follow the iteration order of the index
for row, key in enumerate(index):
    for word_id in index[key]:
        X[row, word_id] = 1

Here we include all index records, regardless of the amount of indexed content. This means that longer records receive more detail in the modelling process.

In [100]:
# Factorise X with non-negative matrix factorisation (NMF).
# This is equivalent to topic modelling, but a little better behaved on correlated data.
# The random state is fixed so that repeated runs give the same topics.
model = NMF(n_components=9, init="random", random_state=0)

# W is the assignment matrix: one row per article, giving its mix of the nine components
W = model.fit_transform(X)
# H is the topic model: one row per component, giving the weight of each index word
H = model.components_

## 4. Process the Data to Create Citation Lists

This step uses the pre-processed Web of Science data (index), as well as the assignment matrix of the topic model (W). The objective is to count all the citations in one of the core topics in TFSC. The topics are in no specific order, but I know from previous explorations using word clouds that we are interested in topic 6.

In [107]:
# Weighted citation count for the topic of interest.
# Every citation of an article is credited with that article's weight on the
# chosen topic, so citations from strongly on-topic articles score highest.
topic = 6  # column of W for the topic of interest
cntr = c.Counter()

# Go through the articles one by one. The rows of W follow the iteration
# order of the index, so the enumeration position selects the article's row.
# The loop names are chosen so that they do not overwrite `n` (the article
# count) or `cites` (the annotated corpus) from Section 2.
# NOTE(review): if the index was reloaded from JSON its keys are strings;
# confirm that `content` is keyed the same way.
for row, key in enumerate(index):
    arec = content[key]
    # the citation field holds all citations in one "; "-separated string
    cite_list = arec["CR"].split("; ")
    # weight of this article on the chosen topic
    w = W[row, topic]
    # store each citation with the article's weight
    for cite in cite_list:
        cntr[cite] += w

In [109]:
# Show the 25 most common citations.
# 26 entries are requested because one of them is a blank field
# that escaped data cleaning.
cntr.most_common(26)

[('DOSI G, 1982, RES POLICY, V11, P147, DOI 10.1016/0048-7333(82)90016-6',
  14.750214329259402),
 ('COHEN WM, 1990, ADMIN SCI QUART, V35, P128, DOI 10.2307/2393553',
  13.169027689312447),
 ('Daim TU, 2006, TECHNOL FORECAST SOC, V73, P981, DOI 10.1016/j.techfore.2006.04.004',
  13.048041938010158),
 ('Nelson R. R., 1982, EVOLUTIONARY THEORY', 12.857677458886538),
 ('EISENHARDT KM, 1989, ACAD MANAGE REV, V14, P532, DOI 10.2307/258557',
  12.13388166635403),
 ('Phaal R, 2004, TECHNOL FORECAST SOC, V71, P5, DOI 10.1016/S0040-1625(03)00072-6',
  11.730243872873515),
 ('BASS FM, 1969, MANAGE SCI, V15, P215, DOI 10.1287/mnsc.15.5.215',
  11.36911269184459),
 ('Kostoff RN, 2001, IEEE T ENG MANAGE, V48, P132, DOI 10.1109/17.922473',
  10.346945848294201),
 ('Hekkert MP, 2007, TECHNOL FORECAST SOC, V74, P413, DOI 10.1016/j.techfore.2006.03.002',
  8.902654613632873),
 ('Geels FW, 2002, RES POLICY, V31, P1257, DOI 10.1016/S0048-7333(02)00062-8',
  8.585792751771542),
 ('ANDERSON P, 1990, ADMIN 