# Information Retrieval - Artificial Intelligence 
This notebook retrieves abstracts relevant to AI and then uses topic modeling to analyze the chosen abstracts.  Three info retrieval techniques are used: Literal Term Matching, TF-IDF, and Latent Semantic Indexing.  These are linear algebra techniques.  
We use the Scikit-Learn library.

In [8]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time
import seaborn as sns

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

from gensim.models.phrases import Phrases, Phraser

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [9]:
# pull in entire dataframe
# NOTE(review): the path below is absolute and user-specific; point it at your own copy of the
# dataset before running.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files you trust.

df = pd.read_pickle("/home/scb8kw/git/dspg21RnD/data/dspg21RnD/smaller-final-dataset.pkl")


# Give df a fresh 0..n-1 index (the previous index is kept as a new 'index' column).
# Later cells use row positions returned by np.argsort as labels into df, which relies on this.
df.reset_index(inplace = True)
#df.rename(columns={'index':'original index'}, inplace=True)

In [10]:
df.head()

Unnamed: 0,index,original index,PROJECT_ID,ABSTRACT,FY,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,...,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,nchar,final_frqwds_removed
0,0,17608,152242,The multiprotein complex y-secretase proteolyt...,2008,Active Sites; Affect; Alzheimer's Disease; Amy...,STRUCTURE OF SIGNAL PEPTIDE PEPTIDASE,HHS,NIH,,...,,,93.866,2008,3483.0,,1,1,1402,"[multiprotein, y_secretase, proteolytically_cl..."
1,1,111864,190316,DESCRIPTION (provided by applicant): The Kis...,2008,Affect; Animal Model; Axon; Behavior; Behavior...,ROLE OF KISS1 NEURONS IN THE SEASONAL AND CIRC...,HHS,NIH,,...,,,93.865,2008,39175.0,,1,1,2553,"[kissl, gene, encode, peptide, kisspeptin, bin..."
2,2,22052,154213,DESCRIPTION (provided by applicant): The objec...,2008,Agreement; Antibodies; base; Binding; Biochemi...,CARBONIC ANHYDRASE AS A MODEL TO UNDERSTAND DI...,HHS,NIH,,...,,,93.859,2008,49646.0,,1,1,1414,"[biophysical, basis, thermodynamics_kinetic, m..."
3,3,35004,159362,Obesity is the cause of many adverse pregnancy...,2008,African; Analysis of Variance; Asians; Birth; ...,OBESITY ON VAGAL TONE AND HBA1C DURING PREGNANCY,HHS,NIH,,...,,,93.361,2008,20406.0,,1,1,1545,"[obesity, adverse_pregnancyoutcome, great, hea..."
4,4,371628,594482,Local potato advisory groups have expressed in...,2010,cost; Health; interest; Manure; Parasitic nema...,PLANT-PARASITIC NEMATODE MANAGEMENT AS A COMPO...,USDA,NIFA,,...,,,10.203,2010,,,1,1,271,"[local, potato, advisory, express, interest, m..."


In [11]:
# Scikit-Learn's vectorizers expect one string per document, whereas the column
# "final_frqwds_removed" stores a list of tokens per abstract.  (The raw 'ABSTRACT'
# column is already one string per document and would not need this step.)

tokens = df["final_frqwds_removed"]

# docs: the processed tokens of each abstract glued back together with single spaces
# (1 string per abstract)
docs = pd.Series([" ".join(token_list) for token_list in tokens])

## Functions needed for all info retrieval approaches

In [12]:
# Create query vector 

def create_query(words, terms):
    """Build a binary query vector over the corpus vocabulary.

    Parameters
    ----------
    words : iterable of str
        Search query words.  Every word must occur in ``terms``.
    terms : sequence of str
        Terms in the corpus, in the column order of the document-term matrix.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(terms)`` with 1 at the position of each
        query word and 0 everywhere else.

    Raises
    ------
    ValueError
        If a query word is not present in ``terms``.
    """
    q = np.zeros(len(terms))  # one slot per term

    # Work on a plain list so that any sequence (list, tuple, ndarray) is accepted.
    vocabulary = list(terms)

    # Look up the `words` argument itself -- not a notebook-level variable -- so the
    # function gives the right answer for whatever query is passed in.
    idx = [vocabulary.index(word) for word in words]

    if idx:  # an empty query simply yields the all-zero vector
        q[idx] = 1

    return q

In [13]:
def return_top_abstracts(docs, scores, top_n):
    
    '''
    Rank the abstracts by score and return the best-scoring ones.

    docs: Series that contains abstract (one entry per score, in the same order)
    scores: 1-D array of scores of abstracts; a higher score ranks first
    top_n: return the top_n abstracts given by idx, if top_n = -1 return all abstracts
           whose score is greater than 0

    Returns (ix, top_docs): ix holds the row positions of the selected abstracts, best
    first; top_docs holds the matching entries of docs in that same order.
    Side effect: prints the first 10 selected positions.
    '''
    scores = np.asarray(scores)

    # sort scores in descending order
    scores_sorted_idx = np.argsort(scores)[::-1]
    
    if top_n == -1:
        # vectorised count; the builtin sum() would loop over every element in Python
        n = np.count_nonzero(scores > 0)
        ix = scores_sorted_idx[:n]
    else:
        ix = scores_sorted_idx[:top_n]
    
    print(ix[0:10])
    
    # ix contains positions, so select positionally.  Label-based lookup (docs[ix]) is only
    # correct when the Series happens to carry a default 0..n-1 index.
    top_docs = docs.iloc[ix] if hasattr(docs, "iloc") else docs[ix]

    return ix, top_docs
    

In [14]:
def create_result_df(abstracts, scores):
    """Bundle abstracts and their scores into a two-column DataFrame.

    The columns are added one at a time to an initially empty frame, in the
    order "abstracts" then "scores".
    """
    result = pd.DataFrame()
    for column, values in (("abstracts", abstracts), ("scores", scores)):
        result[column] = values
    return result

## Literal Term Matching - Frequency Count Document-Term Matrix

This will return all abstracts in the corpus with exact word matches to the query.  

Results will be returned in sorted order of how high the query scores with each abstract. A high score means more occurrences of the query words in the abstract.

In [15]:
# Note - we are now using the spaCy stopwords list instead of nltk.  It is more comprehensive.
# ONLY USE THIS FUNCTION if using raw text to search

'''
def create_stopwords():
      
    """ creates list of stopwords. stop words include the general English list and any additional we see sneaking 
    through.  """
    
    spacy_stop_words = STOP_WORDS

    # more stop words that do not add meaning to topics
    additional_stopwords = {'addition', 'specifically', 'similar','including', 'particular', 
                            'furthermore','include', 'includes','overall', 'finally', 'specific', 
                            'additional'} 
           
    sw = spacy_stop_words.union(additional_stopwords)
    
    return sw
    
'''

'\ndef create_stopwords():\n      \n    """ creates list of stopwords. stop words include the general English list and any additional we see sneaking \n    through.  """\n    \n    spacy_stop_words = STOP_WORDS\n\n    # more stop words that do not add meaning to topics\n    additional_stopwords = {\'addition\', \'specifically\', \'similar\',\'including\', \'particular\', \n                            \'furthermore\',\'include\', \'includes\',\'overall\', \'finally\', \'specific\', \n                            \'additional\'} \n           \n    sw = spacy_stop_words.union(additional_stopwords)\n    \n    return sw\n    \n'

#### Create Document-Term Matrix

In [16]:
# Create document-term matrix based on count frequencies

#stop_words = create_stopwords()

vectorizer = CountVectorizer(lowercase=True, min_df=1)
doc_term_matrix = vectorizer.fit_transform(docs)

In [17]:
doc_term_matrix.shape

(690814, 1277822)

In [18]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(), which returns an ndarray.  Keep `terms` a plain list of str
# (the query builder uses list.index on it) and fall back to the old method on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()

#### Create Query Words - list the search terms

A query is just a list of words to search for in the corpus.  We will use the same query for all three info retrieval techniques.

In [19]:
'artificial_intelligence' in terms

True

In [20]:
# ONLY USE FOR RAW TEXT STRING MATCHING

# tokens with - or strange symbol between words

'''
count = 0
idx = []

for ix, abstract in enumerate(df['ABSTRACT']):
    if 'artificial intelligence' in abstract.lower(): 
        count = count + 1
        idx.append(ix)
    elif 'artificialintelligence' in abstract.lower(): 
        count = count + 1
        idx.append(ix)
    elif 'artificially intelligent' in abstract.lower(): 
        count = count + 1
        idx.append(ix)
    #else: 
        # do nothing

print(count)

'''

"\ncount = 0\nidx = []\n\nfor ix, abstract in enumerate(df['ABSTRACT']):\n    if 'artificial intelligence' in abstract.lower(): \n        count = count + 1\n        idx.append(ix)\n    elif 'artificialintelligence' in abstract.lower(): \n        count = count + 1\n        idx.append(ix)\n    elif 'artificially intelligent' in abstract.lower(): \n        count = count + 1\n        idx.append(ix)\n    #else: \n        # do nothing\n\nprint(count)\n\n"

In [21]:
# CHANGE QUERY WORDS HERE 

# these words were found by analyzing all tokens with some form of 'artificial intelligence'
query_words = ['artificial_intelligence', 'artificial_intelligence_ai', 
               'artificial_intelligence_machine_learning', 'artificialintelligence', 'artificially_intelligent',
               'artificial_intelligence_aaai', 'artificial_intelligence_ijcai', 'artificialintelligence_ai',
               'artificialintelligent'
              ] 

#'ai', 'artificial', 'intelligence' 
              
q = create_query(query_words, terms)

In [22]:
# calculate the score for each document against the query. Docs with more occurrences of the query words
# will score higher
# (q is a 0/1 vector over the terms, so each score is the total count of query words in that document)

f_scores = doc_term_matrix.dot(q)

In [23]:
sum(f_scores >0)  # how many abstracts include at least one of the query words

# some are being left off from raw counts

971

In [24]:
# sort scores in descending order

f_scores_sorted = np.sort(f_scores)[::-1]
f_scores_sorted[:10]

array([6., 5., 5., 4., 4., 4., 4., 4., 4., 3.])

In [25]:
f_idx, f_top_abstracts = return_top_abstracts(docs, f_scores, -1)  # CHANGE NUMBER OF TOP DOCS RETURNED

[493912  83585  10300  19512 293012 689629 637044 689591 429296 292485]


In [26]:
f_top_abstracts

493912    unique interdisciplinary team computer scienti...
83585     eager award education collaboration kindle_mat...
10300     graduate student attend workshop organize conj...
19512     aaai artificial_intelligence interactive digit...
293012    grant participation undergraduate student hold...
                                ...                        
363997    investigation topic temporal relation extracti...
32191     broader commercial small_business_innovation s...
357956    undergraduate reu site advance student technol...
145096    yale spore skin cancer skin cancer basal cell ...
122566    alignment rfa da 19_008 seek great_lakes node ...
Length: 971, dtype: object

In [27]:
f_top_abstracts.iloc[0]

'unique interdisciplinary team computer scientist scientist ornithologist manager programmer network machine_learning human observational capacity explore synergy mechanical computation human computation human computer learning network broad scale citizen science network wide applicability variety domain network active learning feedback_loop machine human dramatically continually effectiveness network human computer learning network leverage contribution broad recruitment human observer artificial_intelligence algorithm total computational power far exceed sum individual highly successful ebird citizen science testbed human computer learning network bird engage global network volunteer submit million bird observation annually central database fundamental challenge citizen science error identification classification object quantify difference individual observer spatial bias prevalent citizen science challenge build advance artificial_intelligence opportunity generation account enormous

In [28]:
df['ABSTRACT'][493912]

"A unique interdisciplinary team of computer scientists, information scientists, ornithologists, project managers, and programmers will develop a novel network between machine learning methods and human observational capacity to explore the synergies between mechanical computation and human computation. This is called a Human/Computer Learning Network, and while the focus is to improve data quality in broad-scale citizen-science projects, the network has the potential for wide applicability in a variety of complex problem domains. The core of this network is an active learning feedback loop between machines and humans that dramatically improves the quality of both, and thereby continually improves the effectiveness of the network as a whole. The Human/Computer Learning Network will leverage the contributions of broad recruitment of human observers and process their contributed data with artificial intelligence algorithms leading to a total computational power far exceeding the sum of t

## TF-IDF Document-Term Matrix

This approach is similar to Literal Term Matching using frequency counts in the document-term matrix.  However, instead of using frequency counts, the entries of the document-term matrix are weighted using TF-IDF.

In [29]:
# Find doc-term matrix using TF-IDF weighting

tf_idf_vectorizer = TfidfVectorizer(lowercase=True, min_df=1)
tf_idf = tf_idf_vectorizer.fit_transform(docs)

In [30]:
# These terms are the same as the terms created from the frequency count document-term
# matrix, so we do not need to recreate the query vector.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out() (which returns an ndarray); keep a plain list of str either way.
if hasattr(tf_idf_vectorizer, "get_feature_names_out"):
    tf_idf_terms = tf_idf_vectorizer.get_feature_names_out().tolist()
else:
    tf_idf_terms = tf_idf_vectorizer.get_feature_names()

In [31]:
tf_idf_terms == terms

True

In [32]:
# calculate the score for each document against the query.  q is a 0/1 vector over the terms, so each
# score is the sum of the document's TF-IDF weights for the query words (not a raw occurrence count).

tf_idf_scores = tf_idf.dot(q)

In [33]:
sum(tf_idf_scores >0)   # how many abstracts include at least one of the query words

971

In [34]:
# sort scores in descending order

tf_idf_scores_sorted = np.sort(tf_idf_scores)[::-1]
tf_idf_scores_sorted[:10]

array([0.67110913, 0.50600405, 0.44719966, 0.42104089, 0.40140886,
       0.39908403, 0.37190372, 0.37129172, 0.37005479, 0.3416502 ])

In [35]:
tfidf_idx, tfidf_top_abstracts = return_top_abstracts(docs, tf_idf_scores, -1)  # CHANGE NUMBER OF TOP DOCS RETURNED

[235101 293012  10300 209895 493912 366791 581817 232572 108705 531638]


In [36]:
tfidf_top_abstracts

235101    phd student artificial_intelligence opportunit...
293012    grant participation undergraduate student hold...
10300     graduate student attend workshop organize conj...
209895    imperative environmental societal sustainabili...
493912    unique interdisciplinary team computer scienti...
                                ...                        
168231    animal human means better human barrier progre...
688857    generate humanity past exceed create human his...
114280    elderly patient dementia present massive care ...
638620    scene ie recognize object human action event i...
237025    background people multiple_sclerosis_ms chroni...
Length: 971, dtype: object

In [37]:
tfidf_top_abstracts.iloc[0]

'phd student artificial_intelligence opportunity share interact senior learn sub ai mentor publication career opportunity accomplish partially travel cost phd student attend international joint conference artificial_intelligence_ijcai premier international conference artificial_intelligence conference 2017 hold melbourne_australia attract international crowd academics industry worker entrepreneur funding agency leader'

In [38]:
df['ABSTRACT'][235101]

'This proposal will support US-based Ph.D. students working in artificial intelligence the opportunity to share their knowledge and interact with each other and more senior researchers, to learn about different sub-fields within AI, and to be mentored in research, publication, and career opportunities. This goal will be accomplished by partially supporting the travel costs for US-based Ph.D. students to attend the International Joint Conference on Artificial Intelligence (IJCAI), which is one of the premier international conferences on research in artificial intelligence. The conference, which in 2017 will be held in Melbourne, Australia attracts an international crowd that includes academics, industry workers, entrepreneurs, and funding agency leaders.'

## Latent Semantic Indexing (LSI) Approach

LSI uses the TF-IDF matrix.  LSI is a technique that utilizes a truncated Singular Value Decomposition of the document-term matrix.  Basically, LSI still returns documents relevant to the query; however, some of the documents returned may not include the exact search terms!  LSI finds the latent or hidden relationships in the terms.  

In [39]:
# Find the Truncated SVD of the TF-IDF matrix

lsa = TruncatedSVD(n_components=500, random_state=1)  # CHANGE THE NUMBER OF COMPONENTS - NOTE: MORE COMPONENTS 
                                                      # GIVES YOU A MORE ACCURATE APPROXIMATION OF THE DOC-TERM 
                                                      # MATRIX, BUT IS ALSO MORE EXPENSIVE AND MAY NOT LEAD TO THE 
                                                      # BEST INFO RETRIEVAL RESULTS.
USigma = lsa.fit_transform(tf_idf)
Vtrans = lsa.components_

KeyboardInterrupt: 

In [None]:
# transform query to be in same space as documents
# (lsa.transform needs a 2-D 1 x n_terms row; reshape into a temporary instead of overwriting
# q, so the earlier scoring cells that use the 1-D q can still be re-run)

qhat = lsa.transform(q.reshape(1, -1))

In [None]:
print(qhat.shape)
print(USigma.shape)
print(Vtrans.shape)

In [None]:
# pairwise_distances(metric='cosine') returns cosine *distance* (1 - cosine similarity), where a
# SMALLER value means a closer match.  The following cells rank scores in descending order and
# treat "score > 0" as a hit, so convert the distances to similarities first.
lsa_scores = 1 - pairwise_distances(qhat, USigma, metric='cosine', n_jobs=7)  # CHANGE N_JOBS TO BE NUMBER OF CORES - 1

In [None]:
lsa_scores.shape

In [None]:
lsa_scores

In [None]:
lsa_scores[0]

In [None]:
sum(lsa_scores[0] > 0)  # how many abstracts scored above 0

In [None]:
# sort scores in descending order

lsa_scores_sorted = np.sort(lsa_scores[0])[::-1]
lsa_scores_sorted[:10]

In [None]:
lsa_idx, lsa_top_abstracts = return_top_abstracts(docs, lsa_scores[0], 50)  # CHANGE NUMBER OF TOP DOCS RETURNED

In [None]:
lsa_top_abstracts

In [None]:
lsa_top_abstracts.iloc[41]

In [None]:
df['ABSTRACT'][344482]

In [None]:
# I don't think LSI is giving us AI results - I spot checked 5-10 of these.  

## Create AI corpus

We use the results of our three information retrieval techniques to create a new, smaller corpus that only contains abstracts relevant to the query. 

In [None]:
docs_ix = np.concatenate([f_idx, tfidf_idx]) #lsa_idx])

In [None]:
docs_idx = np.unique(docs_ix)

In [None]:
docs_idx.shape

**create case-study corpus**

In [None]:
# Take an explicit copy so ai_corpus owns its data: columns are added to it further down, and
# assigning into a selection of df can raise pandas' SettingWithCopyWarning.
ai_corpus = df.loc[docs_idx, :].copy()

In [None]:
ai_corpus.shape

In [None]:
# save or read corpus

#ai_corpus.to_pickle("./ai_corpus.pkl")
#ai_corpus = pd.read_pickle("ai_corpus.pkl")

In [None]:
lim_docs = ai_corpus["final_frqwds_removed"]

In [None]:
# Scikit-Learn's LDA / NMF pipeline wants one string per document rather than a list of
# tokens, so join every token list of the AI corpus with single spaces.

text = [" ".join(token_list) for token_list in lim_docs]

In [None]:
len(lim_docs)

## Topic Modeling with relevant AI abstracts

In [None]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(model, vectorizer, top_n=10):
    """Print the top_n highest-weighted terms of every topic of a fitted model.

    model: fitted decomposition model exposing components_ (one row per topic, one
           column per term)
    vectorizer: fitted vectorizer whose feature names label the columns of components_
    top_n: number of terms to print per topic

    For each topic, prints a "Topic k:" header followed by one (term, weight) tuple per
    line, largest weight first.  Returns None.
    """
    # Fetch the vocabulary once rather than once per printed term.
    # get_feature_names() was removed in scikit-learn 1.2, so prefer get_feature_names_out()
    # when it exists; .tolist() keeps the printed terms plain Python strings.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = np.asarray(vectorizer.get_feature_names_out()).tolist()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):  # idx = row index of H, topic = the actual row
        print("\nTopic %d:" % (idx))
        # argsort is ascending, so walk it backwards to visit the top_n largest weights
        for i in topic.argsort()[:-top_n - 1:-1]:
            print((feature_names[i], topic[i]))
        

In [None]:
# Create a TF-IDF document-term matrix for the AI corpus 

# TRY DIFFERENT PARAMETERS IN THE TF-IDF DOC-TERM MATRIX SET-UP
nmf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=3, lowercase=True) #, max_features=int(len(lim_docs)/2))

# by default TfidfVectorizer has l2 normalization for rows: 
# from Scikit Learn documentation: Each output row will have unit norm, either: * ‘l2’: Sum of squares of vector 
# elements is 1. The cosine similarity between two vectors is their dot product when l2 norm has been applied.

nmf_tf_idf = nmf_vectorizer.fit_transform(text)

In [None]:
nmf_tf_idf.shape

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out() (which returns an ndarray); keep a plain list of str either way.
if hasattr(nmf_vectorizer, "get_feature_names_out"):
    AI_terms = nmf_vectorizer.get_feature_names_out().tolist()
else:
    AI_terms = nmf_vectorizer.get_feature_names()

In [None]:
AI_terms[2830:2850]

In [None]:
# topic modeling with NMF

nmf_model = NMF(n_components=30, random_state=1)  # TRY DIFFERENT NUMBERS OF TOPICS
W = nmf_model.fit_transform(nmf_tf_idf)
H = nmf_model.components_

In [None]:
print_topics(nmf_model, nmf_vectorizer, 10)

### properties of AI corpus

In [None]:
ai_corpus.head()

In [None]:
# project terms - worth looking into??

ai_corpus['PROJECT_TERMS'].iloc[500] # does not have artificial intelligence as key term

In [None]:
ai_corpus['ABSTRACT'].iloc[500]

In [None]:
# plot number of abstracts by department

y = ai_corpus["DEPARTMENT"].value_counts()
y.plot(kind = 'bar')
plt.ylabel('Number of Abstracts')
plt.title('Abstract Count by Department');

print(y)

In [None]:
# plot number of abstracts by project start year

# extract year from project start date

def getYear(a):
    """Extract the year from a date value and return it as a string.

    a: date-like value; it is converted with str() first.
       - three '/'-separated parts (e.g. "9/30/2008") -> the third part ("2008")
       - anything else -> the text before the first '/', or the whole text if
         there is no '/' (e.g. "2008" -> "2008")
    NOTE: missing values are not special-cased; NaN comes back as the string "nan".
    """
    # str.split always returns at least one part, so no separate "contains '/'" check is
    # needed.  (str.find must not be used as a truth test: it returns -1, which is truthy,
    # when the separator is absent and 0, which is falsy, when the text starts with it.)
    parts = str(a).split("/")
    if len(parts) == 3:
        return parts[2]
    return parts[0]

ai_corpus['START_YEAR'] = ai_corpus['PROJECT_START_DATE'].apply(getYear)

In [None]:
y = ai_corpus["START_YEAR"].value_counts().sort_index()
y.plot(kind = 'bar')
plt.ylabel('Number of Abstracts')
plt.title('Abstract Count by Project Start Year');

print(y)

In [None]:
# plot dollars by year

# Sum numeric columns only: the corpus also holds text and token-list columns, and pandas 2.x
# no longer drops those silently when summing (it tries to add them up instead).
dollars_by_year = ai_corpus.groupby(by = ['START_YEAR']).sum(numeric_only=True)
dollars_by_year["FY_TOTAL_COST"] = dollars_by_year["FY_TOTAL_COST"]/1000000  # dollars -> millions of dollars

In [None]:
dollars_by_year.plot.bar(y = "FY_TOTAL_COST")
plt.ylabel('Dollars (millions)')
plt.title('Dollars Spent by Project Start Year');


In [None]:
# plot dollars by year & department

# Sum numeric columns only: the corpus also holds text and token-list columns, and pandas 2.x
# no longer drops those silently when summing (it tries to add them up instead).
dollars_by_deptNyear = ai_corpus.groupby(by = ['START_YEAR','DEPARTMENT']).sum(numeric_only=True)
dollars_by_deptNyear["FY_TOTAL_COST"] = dollars_by_deptNyear["FY_TOTAL_COST"]/1000000  # dollars -> millions of dollars

dollars_by_deptNyear

In [None]:
dollars_by_deptNyear.plot.bar(y = "FY_TOTAL_COST", figsize=(15,5))
plt.ylabel('Dollars (millions)')
plt.title('Dollars Spent by Project Start Year and Department');

In [None]:
dollars_by_deptNyear = dollars_by_deptNyear.reset_index()

In [None]:
fig, ax = plt.subplots(figsize=(10,5))
sns.barplot(ax=ax, x="START_YEAR", y="FY_TOTAL_COST", hue="DEPARTMENT", data=dollars_by_deptNyear)

In [None]:
# next step - look at documents containing machine learning applications (such as topic 21)

topic_docs = W[:, 21]

In [None]:
sum(topic_docs > 0) 

In [None]:
max_score = max(topic_docs)
max_score

In [None]:
topic_docs[0:20]

In [None]:
print(lim_docs.iloc[2])

In [None]:
idx = np.where(topic_docs == max_score)

In [None]:
idx

In [None]:
idx = np.where(topic_docs > 0.3)
idx

In [None]:
topic_docs[10:30]

In [None]:
print(lim_docs.iloc[745]) # psychosis topic with AI component

In [None]:
ai_corpus["ABSTRACT"].iloc[759]