In [55]:
import json
import csv
import codecs
import pandas as pd
from elasticsearch import helpers, Elasticsearch

In [56]:
#es = Elasticsearch(['http://localhost:9200'], http_auth=('', ''))
es = Elasticsearch(['http://localhost:9200'])
es.ping()

True

In [57]:
#concat data
df = pd.concat(
    map(pd.read_csv, ["data/ACM_IEEE.csv", "data/Clarivate.csv","data/ScienceDirect.csv","data/Scopus.csv"]), ignore_index=True)

In [58]:
docs = []
for i in range(0,df.shape[0]):
      docs.append({
        "Document Type": df.iloc[i]['Document Type'],
        "Authors": df.iloc[i]['Authors'],
        "Title": df.iloc[i]['Title'],
        "Source": df.iloc[i]['Source'],
        "Year": df.iloc[i]['Year'],
        "Link": df.iloc[i]['Link'],
        "Abstract": df.iloc[i]['Abstract'],
        "Keywords": df.iloc[i]['Keywords'],
    })

# Mapping dynamically generated by Elasticsearch 

In [35]:
es.indices.create(index="docs_slr_no_mapping")

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'docs_slr_no_mapping'}

In [36]:
helpers.bulk(es,index='docs_slr_no_mapping', actions=docs)

(2823, [])

In [39]:
query_body = {
  "size":10000,
  "query": {
    "match": {
      "Abstract": {
        "query":"retinopathy and CNN"
      }
    }
  }
}

In [38]:
es.search(index="docs_slr_no_mapping", body=query_body)

{'took': 1180,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2727, 'relation': 'eq'},
  'max_score': 4.532532,
  'hits': [{'_index': 'docs_slr_no_mapping',
    '_type': '_doc',
    '_id': 'DJWCCIYBctSqsvwFYSBa',
    '_score': 4.532532,
    '_source': {'Document Type': 'Article',
     'Authors': 'Thomas, G. Arun Sampaul; Robinson, Y. Harold; Julie, E. Golden; Shanmuganathan, Vimal; Rho, Seungmin; Nam, Yunyoung',
     'Title': 'Intelligent Prediction Approach for Diabetic Retinopathy Using Deep Learning Based Convolutional Neural Networks Algorithm by Means of Retina Photographs',
     'Source': 'cmc-computers materials & continua',
     'Year': 2021,
     'Link': 'Not found',
     'Abstract': 'Retinopathy is a human eye disease that causes changes in retinal blood vessels that leads to bleed, leak fluid and vision impairment. Symptoms of retinopathy are blurred vision, changes in color perception, red spots, and e

# Mapping with standard analyzer

In [126]:
mapping = {
    "settings": {
        "number_of_shards": 2,
        "number_of_replicas": 1
    },
    "mappings": {
    "properties" : {
        "Document Type" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "Authors" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "Title" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "Source" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "Year" : {
          "type" : "long"
        },
        "Link" : {
          "type" : "text"
        },
        "Abstract" : {
          "type" : "text"
        },
        "Keywords" : {
          "type" : "text"
        }
      }
    }
}

In [42]:
es.indices.create(index="docs_slr_old_mapping",body=mapping)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'docs_slr_old_mapping'}

In [43]:
helpers.bulk(es,index='docs_slr_old_mapping', actions=docs)

(2823, [])

In [82]:
query_body =  {
    "query" : {
        "match_all" : {}                        
    },
    "size": 0,
    "aggs": {
    "group_by_state": {
      "terms": {
        "field": "Source.keyword"
      }
    }
  }
}

In [83]:
es.search(index="docs_slr_old_mapping", body=query_body)

{'took': 142,
 'timed_out': False,
 '_shards': {'total': 2, 'successful': 2, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2823, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'group_by_state': {'doc_count_error_upper_bound': 9,
   'sum_other_doc_count': 1060,
   'buckets': [{'key': 'Association for Computing Machinery',
     'doc_count': 1066},
    {'key': 'IEEE', 'doc_count': 502},
    {'key': 'Association for Computing Machinery and Morgan & Claypool',
     'doc_count': 40},
    {'key': 'ieee access', 'doc_count': 32},
    {'key': 'IEEE Computer Society Press', 'doc_count': 26},
    {'key': 'IEEE Press', 'doc_count': 26},
    {'key': 'IEEE Access', 'doc_count': 24},
    {'key': 'Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)',
     'doc_count': 19},
    {'key': 'Computers in Biology and Medicine', 'doc_count': 14},
    {'key': 'JMLR.org', 'doc_count': 14}]}}}

# Customized mapping

In [129]:
#mapping customized using new analyzers
mapping_custom = {
    "settings" : {
       "number_of_shards":2,
       "number_of_replicas":1,

    "analysis":{
   "filter":{
      "english_stop":{
         "type":"stop",
         "stopwords_path":"stopwords.txt" #copy the txt file to the corresponding ES directory
      },
      "english_keywords":{
         "type":"keyword_marker",
         "keywords_path":"keywords.txt" #copy the txt file to the corresponding ES directory, D:/Elastic/elasticsearch/config/keywordmarker-words.txt
      },
      "english_stemmer":{
         "type":"stemmer",
         "language":"english"
      },
      "english_shingle":{
         "type":"shingle",
         "min_shingle_size":2,
         "max_shingle_size":3
      }
   },
   "analyzer":{
      "rebuilt_english":{
         "tokenizer":"standard",
         "filter":[
            "english_stop",
            "english_keywords",
            "english_shingle",
            "english_stemmer"
         ]
      },
       "stem_noshingle":{
         "tokenizer":"standard",
         "filter":[
            "english_stop",
            "english_keywords",
            "english_stemmer"
         ]
      },
      "shingle_nostem":{
         "tokenizer":"standard",
         "filter":[
            "english_stop",
            "english_shingle"
         ]
      },
      "simple": {
          "tokenizer":"standard",
         "filter":[
            "english_stop"
         ]
      },
   }
}
    },
"mappings": {
    "properties": {
      "Abstract": {
         "type":"text",
         "analyzer":"rebuilt_english",
         "fielddata":"true",
         "fields":{
            "simple":{
               "type":"text",
               "analyzer":"simple",
               "fielddata":"true"
            },
             "shingle_nostem":{
               "type":"text",
               "analyzer":"shingle_nostem",
               "fielddata":"true"
             },
             "stem_noshingle":{
               "type":"text",
               "analyzer":"stem_noshingle",
               "fielddata":"true"
             }
         }
      },
      "Authors": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "Document Type": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "Keywords": {
        "type": "text"
      },
      "Link": {
        "type": "text"
      },
      "Source": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "Title": { "type": "text","analyzer":"rebuilt_english",
         "fielddata":"true",
         "fields":{
            "simple":{
               "type":"text",
               "analyzer":"simple",
               "fielddata":"true"
            },
             "shingle_nostem":{
               "type":"text",
               "analyzer":"shingle_nostem",
               "fielddata":"true"
             },
             "stem_noshingle":{
               "type":"text",
               "analyzer":"stem_noshingle",
               "fielddata":"true"
             },
            "keyword":{
               "type":"keyword",
                "ignore_above": 256
            }
         }
        },
      "Year": {
        "type": "long"
        }
      }
    }
}

In [78]:
es.indices.create(index="docs_slr_customized_mapping", body=mapping_custom)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'docs_slr_customized_mapping'}

In [79]:
helpers.bulk(es,index='docs_slr_customized_mapping', actions=docs)

(2823, [])

In [80]:
query_body = {
  "size":10000,
  "query": {
    "match": {
      "Abstract.shingle_nostem": {
        "query":"retinopathy and CNN"
      }
    }
  }
}

In [81]:
es.search(index="docs_slr_customized_mapping", body=query_body)

{'took': 3621,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1369, 'relation': 'eq'},
  'max_score': 11.037201,
  'hits': [{'_index': 'docs_slr_customized_mapping',
    '_type': '_doc',
    '_id': 'GZWeCYYBctSqsvwFujx_',
    '_score': 11.037201,
    '_source': {'Document Type': 'Article',
     'Authors': 'Lahmiri S.',
     'Title': 'Hybrid deep learning convolutional neural networks and optimal nonlinear support vector machine to detect presence of hemorrhage in retina',
     'Source': 'Biomedical Signal Processing and Control',
     'Year': 2020,
     'Link': 'https://www.scopus.com/inward/record.uri?eid=2-s2.0-85083510927&doi=10.1016%2fj.bspc.2020.101978&partnerID=40&md5=8edfaa038b6945a2747d589bd5970579',
     'Abstract': 'Diabetic retinopathy is a disorder that occurs in retina and it is caused by diabetes mellitus. Millions of people with diabetic retinopathy are expected to experience a loss of vision across

# Running different types of queries across the 3 indices :

### Index 1 : docs_slr_no_mapping

In [84]:
# What are the most common artificial intelligence-based methods for DR detection?

In [85]:
query_body = {
    "query": {
    "bool": {
      "should": [
        {
          "match": {
            "Abstract": {
              "query": "detecting diabetic retinopathy methods",
              "fuzziness": "auto"
            }
          }
        },
        {
          "match": {
            "Abstract": {
              "query": "common diabetic retinopathy detection techniques",
              "fuzziness": "auto"
            }
          }
        }
      ]
    }
  }
}

In [86]:
es.search(index="docs_slr_no_mapping", body=query_body)

{'took': 37,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2438, 'relation': 'eq'},
  'max_score': 15.360741,
  'hits': [{'_index': 'docs_slr_no_mapping',
    '_type': '_doc',
    '_id': 'xJWCCIYBctSqsvwFYR9a',
    '_score': 15.360741,
    '_source': {'Document Type': 'Article',
     'Authors': 'Mohan, N. Jagan; Murugan, R.; Goel, Tripti; Mirjalili, Seyedali; Roy, Parthapratim',
     'Title': 'A novel four-step feature selection technique for diabetic retinopathy grading',
     'Source': 'physical and engineering sciences in medicine',
     'Year': 2021,
     'Link': 'Not found',
     'Abstract': 'Diabetic retinopathy is a microvascular complication of diabetes mellitus that develops over time. Diabetic retinopathy is one of the retinal disorders. Early detection of diabetic retinopathy reduces the chances of permanent vision loss. However, the identification and regular diagnosis of diabetic retinopathy is a tim

In [87]:
# What are the most promising AI methods for diabetic retinopathy detection?

In [88]:
query_body_2 ={
  "query": {
    "bool": {
      "must": [
        { "match": { "Document Type": "research paper" } },
        { "match": { "Abstract": "diabetic retinopathy detection AI" } }
      ],
      "should": [
        { "match": { "Keywords": "promising" } },
        { "match": { "Keywords": "state-of-the-art" } },
        { "match": { "Document Type": "Article" } },
        { "match": { "Document Type": "Journal" } },
        { "match": { "Document Type": "Research Paper" } }  
      ],
        "must_not": [
            {"match": {"link":"Not Found"}}
        ],
      "filter": [
        { "exists": { "field": "Link" } },
        { "range": { "Year": { "gte": "2012"} } }
      ]
    }
  }
}

In [89]:
es.search(index="docs_slr_no_mapping", body=query_body_2)

{'took': 113,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 639, 'relation': 'eq'},
  'max_score': 18.279472,
  'hits': [{'_index': 'docs_slr_no_mapping',
    '_type': '_doc',
    '_id': 'bJWCCIYBctSqsvwFYyZ_',
    '_score': 18.279472,
    '_source': {'Document Type': 'Conference Paper',
     'Authors': 'Lin Z., Guo R., Wang Y., Wu B., Chen T., Wang W., Chen D.Z., Wu J.',
     'Title': 'A framework for identifying diabetic retinopathy based on anti-noise detection and attention-based fusion',
     'Source': 'Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)',
     'Year': 2018,
     'Link': 'https://www.scopus.com/inward/record.uri?eid=2-s2.0-85054076986&doi=10.1007%2f978-3-030-00934-2_9&partnerID=40&md5=c1bdfd1b8c0582a0ed58c63a45d9c9fd',
     'Abstract': 'Automatic diagnosis of diabetic retinopathy (DR) using retinal fundus images

In [90]:
# What are the various Features Extraction Techniques for DR?

In [91]:
query_body_3 = {
  "size": 0,
  "query": {
    "match": {
      "Abstract": "feature extraction technique"
    }
  },
  "aggs": {
    "techniques": {
      "significant_text": {
        "field": "Abstract"
      }
    }
  }
}

In [92]:
es.search(index="docs_slr_no_mapping", body=query_body_3)

{'took': 520,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 940, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'techniques': {'doc_count': 940,
   'bg_count': 2823,
   'buckets': [{'key': 'feature',
     'doc_count': 606,
     'score': 1.2914191942055226,
     'bg_count': 606},
    {'key': 'extraction',
     'doc_count': 342,
     'score': 0.728820733363513,
     'bg_count': 342},
    {'key': 'technique',
     'doc_count': 314,
     'score': 0.6691511996378451,
     'bg_count': 314},
    {'key': 'features',
     'doc_count': 424,
     'score': 0.22149304049963794,
     'bg_count': 854},
    {'key': 'retinopathy',
     'doc_count': 650,
     'score': 0.20960458217717184,
     'bg_count': 1498},
    {'key': 'proposed',
     'doc_count': 534,
     'score': 0.19556996695299314,
     'bg_count': 1193},
    {'key': 'image',
     'doc_count': 503,
     'score': 0.19116005766656613,
     'bg_co

### Index 2 : docs_slr_customized_mapping

In [93]:
#retinipathy AND CNN OR ANN

In [94]:
query_body = {
  "size":1000,
  "query": {
    "match": {
      "Abstract.stem_noshingle": {
        "query":"retinipathy AND CNN OR ANN"
      }
    }
  }
}

In [95]:
es.search(index="docs_slr_customized_mapping", body=query_body)

{'took': 215,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 507, 'relation': 'eq'},
  'max_score': 17.285955,
  'hits': [{'_index': 'docs_slr_customized_mapping',
    '_type': '_doc',
    '_id': '_5WeCYYBctSqsvwFrTa1',
    '_score': 17.285955,
    '_source': {'Document Type': 'Review',
     'Authors': 'Boned-Murillo, Ana; Albertos-Arranz, Henar; Diaz-Barreda, Maria Dolores; Orduna-Hospital, Elvira; Sanchez-Cano, Ana; Ferreras, Antonio; Cuenca, Nicolas; Pinilla, Isabel',
     'Title': 'Optical Coherence Tomography Angiography in Diabetic Patients: A Systematic Review',
     'Source': 'biomedicines',
     'Year': 2022,
     'Link': 'Not found',
     'Abstract': 'Background: Diabetic retinopathy (DR) is the leading cause of legal blindness in the working population in developed countries. Optical coherence tomography (OCT) angiography (OCTA) has risen as an essential tool in the diagnosis and control of diabetic pat

In [97]:
#retinopathy detection using neural network

In [98]:
query_body_2 = {
  "size":1000,
  "query": {
    "match": {
      "Abstract.nostem": {
        "query":"retinopathy detection using neural network"
      }
    }
  }
}

In [99]:
es.search(index="docs_slr_customized_mapping", body=query_body_2)

{'took': 8,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

### Index 3 : docs_slr_old_mapping

In [100]:
#datasets sources AND retinopathy detection

In [102]:
query_body = {
  "size":1000,
  "query": {
    "match": {
      "Title": {
        "query":"datasets sources AND retinopathy detection"
      }
    }
  }
}

In [104]:
es.search(index="docs_slr_old_mapping", body=query_body)

{'took': 264,
 'timed_out': False,
 '_shards': {'total': 2, 'successful': 2, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1643, 'relation': 'eq'},
  'max_score': 8.679737,
  'hits': [{'_index': 'docs_slr_old_mapping',
    '_type': '_doc',
    '_id': 'mpWkCIYBctSqsvwFmCq5',
    '_score': 8.679737,
    '_source': {'Document Type': 'Review',
     'Authors': 'Mateen, Muhammad; Wen, Junhao; Hassan, Mehdi; Nasrullah, Nasrullah; Sun, Song; Hayat, Shaukat',
     'Title': 'Automatic Detection of Diabetic Retinopathy: A Review on Datasets, Methods and Evaluation Metrics',
     'Source': 'ieee access',
     'Year': 2020,
     'Link': 'Not found',
     'Abstract': "Diabetic retinopathy (DR) is a fast-spreading disease across the globe, which is caused by diabetes. The DR may lead the diabetic patients to complete vision loss. In this scenario, early identification of DR is more essential to recover the eyesight and provide help for timely treatment. The detection of DR can be manually 

In [122]:
# Number of douments related to each authors combination

In [120]:
query_body_2 =  {
    "query" : {
        "match_all" : {}                        
    },
    "size": 0,
    "aggs": {
    "group_by_state": {
      "terms": {
        "field": "Authors.keyword"
      }
    }
  }
}

In [121]:
es.search(index="docs_slr_old_mapping", body=query_body_2)

{'took': 3,
 'timed_out': False,
 '_shards': {'total': 2, 'successful': 2, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2823, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'group_by_state': {'doc_count_error_upper_bound': 4,
   'sum_other_doc_count': 2730,
   'buckets': [{'key': 'Not Found', 'doc_count': 44},
    {'key': 'M. Z. Khan; Y. Lee', 'doc_count': 6},
    {'key': 'S. Mohammadian; A. Karsaz; Y. M. Roshan', 'doc_count': 4},
    {'key': 'Bhardwaj C., Jain S., Sood M.', 'doc_count': 3},
    {'key': 'J. Wang; Y. Bai; B. Xia', 'doc_count': 3},
    {'key': 'Khan A,Uddin S,Srinivasan U', 'doc_count': 3},
    {'key': 'Nakandala S,Kumar A,Papakonstantinou Y', 'doc_count': 3},
    {'key': 'Ren H,Wang J,Zhao WX', 'doc_count': 3},
    {'key': 'Wang Z,Lin J,Wang R,Zheng W', 'doc_count': 3},
    {'key': 'Watson D', 'doc_count': 3}]}}}

# Most frequent words

In [123]:
query_body = {
    "size": 0,
    "aggs" : {
        "frequent_words" : {
            "terms" : { "field" : "Abstract.stem_noshingle" }
        }
    }
}

In [124]:
es.search(index="docs_slr_customized_mapping", body=query_body)

{'took': 325,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2823, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'frequent_words': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 238581,
   'buckets': [{'key': 'The', 'doc_count': 2059},
    {'key': 'learn', 'doc_count': 1765},
    {'key': 'imag', 'doc_count': 1630},
    {'key': 'diagnosi', 'doc_count': 1586},
    {'key': 'In', 'doc_count': 1575},
    {'key': 'model', 'doc_count': 1549},
    {'key': 'propos', 'doc_count': 1491},
    {'key': 'diabet', 'doc_count': 1478},
    {'key': 'base', 'doc_count': 1468},
    {'key': 'us', 'doc_count': 1419}]}}}