In [1]:
# Program, Python kullanarak elastik aramada her pdf dosyasının indeksini oluşturmaktır.

In [3]:
# Burada gerekli paketi içe aktardım
     #-> Dizin oluşturmak ve aramak için Elasticsearch
     #-> os ve glob dizini ayarlamak ve tüm pdf dosyaları dizinine erişim sağlamak için
     #-> PyPDF2, tüm pdf dosyalarını okumanın anahtarıdır
     #-> Ve Pandas, okunan metinden veri çerçevesi oluşturma paketidir.

In [4]:
from elasticsearch import Elasticsearch
import os
import glob
import PyPDF2
import pandas as pd

In [5]:
# karakter işlevi, okunan pdf dosyalarından veri çerçevesini oluşturur

In [6]:
os.chdir("./Books/")
files = glob.glob("*.*")

In [7]:
len(files)

12

In [8]:
for book in files:
    print(book)

ch1.pdf
data-mining-concepts-and-techniques-2nd-edition-impressao.pdf
Data-Mining.pdf
DSA_Book.pdf
NIC225296.pdf
Parallel computing.pdf
pyspark.pdf
s9449-building-a-distributed-gpu-dataframe-with-python_V2.pdf
s9577-rapids-the-platform-inside-and-out.pdf
The Confessions Of Saint Augustine.pdf
thebook.pdf
[Joel_Grus]_Data_Science_from_Scratch_First_Princ.pdf


In [9]:
# aşağıdaki işlev, okunan pdf dosyalarından veri çerçevesi oluşturur

In [10]:
def create_dataframe(files):
    this_loc = 1
    df = pd.DataFrame(columns=['Name',"Pages","Content"])
    
    for file in files:
        pdfFileObj = open(file,'rb')
        pdfreader = PyPDF2.PdfFileReader(pdfFileObj)
        n_pages = pdfreader.numPages
        this_doc = ''
        for i in range(n_pages):
            pageObj = pdfreader.getPage(i)
            this_doc += pageObj.extractText()
        df.loc[this_loc]= file, pdfreader.getNumPages(),this_doc
        this_loc = this_loc + 1
    return df

In [11]:
df = create_dataframe(files)



In [12]:
df.head()

Unnamed: 0,Name,Pages,Content
1,ch1.pdf,19,Chapter1\n\nDataMining\n\nInthisintoductorycha...
2,data-mining-concepts-and-techniques-2nd-editio...,770,TheMorganKaufmannSeriesinDataManagementSystems...
3,Data-Mining.pdf,24,Computer Science\nAbout the Book\n˜ is textboo...
4,DSA_Book.pdf,462,"SANJIVRANJANDAS\nDATASCIENCE:\nTHEORIES,\nMODE..."
5,NIC225296.pdf,830,PublicationSeriesoftheJohnvonNeumannInstitutef...


In [13]:
# Elasticsearch nesnesi oluşturma

In [14]:
es = Elasticsearch()

In [15]:
col_names=df.columns

In [16]:
# Veri çerçevesinde yineleyerek her dosya için dizin ekleme

In [17]:
for row_number in range(df.shape[0]):
    body = dict([(name, str(df.iloc[row_number][name])) for name in col_names])
    es.index(index="elsbooktrial",doc_type="books",body=body)

In [18]:
search_rslt = es.search(index="elsbooktrial",body={"_source":["Name","Pages"],
                                                  "query":{
                                                      "match_phrase":{"Content":"Computing"}
                                                  }})

In [19]:
search_rslt

{'took': 1549,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 14, 'relation': 'eq'},
  'max_score': 0.91169894,
  'hits': [{'_index': 'elsbooktrial',
    '_type': 'books',
    '_id': 'DkPl_3kBVMLjcXNPZoaI',
    '_score': 0.91169894,
    '_source': {'Pages': '186', 'Name': 'Parallel computing.pdf'}},
   {'_index': 'elsbooktrial',
    '_type': 'books',
    '_id': 'GUMaAXoBVMLjcXNPAoYT',
    '_score': 0.91169894,
    '_source': {'Pages': '186', 'Name': 'Parallel computing.pdf'}},
   {'_index': 'elsbooktrial',
    '_type': 'books',
    '_id': 'DUPl_3kBVMLjcXNPZoYJ',
    '_score': 0.82565004,
    '_source': {'Pages': '830', 'Name': 'NIC225296.pdf'}},
   {'_index': 'elsbooktrial',
    '_type': 'books',
    '_id': 'GEMaAXoBVMLjcXNPAYaQ',
    '_score': 0.82565004,
    '_source': {'Pages': '830', 'Name': 'NIC225296.pdf'}},
   {'_index': 'elsbooktrial',
    '_type': 'books',
    '_id': 'EkPl_3kBVMLjcXNPZ4YQ',
    '_score': 

In [35]:
from elasticsearch import Elasticsearch
from random import randint

In [36]:
class Book:
    def __init__(self, book_id: int, title: str, author: object, sub_title = None):
        self.book_id = book_id
        self.title = title,
        self.sub_title = sub_title
        self.author = author
        self.raw_text = None
        self.sentence_delimiter = '.'
        self.paragraph_delimiter = '\n\n\n'
        self.paragraphs = None
        self.indexed_paragraphs = []

    def load_raw_text(self):
        with open('The Confessions Of Saint Augustine.txt'.format(book=self.book_id)) as f:
            self.raw_text = f.read()

    def split_text_into_paragraphs(self):
        self.paragraphs = self.raw_text.split(self.paragraph_delimiter)
        self.raw_text = None

    def index_paragraphs(self):
        p_counter = 1
        for paragraph in self.paragraphs:
            self.indexed_paragraphs.append({"index": p_counter, "paragraph": paragraph})
            p_counter += 1
        self.paragraphs = None

    def split_paragraphs_into_sentences(self):
        s_counter = 1
        for paragraph in self.indexed_paragraphs:
            sentences = paragraph["paragraph"].split(self.sentence_delimiter)
            for sentence in sentences:
                s_counter += 1
                elastic_book_packet = self.create_data_packet(paragraph, s_counter, sentence)
            yield elastic_book_packet
        self.indexed_paragraphs = None

    def create_data_packet(self, paragraph, s_counter, sentence):
        return {"book_id": self.book_id,
                "author_id": self.author.author_id,
                "category": self.author.category,
                "chapter_id": 0,
                "paragraph": paragraph["index"],
                "sentence_id": s_counter,
                "sentence_text": sentence.replace('\n', '')}

In [37]:
class Author:
    def __init__(self, first_name: str, last_name: str, category: str, middle_name = None):
        self.first_name = first_name
        self.last_name = last_name
        self.middle_name = middle_name
        self.category = category
        self.author_id = randint(1,10000)

In [38]:
class ElasticSink:
    def __init__(self):
        try:
            self.client = Elasticsearch()
        except Exception as e:
            print('Sorry, problem trying to create Elasticsearch client.')
            exit(1)

    def index_document(self, data_packet: dict, index='books', doc_type='sentence'):
        unique_index_id = '{book_id}_{sentence_id}'.format(book_id=data_packet["book_id"],
                                                           sentence_id=data_packet["sentence_id"])
        try:
            response = self.client.index(index=index,
                                     doc_type=doc_type,
                                     body=data_packet,
                                     id=unique_index_id)
            print(response)
        except Exception as e:
            print(f'Something went wrong and I could not index.. {data_packet}')

    def search_for_word_match(self, word: str, index: str, field: str):
        result = self.client.search(index=index,body={'query':{'match':{field:word}}})
        for hit in result["hits"]:
            print(hit)

    def search_and_filter(self, index: str, field: str, word: str, author_id: str):
        result = self.client.search(index=index,
                                    body={
                                      "query": {
                                             "bool" : {
                                                  "must" : [{"term" : {field : word}},],
                                                  "filter": [{"term" : {"author_id" : author_id}}]
                                                  }
                                      }
                                    }

                                )
        for hit in result["hits"]:
            print(hit)

In [39]:
if __name__ == '__main__':
    a = Author(first_name='St.',
               last_name='Augustine',
               category='Early Church Father')
    b = Book(book_id=3296,
             title='The Confessions Of Saint Augustine',
             author=a)
    b.load_raw_text()
    b.split_text_into_paragraphs()
    b.index_paragraphs()
    packets = b.split_paragraphs_into_sentences()
    es = ElasticSink()
    for packet in packets:
        es.index_document(packet)
    es.search_for_word_match(word='faith',
                             index='books',
                             field='sentence_text')
    es.search_and_filter(word='faith',
                             index='books',
                             field='sentence_text',
                             author_id=1168)

FileNotFoundError: [Errno 2] No such file or directory: 'The Confessions Of Saint Augustine.txt'