In [None]:
#!/usr/bin/env python3 
# -*- coding: utf-8 -*-

# Standard
import traceback
from pathlib import Path

# 3rd-Party
import fitz
import pandas as pd

from libgen_api import LibgenSearch
from loguru import logger
from IPython.display import clear_output
from rich.console import Console
from rich.table import Table


In [None]:
# Shared rich console; `scrape_books` prints the search-results table through it.
# `color_system='auto'` leaves the choice of color system to rich.
console = Console(color_system = 'auto')

def create_libgen_table() -> Table:
    """Build an empty rich table for presenting LibGen search hits.

    Returns:
        Table: A table titled "Libgen Search Results" with three centered,
            cyan columns: "Index Number", "Book Name" and "Author".
    """
    results_table = Table(title="Libgen Search Results", title_justify = "center")
    # Every column shares the same alignment and style, so add them in a loop.
    for heading in ("Index Number", "Book Name", "Author"):
        results_table.add_column(heading, justify="center", style="cyan")
    return results_table


In [None]:
# Column layout of the results frame: these columns always come first (and are
# the only columns of the empty frame returned when nothing was selected).
_LIBGEN_COLUMNS = [
    'ID', 'Author', 'Title', 'Publisher', 'Year', 'Pages', 'Language', 'Size',
    'Extension', 'Mirror_1', 'Mirror_2', 'Mirror_3', 'Mirror_4', 'Mirror_5', 'Edit'
]


def _prompt_min_length(label: str, min_length: int = 3) -> str:
    """Prompt for `label` until the answer has at least `min_length` characters."""
    answer = input(f'{label}:')
    while len(answer) < min_length:
        logger.error(f"[!] - {label} must be >= {min_length} Characters!")
        answer = input(f'{label}:')
    return answer


def _selected_to_frame(selected: list) -> pd.DataFrame:
    """Build the results frame from the chosen records in a single pass."""
    if not selected:
        return pd.DataFrame(columns=_LIBGEN_COLUMNS)
    frame = pd.DataFrame(selected)
    # Keep the standard columns first; any extra keys a record carries follow.
    extra_columns = [col for col in frame.columns if col not in _LIBGEN_COLUMNS]
    return frame.reindex(columns=_LIBGEN_COLUMNS + extra_columns)


@logger.catch
def scrape_books(exact_match: bool = True) -> pd.DataFrame:
    """Search LibGen books / articles interactively and collect the chosen hits.

    The user repeatedly picks a search mode (author, title, both, exit), enters
    the query, is shown the hits in a table and may keep one hit per search.

    Args:
        exact_match (bool, optional): Forwarded to the filtered title search
            that the 'both' mode uses. Defaults to True.

    Returns:
        pd.DataFrame: One row per chosen book/article. If an unexpected error
            occurs, it is logged and the books chosen so far are returned.
    """
    s = LibgenSearch()
    # Chosen records are accumulated in a list and converted once at the end;
    # `DataFrame.append` no longer exists in pandas >= 2.0.
    selected = []

    try:
        while True:
            # Check for whitelisted commands
            search_input = input(
                'How do you want to search? (author, title, both, exit)'
            ).strip().lower()

            # Invalid commands
            if search_input not in ('author', 'title', 'both', 'exit'):
                logger.error("[!] - Not Valid Search Input!")
                continue

            # Quit Program
            if search_input == 'exit':
                logger.info("[+] - Halting Program Execution!")
                break

            # Standalone Author Search
            if search_input == 'author':
                logger.info("[+] - Author Search Selected!")
                results = s.search_author(_prompt_min_length('Author Name'))

            # Standalone Title Search
            elif search_input == 'title':
                logger.info("[+] - Title Search Selected!")
                results = s.search_title(_prompt_min_length('Title Name'))

            # Joint Author & Title Search
            else:
                logger.info("[+] - Author & Title Search Selected!")
                author = _prompt_min_length('Author Name')
                title = _prompt_min_length('Title Name')
                filters = {
                    'Author': author,
                    "Extension": "pdf"
                }
                results = s.search_title_filtered(title, filters, exact_match)

            # Nothing to choose from: go straight back to the search prompt
            if not results:
                logger.info("[+] - No Books Found Using the Provided Search Query!")
                continue

            # Display Search Results
            table = create_libgen_table()
            for index, result in enumerate(results):
                table.add_row(str(index), result['Title'], result['Author'])
            console.print(table)

            # Pick a specific book from the results
            while True:
                try:
                    selection = int(input(
                        'Enter index number of version you want to keep (-1 to select none)'
                    ))
                except ValueError as e:
                    logger.error(f"[!] - Error With Book Selection: {e}")
                    continue

                if selection == -1:
                    logger.info("[+] - No books were chosen from the table provided!")
                    break

                if 0 <= selection < len(results):
                    logger.info(f"[+] - Book '{results[selection]['Title']}' was chosen!")
                    selected.append(results[selection])
                    break

                # Tell the user why they are being asked again
                logger.error(
                    f"[!] - Error With Book Selection: index {selection} is out of range "
                    f"(valid: 0 to {len(results) - 1}, or -1 to select none)"
                )

            # Check to see if more books should be added
            end_case = input("Continue Finding Books? ('y/n')").strip().lower()
            while end_case not in ('y', 'n'):
                # Normalise the retry answer too, so 'Y' / 'N' are accepted
                end_case = input('Please type a valid response (y/n) ').strip().lower()

            if end_case == 'n':
                logger.info("[+] - Stop Book Searching Process!")
                break

            clear_output()
            logger.info("[+] - Continue Book Searching Process!")

    # Capture a general exception and still return what was collected so far
    except Exception:
        logger.error(f"[!] - General Exception: {traceback.format_exc()}")

    return _selected_to_frame(selected)

In [None]:
def _create_book_df(filepath: str) -> pd.DataFrame:
    """Extract the text of a PDF into a dataframe (assuming it is a book).

    Args:
        filepath (str): Path to the PDF file.

    Returns:
        pd.DataFrame: One row per page that has non-empty text, with the
            columns 'Page Number' (1-based) and 'Page Text'.

    Raises:
        Exception: Any error raised while opening or reading the file is
            propagated so the caller can decide how to handle it.
    """
    with fitz.open(filepath) as doc:
        page_texts = []
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text()  # extract once per page
            if text not in ("", None):
                page_texts.append((page_num, text))
    return pd.DataFrame(page_texts, columns=['Page Number', 'Page Text'])


def scrape_book_text(pdf_list, df, index_list, pdf_dir='book_pdf'):
    """Store the full text of each PDF in the 'Title_text' column of `df`.

    `pdf_list` and `index_list` are paired by position: the text extracted from
    `<pdf_dir>/<pdf_list[i]>.pdf` is written to the row of `df` whose index
    label is `index_list[i]`. A PDF that cannot be read, or a row label that is
    not in `df`, is logged and skipped so the remaining books are still processed.

    Args:
        pdf_list: File names of the PDFs, without the '.pdf' extension.
        df (pd.DataFrame): Frame to update; it is modified in place.
        index_list: Row labels of `df`, one per entry of `pdf_list`.
        pdf_dir (str, optional): Directory holding the PDFs. Defaults to 'book_pdf'.

    Returns:
        pd.DataFrame: The same `df` object, with 'Title_text' filled in.

    Raises:
        IndexError: If `index_list` has fewer entries than `pdf_list`.
    """
    # Fail before touching `df` instead of part-way through the loop.
    if len(index_list) < len(pdf_list):
        raise IndexError(
            f"index_list has {len(index_list)} entries but pdf_list has {len(pdf_list)}"
        )

    if len(pdf_list) == 0:
        return df

    # The frame built by the scraper has no text column yet; create it once.
    if 'Title_text' not in df.columns:
        df['Title_text'] = None

    for pdf_name, row_label in zip(pdf_list, index_list):
        if row_label not in df.index:
            logger.error(f"[!] - Row {row_label!r} not found in dataframe; skipping '{pdf_name}'")
            continue

        filepath = f'{pdf_dir}/{pdf_name}.pdf'
        try:
            book = _create_book_df(filepath)
        except Exception as e:
            logger.error(f"[!] - Could not extract text from '{filepath}': {e}")
            continue

        if book.empty:
            logger.warning(f"[!] - No extractable text found in '{filepath}'")

        # Single `.loc` assignment: writes to `df` itself, never to a temporary copy.
        df.loc[row_label, 'Title_text'] = ''.join(book['Page Text'])

    return df

In [None]:
# Run the interactive LibGen scraper and keep the chosen books in `df`.
# `exact_match=False` is forwarded to the filtered title search used by the 'both' mode.
df = scrape_books(exact_match=False)

In [None]:
# Display the dataframe of books collected by `scrape_books()`
df

In [None]:
# Use these lists if you are adding more than one book.
# Each entry is a PDF file name without the `.pdf` extension.
pdf_list = [
    'Culture and imperialism by Edward W. Said (z-lib.org)',
    'Black Skin, White Masks by Frantz Fanon (z-lib.org)',
    'The location of culture by Homi K. Bhabha (z-lib.org)',
    'Black Athena The Afroasiatic Roots of Classical Civilization The Linguistic Evidence, Vol. 3 by Martin Bernal (z-lib.org)',
    'The World, the Text, and the Critic by Edward W. Said (z-lib.org)',
    'Bruce Robbins - Perpetual War_ Cosmopolitanism from the Viewpoint of Violence-Duke University Press (2012)',
]
# Row of `df` that receives each book's text, paired with `pdf_list` by position.
index_list = [0, 3, 4, 5, 6, 7]

In [None]:
# Extract the text of every PDF in `pdf_list` and store it in the `Title_text`
# column of `df`, at the row given by the matching `index_list` entry.
df = scrape_book_text(pdf_list, df, index_list)

In [None]:
# Display the dataframe returned by `scrape_book_text()`
df

In [None]:
# Persist the collected books. `DataFrame.to_csv` does not create missing
# directories, so make sure the output folder exists before writing.
output_path = Path('book_dataframes/post_colonial_books.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)