# Synthesize a research paper
## Workflow
+ Define domain<br>
+ Build corpus<br>
+ Create embeddings model<br>
+ Perform WhiteSpace Analysis<br>
+ Identify best area for analysis<br>
+ Suggest topics for research<br>
+ Synthesize research paper content on selected topic

## Import Libraries

In [1]:
import os
import time

from arxiv import SortCriterion, SortOrder, Client, Search
import csv

import openai

import pdfplumber
import pytesseract

# Inputs

In [2]:
# Define domain
domain = "petroleum engineering"

In [3]:
# Model definitions
# openAI key
openai.api_key = ""
# for azure openAI
# your endpoint should look like the following https://YOUR_RESOURCE_NAME.openai.azure.com/
openai.api_base =  "" 
openai.api_version = '2022-12-01' # this may change in the future
openai.api_type = 'azure'
#This will correspond to the custom name you chose for your deployment when you deployed a model. 
deployment_name='' 

# misc definitions
corpus_directory = "./corpus"
max_papers = 100
train_filename = "train.txt"

## Helper functions

In [4]:
# create a function to generate key words for research in the defined domain
def generate_research_keywords_for_domain(domain:str)->str:
    prompt = "Create comma seperated key words for research queries in the "+domain+" domain:"
    response = openai.Completion.create(
            engine=deployment_name,
            prompt=prompt,
            temperature=0.20,
            max_tokens=500
        )
    result = response.choices[0].text.strip()
    return result

def string_to_list(text:str)->str:
    """
    Converts a string of comma-separated values into a list of individual elements.
    """
    # Split the string into individual values using the comma separator
    values = text.split(',')
    
    # Remove any whitespace from the values
    values = [value.strip().replace(".", "") for value in values]
    
    # Return the list of values
    return values

def create_directory_if_not_exists(path:str):
    """
    Creates a directory at the specified path if it does not already exist.
    """
    if not os.path.exists(path):
        os.makedirs(path)
    return

# create a function to save papers by removing special characters from the file name
def clean_title(title:str):
    title = title.replace(" ", "_")
    title = title.replace("\\", "")
    title = title.replace("/", "")
    return title

def file_exists(path:str):
    """
    Checks if a file exists at the specified path.
    Returns True if the file exists, False otherwise.
    """
    return os.path.exists(path) and os.path.isfile(path)

def get_typed_files_in_directory(directory:str, file_type:str) -> list:
    """
    Returns a list of all files in the specified directory.
    """
    # Initialize an empty list to hold the PDF files
    typed_files = []

    # Loop over all files in the directory
    for filename in os.listdir(directory):
        # Check if the file has a .pdf extension
        if filename.lower().endswith(f".{file_type.lower()}"):
            # If it's a PDF file, add the full path to the list
            typed_files.append(os.path.join(directory, filename))

    # Return the list of PDF files
    return typed_files

def write_or_append_file(filepath, text):
    """
    Writes the specified text to a file, creating the file if it does not exist,
    or appends the text to the end of the file if it already exists.
    """
    mode = "a" if os.path.exists(filepath) else "w"
    with open(filepath, mode) as f:
        f.write(text)
        
def extract_text_from_pdf(filepath:str)->str:
    """
    Extracts text from a PDF file, using OCR if the PDF contains images.
    """
    # Open the PDF file using pdfplumber
    with pdfplumber.open(filepath) as pdf:
        # Initialize an empty string to hold the extracted text
        text = ""

        # Loop over all pages in the PDF
        for page in pdf.pages:
            # Extract the page text using pdfplumber
            page_text = page.extract_text()

            # If the page text is None, the page may contain images, so use OCR to extract the text
            if page_text is None:
                # Convert the page image to grayscale and apply thresholding to remove noise
                page_image = page.to_image(resolution=150)
                page_image = page_image.convert('L')
                page_image = page_image.point(lambda x: 0 if x < 180 else 255, '1')

                # Use pytesseract to extract the text from the image
                page_text = pytesseract.image_to_string(page_image)

            # Add the page text to the overall text
            text += page_text

    # Return the extracted text
    return text

## Build corpus
Here we will build a corpus of documents from ARXIV papers<br>
To search papers, generate a list of keywords of research areas in the defined domain

In [5]:
keywords = generate_research_keywords_for_domain(domain)
keywords = string_to_list(keywords)

In [6]:
keywords

['petroleum engineering',
 'oil exploration',
 'drilling',
 'reservoir engineering',
 'production engineering',
 'well completion',
 'petroleum geology',
 'petroleum economics',
 'petroleum refining',
 'natural gas engineering',
 'offshore engineering',
 'pipeline engineering']

In [7]:
# download papers in each research area
create_directory_if_not_exists(corpus_directory)
for searchterm in keywords:
    # create arxiv client
    big_slow_client = Client(
      page_size = 1000,
      delay_seconds = 10,
      num_retries = 5
    )
    for result in big_slow_client.results(Search(query=searchterm, max_results = max_papers, sort_by = SortCriterion.Relevance, sort_order = SortOrder.Ascending)):
        filename = clean_title(result.title)+".pdf"
        filepath = os.path.join(corpus_directory, filename)
        try:
            if file_exists(filepath):
                print("downloaded paper: "+filepath)
            else:
                print("downloading paper: "+filepath)
                result.download_pdf(dirpath=corpus_directory, filename=filename)
                time.sleep(2)
        except:
            print("Error downloading paper: "+filename)
            time.sleep(10)
            continue

downloaded paper: ./corpus/Optimal_Economic_Operation_of_Liquid_Petroleum_Products_Pipeline_Systems.pdf
downloaded paper: ./corpus/Phase_behavior_in_petroleum_fluids_(A_Detailed_Descriptive_and_Illustrative_Account).pdf
downloaded paper: ./corpus/On_the_Determination_of_the_Solar_Rotation_Elements_i,_Ω_and_Period_using_Sunspot_Observations_by_Ruđer_Bošković_in_1777.pdf
downloaded paper: ./corpus/On_gray-box_modeling_for_virtual_flow_metering.pdf
downloaded paper: ./corpus/Reaction_Mechanisms_in_Petroleum:_From_Experimentation_to_Upgrading_and_Geological_Conditions.pdf
downloaded paper: ./corpus/Simulation_of_incompressible_two-phase_flow_in_porous_media_with_large_timesteps.pdf
downloaded paper: ./corpus/Multi-task_learning_for_virtual_flow_metering.pdf
downloaded paper: ./corpus/Forecasting_the_production_of_Distillate_Fuel_Oil_Refinery_and_Propane_Blender_net_production_by_using_Time_Series_Algorithms.pdf
downloaded paper: ./corpus/A_Decision_Support_System_for_Multi-target_Geosteeri

downloaded paper: ./corpus/Application_of_Probabilistic_Graphical_Models_in_Forecasting_Crude_Oil_Price.pdf
downloaded paper: ./corpus/Experimental_evaluation_of_a_silicone_oil_as_an_oxidation_inhibitor_for_magnesium_alloy_under_contact_sliding_at_elevated_temperatures.pdf
downloaded paper: ./corpus/Exploring_the_use_of_Transition_Path_Theory_in_building_an_oil_spill_prediction_scheme.pdf
downloaded paper: ./corpus/Exploration_of_Spanish_Olive_Oil_Quality_with_a_Miniaturized_Low-Cost_Fluorescence_Sensor_and_Machine_Learning_Techniques.pdf
downloaded paper: ./corpus/Olive_Oil_is_Made_of_Olives,_Baby_Oil_is_Made_for_Babies:_Interpreting_Noun_Compounds_using_Paraphrases_in_a_Neural_Model.pdf
downloaded paper: ./corpus/Stability_of_additive-free_water-in-oil_emulsions.pdf
downloaded paper: ./corpus/Pickering_emulsions_with_alpha-cyclodextrin_inclusions:_Structure_and_thermal_stability.pdf
downloaded paper: ./corpus/Physico-chemical_properties_extraction_from_the_fluorescence_spectrum_with_

downloaded paper: ./corpus/Forecasting_the_abnormal_events_at_well_drilling_with_machine_learning.pdf
downloaded paper: ./corpus/Violent_music_vs_violence_and_music:_Drill_rap_and_violent_crime_in_London.pdf
downloaded paper: ./corpus/DREAMS:_Drilling_and_Extraction_Automated_System.pdf
downloaded paper: ./corpus/On_the_characterization_of_drilling_rotation_in_the_6-parameter_resultant_shell_theory.pdf
downloaded paper: ./corpus/Effects_of_Beam_Size_and_Pulse_Duration_on_the_Laser_Drilling_Process.pdf
downloaded paper: ./corpus/Robot-Assisted_Drilling_on_Curved_Surfaces_with_Haptic_Guidance_under_Adaptive_Admittance_Control.pdf
downloaded paper: ./corpus/Detecting_Drill_Failure_in_the_Small_Short-sound_Drill_Dataset.pdf
downloaded paper: ./corpus/A_force-sensing_surgical_drill_for_real-time_force_feedback_in_robotic_mastoidectomy.pdf
downloaded paper: ./corpus/Interactive_Data_Exploration_with_Smart_Drill-Down.pdf
downloaded paper: ./corpus/Application_of_Machine_Learning_to_accidents_

downloaded paper: ./corpus/Maximal_Steered_Coherence_Protection_by_Quantum_Reservoir_Engineering.pdf
downloaded paper: ./corpus/Extracting_work_from_random_collisions:_A_model_of_a_quantum_heat_engine.pdf
downloaded paper: ./corpus/Designing_reservoirs_for_1t_decoherence_of_a_qubit.pdf
downloaded paper: ./corpus/Carnot's_theorem_for_nonthermal_stationary_reservoirs.pdf
downloaded paper: ./corpus/A_Micrometer-sized_Heat_Engine_Operating_Between_Bacterial_Reservoirs.pdf
downloaded paper: ./corpus/Efficiency_of_heat_engines_coupled_to_nonequilibrium_reservoirs.pdf
downloaded paper: ./corpus/Optimization_performance_of_quantum_Otto_heat_engines_and_refrigerators_with_squeezed_thermal_reservoirs.pdf
downloaded paper: ./corpus/Quantitative_supply_security_related_significance_measures_for_gas_reservoires.pdf
downloaded paper: ./corpus/Electromagnetically_Induced_Transparency_and_Quantum_Heat_Engines.pdf
downloaded paper: ./corpus/The_thermodynamics_governing_'endoreversible'_engines.pdf
down

downloaded paper: ./corpus/A_Business_Maturity_Model_of_Software_Product_Line_Engineering.pdf
downloaded paper: ./corpus/An_Architecture_Process_Maturity_Model_of_Software_Product_Line_Engineering.pdf
downloaded paper: ./corpus/Variability_and_Evolution_in_Systems_of_Systems.pdf
downloaded paper: ./corpus/Towards_a_Systems_Engineering_based_Automotive_Product_Engineering_Process.pdf
downloaded paper: ./corpus/Towards_a_Systems_Engineering_Essence.pdf
downloaded paper: ./corpus/Mind_the_Gap:_On_the_Relationship_Between_Automatically_Measured_and_Self-Reported_Productivity.pdf
downloaded paper: ./corpus/Resolving_code_smells_in_software_product_line_using_refactoring_and_reverse_engineering.pdf
downloaded paper: ./corpus/Fault-Tolerant_Dot-Product_Engines.pdf
downloaded paper: ./corpus/Data_Engineering_for_the_Analysis_of_Semiconductor_Manufacturing_Data.pdf
downloaded paper: ./corpus/Optimization_analysis_of_an_endoreversible_quantum_heat_engine_with_efficient_power_function.pdf
downloa

downloaded paper: ./corpus/Completions_of_Countable_Excellent_Domains_and_Countable_Noncatenary_Domains.pdf
downloaded paper: ./corpus/Generalized_existential_completions_and_their_regular_and_exact_completions.pdf
downloaded paper: ./corpus/Uniqueness_of_Instantaneously_Complete_Ricci_flows.pdf
downloaded paper: ./corpus/A_note_on_the_statistical_view_of_matrix_completion.pdf
downloaded paper: ./corpus/A_completeness_result_for_implicit_justification_stit_logic.pdf
downloaded paper: ./corpus/On_well-dominated_direct,_Cartesian_and_strong_product_graphs.pdf
downloaded paper: ./corpus/Dedekind_complete_and_order_continuous_Banach_$C(K)$-modules.pdf
downloaded paper: ./corpus/Smooth_prime_Fano_complete_intersections_in_toric_varieties.pdf
downloaded paper: ./corpus/Profinite_completions_and_MacNeille_completions_of_MV-algebras.pdf
downloaded paper: ./corpus/On_Stoltenberg's_quasi-uniform_completion.pdf
downloaded paper: ./corpus/Action_Completion:_A_Temporal_Model_for_Moment_Detection.pd

downloaded paper: ./corpus/Dynamic_and_Multi-functional_Labeling_Schemes.pdf
downloaded paper: ./corpus/Disciplined_Geometric_Programming.pdf
downloaded paper: ./corpus/Random_input_helps_searching_predecessors.pdf
downloaded paper: ./corpus/Large_gaps_between_primes.pdf
downloaded paper: ./corpus/Hyperbolicity_for_log_smooth_families_with_maximal_variation.pdf
downloaded paper: ./corpus/The_Log_Product_Formula.pdf
downloading paper: ./corpus/Higher_order_corrections_for_anisotropic_bootstrap_percolation.pdf
downloading paper: ./corpus/Near-Optimal_(Euclidean)_Metric_Compression.pdf
downloaded paper: ./corpus/Improved_Parallel_Construction_of_Wavelet_Trees_and_RankSelect_Structures.pdf
downloaded paper: ./corpus/On_the_Profile_of_Multiplicities_of_Complete_Subgraphs.pdf
downloaded paper: ./corpus/HyperLogLogLog:_Cardinality_Estimation_With_One_Log_More.pdf
downloaded paper: ./corpus/Anonymization_of_System_Logs_for_Privacy_and_Storage_Benefits.pdf
downloaded paper: ./corpus/Stringy_inv

downloading paper: ./corpus/FLAIM:_A_Multi-level_Anonymization_Framework_for_Computer_and_Network_Logs.pdf
Error downloading paper: FLAIM:_A_Multi-level_Anonymization_Framework_for_Computer_and_Network_Logs.pdf
downloading paper: ./corpus/Heights_and_metrics_with_logarithmic_singularities.pdf
downloading paper: ./corpus/Fast_and_Compact_Prefix_Codes.pdf
Error downloading paper: Fast_and_Compact_Prefix_Codes.pdf
downloading paper: ./corpus/A_strong_log-concavity_property_for_measures_on_Boolean_algebras.pdf
downloading paper: ./corpus/Mixed_f-divergence_and_inequalities_for_log_concave_functions.pdf
downloading paper: ./corpus/Log-optimal_portfolio_without_NFLVR:_existence,_complete_characterization,_and_duality.pdf
downloading paper: ./corpus/On_the_Liouville_function_in_short_intervals.pdf
downloading paper: ./corpus/Diagnosing_Distributed_Systems_through_Log_Data_Analysis.pdf
downloading paper: ./corpus/Improved_log-concavity_for_rotationally_invariant_measures_of_symmetric_convex_se

downloaded paper: ./corpus/New_Indivisible_Geoscience_Paradigm.pdf
downloaded paper: ./corpus/Is_the_dream_solution_to_the_continuum_hypothesis_attainable?.pdf
downloaded paper: ./corpus/Distributed_computing_of_Seismic_Imaging_Algorithms.pdf
downloaded paper: ./corpus/A_~60_Myr_periodicity_is_common_to_marine-87Sr86Sr,_fossil_biodiversity,_and_large-scale_sedimentation:_what_does_the_periodicity_reflect?.pdf
downloading paper: ./corpus/The_arrow_of_time_and_the_nature_of_spacetime.pdf
downloaded paper: ./corpus/Granular_Shear_Flow_in_Varying_Gravitational_Environments.pdf
downloaded paper: ./corpus/A_New_Basis_of_Geoscience:_Whole-Earth_Decompression_Dynamics.pdf
downloaded paper: ./corpus/Amplification_of_seismic_ground_motion_in_the_Tunis_basin:_Numerical_BEM_simulations_vs_experimental_evidences.pdf
downloaded paper: ./corpus/Modeling_complex_spatial_dynamics_of_two-population_interaction_in_urbanization_process.pdf
downloaded paper: ./corpus/Inferring_network_topology_via_the_prop

downloading paper: ./corpus/Seismic_performance_of_an_infilled_moment-resisting_steel_frame_during_the_2016_Central_Italy_Earthquake.pdf
downloading paper: ./corpus/Subsurface_Depths_Structure_Maps_Reconstruction_with_Generative_Adversarial_Networks.pdf
downloading paper: ./corpus/De-risking_geological_carbon_storage_from_high_resolution_time-lapse_seismic_to_explainable_leakage_detection.pdf
downloading paper: ./corpus/Limits_of_the_seismogenic_zone_in_the_epicentral_region_of_the_26_December_2004_great_Sumatra-Andaman_earthquake:_Results_from_seismic_refraction_and_wide-angle_reflection_surveys_and_thermal_modeling.pdf
downloading paper: ./corpus/The_CoRoT-GES_Collaboration._Improving_Red_Giants_spectroscopic_surface_gravity_and_abundances_with_asteroseismology.pdf
downloading paper: ./corpus/Energy-Efficient_mm-Wave_Backhauling_via_Frame_Aggregation_in_Wide_Area_Networks.pdf
downloading paper: ./corpus/Seismic_Facies_Analysis:_A_Deep_Domain_Adaptation_Approach.pdf
downloading paper:

downloading paper: ./corpus/Reducing_US_Biofuels_Requirements_Mitigates_Short-term_Impacts_of_Global_Population_and_Income_Growth_on_Agricultural_Environmental_Outcomes.pdf
downloaded paper: ./corpus/Predicting_crashes_in_oil_prices_during_the_COVID-19_pandemic_with_mixed_causal-noncausal_models.pdf
downloaded paper: ./corpus/A_mathematical_model_for_planning_oil_products_distribution_via_pipeline.pdf
downloaded paper: ./corpus/Well_Cement_Degradation_and_Wellbore_Integrity_in_Geological_CO2_Storages:_A_Literature_Review.pdf
downloaded paper: ./corpus/Machine_Learning-Based_Soft_Sensors_for_Vacuum_Distillation_Unit.pdf
downloaded paper: ./corpus/The_Non-Organic_Theory_of_the_Genesis_of_Petroleum.pdf
downloaded paper: ./corpus/Impact_of_Recent_Discoveries_on_Petroleum_and_Natural_Gas_Exploration:_Emphasis_on_India.pdf
downloaded paper: ./corpus/Petroleum_prices_prediction_using_data_mining_techniques_--_A_Review.pdf
downloaded paper: ./corpus/Modeling_Electrical_Daily_Demand_in_Presence

Error downloading paper: A_global_economic_policy_uncertainty_index_from_principal_component_analysis.pdf
downloaded paper: ./corpus/Eco_-_No(?)_-_Physics_-_comments_and_reflexions_-.pdf
downloaded paper: ./corpus/Heisenberg_uncertainty_principle_and_economic_analogues_of_basic_physical_quantities.pdf
downloaded paper: ./corpus/The_Effects_of_Taxes_on_Wealth_Inequality_in_Artificial_Chemistry_Models_of_Economic_Activity.pdf
downloaded paper: ./corpus/Eurasian_Economic_Union:_Current_Concept_and_Prospects.pdf
downloaded paper: ./corpus/Management_of_Social_and_Economic_Development_of_Municipalities.pdf
downloaded paper: ./corpus/Infinite_Growth:_A_Curse_or_a_Blessing?.pdf
downloaded paper: ./corpus/Knowledge_Management_in_Management_of_Social_and_Economic_Development_of_Municipalities:_Highlights.pdf
downloaded paper: ./corpus/Human_Reliability_Analysis_for_Oil_and_Gas_Operations:_Analysis_of_Existing_Methods.pdf
downloaded paper: ./corpus/Long-Term_Optimal_Delivery_Planning_for_Replaci

Error downloading paper: Three-dimensional_sand_ripples_as_the_product_of_vortex_instability.pdf
downloaded paper: ./corpus/Iterative_interferometry-based_method_for_picking_microseismic_events.pdf
downloaded paper: ./corpus/Critical_bursts_in_filtration.pdf
downloaded paper: ./corpus/Night_sky_brightness_above_Zagreb_2012.-2017.pdf
downloaded paper: ./corpus/Should_Engineers_be_Concerned_about_Vulnerability_of_Highway_Bridges_to_Potentially-Induced_Seismic_Hazards?.pdf
downloaded paper: ./corpus/Support_Vector_Machine_Application_for_Multiphase_Flow_Pattern_Prediction.pdf
downloaded paper: ./corpus/Asphaltene_aggregation_due_to_waterflooding_(A_molecular_dynamics_study).pdf
downloaded paper: ./corpus/Microfluidic_study_of_effects_of_flow_velocity_and_nutrient_concentration_on_biofilm_accumulation_and_adhesive_strength_in_a_microchannel.pdf
downloaded paper: ./corpus/Tensor-based_subspace_learning_for_tracking_salt-dome_boundaries.pdf
downloaded paper: ./corpus/Hydrocarbons_under_press

downloaded paper: ./corpus/The_Non-Organic_Theory_of_the_Genesis_of_Petroleum.pdf
downloaded paper: ./corpus/On_the_Determination_of_the_Solar_Rotation_Elements_i,_Ω_and_Period_using_Sunspot_Observations_by_Ruđer_Bošković_in_1777.pdf
downloaded paper: ./corpus/Distributed_computing_of_Seismic_Imaging_Algorithms.pdf
downloaded paper: ./corpus/Industrial_applications_of_digital_rock_technology.pdf
downloaded paper: ./corpus/Phase_behavior_in_petroleum_fluids_(A_Detailed_Descriptive_and_Illustrative_Account).pdf
downloaded paper: ./corpus/The_application_of_high-resolution_3D_seismic_data_to_model_the_distribution_of_mechanical_and_hydrogeological_properties_of_a_potential_host_rock_for_the_deep_storage_of_radioactive_waste_in_France.pdf
downloading paper: ./corpus/Multiresolution_Coupled_Vertical_Equilibrium_Model_for_Fast_Flexible_Simulation_of_CO$_2$_Storage.pdf
Error downloading paper: Multiresolution_Coupled_Vertical_Equilibrium_Model_for_Fast_Flexible_Simulation_of_CO$_2$_Storage.pd

downloaded paper: ./corpus/Robust_Oil-spill_Forensics_and_Petroleum_Source_Differentiation_using_Quantized_Peak_Topography_Maps.pdf
downloaded paper: ./corpus/How_to_use_Big_Data_technologies_to_optimize_operations_in_Upstream_Petroleum_Industry.pdf
downloaded paper: ./corpus/Phase_behavior_in_petroleum_fluids_(A_Detailed_Descriptive_and_Illustrative_Account).pdf
downloaded paper: ./corpus/Multi-task_learning_for_virtual_flow_metering.pdf
downloaded paper: ./corpus/Vortex_flow_in_the_technology_of_radiation_wave_cracking_(RWC).pdf
downloaded paper: ./corpus/User_Association_in_User-Centric_Hybrid_VLCRF_Cell-Free_Massive_MIMO_Systems.pdf
downloaded paper: ./corpus/Industrial_applications_of_digital_rock_technology.pdf
downloaded paper: ./corpus/The_Non-Organic_Theory_of_the_Genesis_of_Petroleum.pdf
downloaded paper: ./corpus/Impact_of_Recent_Discoveries_on_Petroleum_and_Natural_Gas_Exploration:_Emphasis_on_India.pdf
downloaded paper: ./corpus/Petroleum_prices_prediction_using_data_minin

Error downloading paper: Three-dimensional_sand_ripples_as_the_product_of_vortex_instability.pdf
downloaded paper: ./corpus/Iterative_interferometry-based_method_for_picking_microseismic_events.pdf
downloaded paper: ./corpus/Critical_bursts_in_filtration.pdf
downloaded paper: ./corpus/Night_sky_brightness_above_Zagreb_2012.-2017.pdf
downloaded paper: ./corpus/Should_Engineers_be_Concerned_about_Vulnerability_of_Highway_Bridges_to_Potentially-Induced_Seismic_Hazards?.pdf
downloaded paper: ./corpus/Support_Vector_Machine_Application_for_Multiphase_Flow_Pattern_Prediction.pdf
downloaded paper: ./corpus/Asphaltene_aggregation_due_to_waterflooding_(A_molecular_dynamics_study).pdf
downloaded paper: ./corpus/Microfluidic_study_of_effects_of_flow_velocity_and_nutrient_concentration_on_biofilm_accumulation_and_adhesive_strength_in_a_microchannel.pdf
downloaded paper: ./corpus/Tensor-based_subspace_learning_for_tracking_salt-dome_boundaries.pdf
downloaded paper: ./corpus/Hydrocarbons_under_press

In [None]:
# get a list of files downloaded to the directory to process
pdf_file_list = get_typed_files_in_directory(corpus_directory, "pdf")
# extract the text from the pdf files
for file in pdf_file_list:
    try:
        print(f"processing file {file}")
        text = extract_text_from_pdf(file)
        if text is not None and len(text) > 10:
            write_or_append_file(train_filename, text)
        else:
            print(f"non text extract from file {file}")
    except:
        print(f"error with text extract from file {file}")

processing file ./corpus/Three-level_laser_heat_engine_at_optimal_performance_with_ecological_function.pdf
processing file ./corpus/Convergence_and_completeness_for_square-well_Stark_resonant_state_expansions.pdf
processing file ./corpus/Cellular_Memristive-Output_Reservoir_(CMOR).pdf
processing file ./corpus/A_new_design_strategy_based_on_a_deterministic_definition_of_the_seismic_input_to_overcome_the_limits_of_design_procedures_based_on_probabilistic_approaches.pdf
processing file ./corpus/Distributed_Kerr_Nonlinearity_in_a_Coherent_All-Optical_Fiber-Ring_Reservoir_Computer.pdf
processing file ./corpus/Completion_Time_in_Multi-Access_Channel:_An_Information_Theoretic_Perspective.pdf
processing file ./corpus/Noncommutative_Schur-type_products_and_their_Schoenberg_theorem.pdf
processing file ./corpus/Agile_Software_Engineering_and_Systems_Engineering_at_SKA_Scale.pdf
processing file ./corpus/Oil_and_water:_a_two-type_internal_aggregation_model.pdf
processing file ./corpus/Conductive_He

processing file ./corpus/Variations_of_petrophysical_properties_and_spectral_induced_polarization_in_response_to_drainage_and_imbibition:_a_study_on_a_correlated_random_tube_network.pdf
processing file ./corpus/Optimal_control_of_non-Markovian_open_quantum_systems_via_feedback.pdf
processing file ./corpus/Relaxation-limited_electronic_currents_in_extended_reservoir_simulations.pdf
processing file ./corpus/Machine_Learning_Guided_3D_Image_Recognition_for_Carbonate_Pore_and_Mineral_Volumes_Determination.pdf
processing file ./corpus/Natural_Connections_on_Riemannian_Product_Manifolds.pdf
processing file ./corpus/Large-scale_Reservoir_Simulations_on_IBM_Blue_GeneQ.pdf
processing file ./corpus/Academic_Search_Engines:_Constraints,_Bugs,_and_Recommendation.pdf
processing file ./corpus/AB_Space_Engine.pdf
processing file ./corpus/DREAMS:_Drilling_and_Extraction_Automated_System.pdf
processing file ./corpus/Complete_$λ$-surfaces_in_$mathbb_R^3$.pdf
processing file ./corpus/A_new_quality_preser

processing file ./corpus/Controlling_decoherence_speed_limit_of_a_single_impurity_atom_in_a_Bose-Einstein-condensate_reservoir.pdf
processing file ./corpus/Well_Cement_Degradation_and_Wellbore_Integrity_in_Geological_CO2_Storages:_A_Literature_Review.pdf
processing file ./corpus/4D_Seismic_History_Matching_Incorporating_Unsupervised_Learning.pdf
processing file ./corpus/Semantic_Curiosity_for_Active_Visual_Learning.pdf
processing file ./corpus/A_thermodynamic_parallel_of_the_Braess_road-network_paradox.pdf
processing file ./corpus/Viscous_and_Gravitational_Fingering_in_Multiphase_Compositional_and_Compressible_Flow.pdf
processing file ./corpus/Reservoir_computing_model_of_two-dimensional_turbulent_convection.pdf
processing file ./corpus/Smooth_prime_Fano_complete_intersections_in_toric_varieties.pdf
processing file ./corpus/Capillary_Flow_of_Oil_in_a_Single_Foam_Microchannel.pdf
processing file ./corpus/Deep_learning_for_Stock_Market_Prediction.pdf
processing file ./corpus/Gas_Flow_and

processing file ./corpus/Redeeming_Intrinsic_Rewards_via_Constrained_Optimization.pdf
processing file ./corpus/Query_Log_Compression_for_Workload_Analytics.pdf
processing file ./corpus/Implicit_Generative_Modeling_for_Efficient_Exploration.pdf
processing file ./corpus/On_well-dominated_direct,_Cartesian_and_strong_product_graphs.pdf
processing file ./corpus/Privacy_Engineering_Meets_Software_Engineering._On_the_Challenges_of_Engineering_Privacy_ByDesign.pdf
processing file ./corpus/Top-down_Paradigm_in_Engineering_Software_Integration.pdf
processing file ./corpus/Seismic_Wave_Equations_in_Tight_OilGas_Sandstone_Media.pdf
processing file ./corpus/Toward_a_Consistent_Framework_for_High_Order_Mesh_Refinement_Schemes_in_Numerical_Relativity.pdf
processing file ./corpus/Deep_Time-Delay_Reservoir_Computing:_Dynamics_and_Memory_Capacity.pdf
processing file ./corpus/Complete_positivity_of_the_map_from_a_basis_to_its_dual_basis.pdf
processing file ./corpus/Floquet_topological_systems_in_the_vic

processing file ./corpus/Exact_propagation_of_open_quantum_systems_in_a_system-reservoir_context.pdf
processing file ./corpus/A_model_for_proppant_dynamics_in_a_perforated_wellbore.pdf
processing file ./corpus/Fast_Automatic_Detection_of_Geological_Boundaries_from_Multivariate_Log_Data_Using_Recurrence.pdf
processing file ./corpus/Robust_joint_full-waveform_inversion_of_time-lapse_seismic_data_sets_with_total-variation_regularization.pdf
processing file ./corpus/What_Decreases_Editing_Capability?_Domain-Specific_Hybrid_Refinement_for_Improved_GAN_Inversion.pdf
processing file ./corpus/Complete_internal_categories.pdf
processing file ./corpus/Influence_of_pore_pressure_to_the_development_of_a_hydraulic_fracture_in_poroelastic_medium.pdf
processing file ./corpus/Sign_of_the_Casimir-Polder_interaction_between_atoms_and_oil-water_interfaces:_Subtle_dependence_on_dielectric_properties.pdf
processing file ./corpus/Time-variant_Seismic_Resilience_Analysis_Model_for_Water_Distribution_Systems.

processing file ./corpus/Effect_of_Pipelining_and_Multiplexing_in_Estimating_HTTP2.0_Web_Object_Sizes.pdf
processing file ./corpus/Requirements_Engineering,_Software_Testing_and_Education:_A_Systematic_Mapping.pdf
processing file ./corpus/On_the_upstream_mobility_scheme_for_two-phase_flow_in_porous_media.pdf
processing file ./corpus/Designing_an_Optimal_Portfolio_for_Iran's_Stock_Market_with_Genetic_Algorithm_using_Neural_Network_Prediction_of_Risk_and_Return_Stocks.pdf
processing file ./corpus/3D_Curvature_Analysis_of_Seismic_Waveforms_and_its_Interpretational_Implications.pdf
processing file ./corpus/Splitting_droplet_through_coalescence_of_two_different_three-phase_contact_lines.pdf
processing file ./corpus/Go-Explore:_a_New_Approach_for_Hard-Exploration_Problems.pdf
processing file ./corpus/Region_Refinement_Network_for_Salient_Object_Detection.pdf
processing file ./corpus/Statistics_of_Local_Seismic_Emission_from_the_Solar_Granulation.pdf
processing file ./corpus/A_Three-Field_bas