# Synthesize a research paper
## Workflow
+ Define domain<br>
+ Build corpus<br>
+ Create embeddings model<br>
+ Perform WhiteSpace Analysis<br>
+ Identify best area for analysis<br>
+ Suggest topics for research<br>
+ Synthesize research paper content on selected topic

## Import Libraries

In [12]:
import os
import re
import string
import time

from arxiv import SortCriterion, SortOrder, Client, Search
import csv

import openai

import pdfplumber
import pytesseract
import fitz #pip install pymupdf
from PIL import Image
import io
import cv2
import numpy as np

from gensim.models.fasttext import FastText
import fasttext.util

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer

from sklearn.decomposition import PCA
# pip install umap-learn
import umap

import datashader as ds
import datashader.transfer_functions as tf
from datashader.bokeh_ext import InteractiveImage
from bokeh.plotting import figure, output_notebook, show
import pandas as pd

# Inputs

In [2]:
# Define domain
domain = "petroleum engineering"
# misc definitions
corpus_directory = "./corpus"
dimensions = 100
max_papers = 100

In [3]:
 # Azure Instance
openai.api_key = ""
# your endpoint should look like the following https://YOUR_RESOURCE_NAME.openai.azure.com/
openai.api_base =  "" 
openai.api_version = '2022-12-01' # this may change in the future
openai.api_type = 'azure'
#This will correspond to the custom name you chose for your deployment when you deployed a model. 
deployment_name='' 

In [4]:
# Set the path to the tesseract executable
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract' 
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
stemmer = WordNetLemmatizer()
# required data and model downloads
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
en_stop = set(nltk.corpus.stopwords.words('english'))

# define model training parameters
# the number of dimensions is the number of vectors to create that represent a word
embedding_size = dimensions
window_size = 40
min_word = 3
down_sampling = 1e-2
min_ngrams = 3
max_ngrams = 6
enrich_vectors = 1

train_filename = domain.replace(" ", "_")+"_train.txt"
VECTORS_FILEPATH = "cc.en."+str(dimensions)+".vec"
MODEL_NAME = domain.replace(" ", "_")+"_model.bin"
PRETRAINED_MODEL = "cc.en."+str(dimensions)+".bin"

[nltk_data] Downloading package wordnet to /home/titan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/titan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/titan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/titan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/titan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/titan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Helper functions

In [5]:
# create a function to generate key words for research in the defined domain
def generate_research_keywords_for_domain(domain:str)->str:
    prompt = "Create comma seperated key words for research queries in the "+domain+" domain:"
    response = openai.Completion.create(
            engine=deployment_name,
            prompt=prompt,
            temperature=0.20,
            max_tokens=500
        )
    result = response.choices[0].text.strip()
    return result

def string_to_list(text:str)->str:
    """
    Converts a string of comma-separated values into a list of individual elements.
    """
    # Split the string into individual values using the comma separator
    values = text.split(',')
    
    # Remove any whitespace from the values
    values = [value.strip().replace(".", "") for value in values]
    
    # Return the list of values
    return values

def create_directory_if_not_exists(path:str):
    """
    Creates a directory at the specified path if it does not already exist.
    """
    if not os.path.exists(path):
        os.makedirs(path)
    return

# create a function to save papers by removing special characters from the file name
def clean_title(title:str):
    title = title.replace(" ", "_")
    title = title.replace("\\", "")
    title = title.replace("/", "")
    return title

def file_exists(path:str):
    """
    Checks if a file exists at the specified path.
    Returns True if the file exists, False otherwise.
    """
    return os.path.exists(path) and os.path.isfile(path)

def get_typed_files_in_directory(directory:str, file_type:str) -> list:
    """
    Returns a list of all files in the specified directory.
    """
    # Initialize an empty list to hold the PDF files
    typed_files = []

    # Loop over all files in the directory
    for filename in os.listdir(directory):
        # Check if the file has a .pdf extension
        if filename.lower().endswith(f".{file_type.lower()}"):
            # If it's a PDF file, add the full path to the list
            typed_files.append(os.path.join(directory, filename))

    # Return the list of PDF files
    return typed_files

def write_or_append_file(filepath, text):
    """
    Writes the specified text to a file, creating the file if it does not exist,
    or appends the text to the end of the file if it already exists.
    """
    mode = "a" if os.path.exists(filepath) else "w"
    with open(filepath, mode) as f:
        f.write(text)
        
def extract_text_from_pdf(filepath:str)->str:
    """
    Extracts text from a PDF file, using OCR if the PDF contains images.
    """
    # Open the PDF file using pdfplumber
    with pdfplumber.open(filepath) as pdf:
        # Initialize an empty string to hold the extracted text
        text = ""

        # Loop over all pages in the PDF
        for page in pdf.pages:
            # Extract the page text using pdfplumber
            page_text = page.extract_text()

            # If the page text is None, the page may contain images, so use OCR to extract the text
            if page_text is None:
                # Convert the page image to grayscale and apply thresholding to remove noise
                page_image = page.to_image(resolution=150)
                page_image = page_image.convert('L')
                page_image = page_image.point(lambda x: 0 if x < 180 else 255, '1')

                # Use pytesseract to extract the text from the image
                page_text = pytesseract.image_to_string(page_image)

            # Add the page text to the overall text
            text += page_text

    # Return the extracted text
    return text

def rotate_image(image, angle):
    """
    Rotate an image by a given angle.

    Args:
        image (np.array): The image to rotate.
        angle (float): The angle to rotate the image by.

    Returns:
        np.array: The rotated image.
    """
    rows, cols = image.shape[:2]
    rotation_matrix = cv2.getRotationMatrix2D((cols / 2, rows / 2), angle, 1)
    return cv2.warpAffine(image, rotation_matrix, (cols, rows))

def correct_skew(image):
    """
    Correct skew in an image.

    Args:
        image (np.array): The image to correct skew for.

    Returns:
        np.array: The image with corrected skew.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gray = cv2.bitwise_not(gray)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    coords = np.column_stack(np.where(thresh > 0))
    angle = cv2.minAreaRect(coords)[-1]

    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle

    return rotate_image(image, angle)

def extract_text_from_pdf(pdf_path):
    """
    Extract text from an image-based PDF using the pymupdf library.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The extracted text from the PDF.
    """
    # Open the PDF file
    doc = fitz.open(pdf_path)

    # Initialize an empty string to store the extracted text
    extracted_text = ""

    # Loop through each page in the PDF
    for page in doc:
        # Extract text from the page and append it to the extracted_text string
        extracted_text += page.get_text("text")

    # Close the PDF file
    doc.close()

    return extracted_text

def preprocess_text(document):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))
        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)
        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)
        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)
        # standardize white space
        document = re.sub(r'\s+', ' ', document)
        # Removing linebreaks '\n'
        document = re.sub(r'[\r\n]+', '', document)
        
        ## remove digits with regular expression
        document = re.sub(r'\d', ' ', document)
        #MM remove any patterns matching standard url format
        url_pattern = r'((http|ftp|https):\/\/)?[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?'
        document = re.sub(url_pattern, ' ', document)
        #MM remove all non-ascii characters
        document = ''.join(character for character in document if ord(character)<128)
        #MM filter funny characters, if any.
        printable = set(string.printable)
        document = filter(lambda x: x in printable, document) #filter funny characters, if any.
        document = "".join(list(document))
        

        # Converting to Lowercase
        document = document.lower().translate(remove_punct_dict)
        
        

        # Lemmatization
        tokens = document.split()
        tokens = [stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in en_stop]
        #Remove all short words that have a length < 3 character
        tokens = [word for word in tokens if len(word) > 3]
        # remove remaining tokens that are not alphabetic
        tokens = [word for word in tokens if word.isalpha()]

        preprocessed_text = ' '.join(tokens)

        return preprocessed_text
    
def loadModel(modelName):
    model = FastText.load(modelName)
    return model

## Build corpus
Here we will build a corpus of documents from ARXIV papers<br>
To search papers, generate a list of keywords of research areas in the defined domain

In [6]:
keywords = generate_research_keywords_for_domain(domain)
keywords = string_to_list(keywords)

In [7]:
keywords

['petroleum engineering',
 'oil exploration',
 'drilling',
 'reservoir engineering',
 'production engineering',
 'well completion',
 'well logging',
 'geology',
 'seismic surveying',
 'petroleum economics',
 'petroleum refining',
 'petroleum geology',
 'petroleum technology',
 'petroleum law',
 'petroleum safety',
 'petroleum management']

In [8]:
# download papers in each research area
create_directory_if_not_exists(corpus_directory)
for searchterm in keywords:
    # create arxiv client
    big_slow_client = Client(
      page_size = 1000,
      delay_seconds = 10,
      num_retries = 5
    )
    for result in big_slow_client.results(Search(query=searchterm, max_results = max_papers, sort_by = SortCriterion.Relevance, sort_order = SortOrder.Ascending)):
        filename = clean_title(result.title)+".pdf"
        filepath = os.path.join(corpus_directory, filename)
        try:
            if file_exists(filepath):
                print("downloaded paper: "+filepath)
            else:
                print("downloading paper: "+filepath)
                result.download_pdf(dirpath=corpus_directory, filename=filename)
                time.sleep(2)
        except:
            print("Error downloading paper: "+filename)
            time.sleep(10)
            continue

downloaded paper: ./corpus/Optimal_Economic_Operation_of_Liquid_Petroleum_Products_Pipeline_Systems.pdf
downloaded paper: ./corpus/Phase_behavior_in_petroleum_fluids_(A_Detailed_Descriptive_and_Illustrative_Account).pdf
downloaded paper: ./corpus/On_the_Determination_of_the_Solar_Rotation_Elements_i,_Ω_and_Period_using_Sunspot_Observations_by_Ruđer_Bošković_in_1777.pdf
downloaded paper: ./corpus/On_gray-box_modeling_for_virtual_flow_metering.pdf
downloaded paper: ./corpus/Reaction_Mechanisms_in_Petroleum:_From_Experimentation_to_Upgrading_and_Geological_Conditions.pdf
downloaded paper: ./corpus/Simulation_of_incompressible_two-phase_flow_in_porous_media_with_large_timesteps.pdf
downloaded paper: ./corpus/Multi-task_learning_for_virtual_flow_metering.pdf
downloaded paper: ./corpus/Forecasting_the_production_of_Distillate_Fuel_Oil_Refinery_and_Propane_Blender_net_production_by_using_Time_Series_Algorithms.pdf
downloaded paper: ./corpus/A_Decision_Support_System_for_Multi-target_Geosteeri

downloaded paper: ./corpus/Application_of_Probabilistic_Graphical_Models_in_Forecasting_Crude_Oil_Price.pdf
downloaded paper: ./corpus/Experimental_evaluation_of_a_silicone_oil_as_an_oxidation_inhibitor_for_magnesium_alloy_under_contact_sliding_at_elevated_temperatures.pdf
downloaded paper: ./corpus/Exploring_the_use_of_Transition_Path_Theory_in_building_an_oil_spill_prediction_scheme.pdf
downloaded paper: ./corpus/Exploration_of_Spanish_Olive_Oil_Quality_with_a_Miniaturized_Low-Cost_Fluorescence_Sensor_and_Machine_Learning_Techniques.pdf
downloaded paper: ./corpus/Olive_Oil_is_Made_of_Olives,_Baby_Oil_is_Made_for_Babies:_Interpreting_Noun_Compounds_using_Paraphrases_in_a_Neural_Model.pdf
downloaded paper: ./corpus/Stability_of_additive-free_water-in-oil_emulsions.pdf
downloaded paper: ./corpus/Pickering_emulsions_with_alpha-cyclodextrin_inclusions:_Structure_and_thermal_stability.pdf
downloaded paper: ./corpus/Physico-chemical_properties_extraction_from_the_fluorescence_spectrum_with_

downloaded paper: ./corpus/Probing_the_surface_of_synthetic_opals_with_the_vanadyl-containing_crude_oil_by_using_EPR_and_ENDOR_techniques.pdf
downloaded paper: ./corpus/Enhanced_dielectric_breakdown_performance_of_anatase_and_rutile_titania_based_nano-oils.pdf
downloaded paper: ./corpus/Dust_evolution,_a_global_view:_II._Top-down_branching,_nano-particle_fragmentation_and_the_mystery_of_the_diffuse_interstellar_band_carriers.pdf
downloaded paper: ./corpus/Single_beam_acoustical_tweezers_based_on_focused_beams:_A_numerical_analysis_of_2D_and_3D_trapping_capabilities.pdf
downloaded paper: ./corpus/Forecasting_the_abnormal_events_at_well_drilling_with_machine_learning.pdf
downloaded paper: ./corpus/Violent_music_vs_violence_and_music:_Drill_rap_and_violent_crime_in_London.pdf
downloaded paper: ./corpus/DREAMS:_Drilling_and_Extraction_Automated_System.pdf
downloaded paper: ./corpus/On_the_characterization_of_drilling_rotation_in_the_6-parameter_resultant_shell_theory.pdf
downloaded paper: 

downloaded paper: ./corpus/Maximal_Steered_Coherence_Protection_by_Quantum_Reservoir_Engineering.pdf
downloaded paper: ./corpus/Extracting_work_from_random_collisions:_A_model_of_a_quantum_heat_engine.pdf
downloaded paper: ./corpus/Designing_reservoirs_for_1t_decoherence_of_a_qubit.pdf
downloaded paper: ./corpus/Carnot's_theorem_for_nonthermal_stationary_reservoirs.pdf
downloaded paper: ./corpus/A_Micrometer-sized_Heat_Engine_Operating_Between_Bacterial_Reservoirs.pdf
downloaded paper: ./corpus/Efficiency_of_heat_engines_coupled_to_nonequilibrium_reservoirs.pdf
downloaded paper: ./corpus/Optimization_performance_of_quantum_Otto_heat_engines_and_refrigerators_with_squeezed_thermal_reservoirs.pdf
downloaded paper: ./corpus/Quantitative_supply_security_related_significance_measures_for_gas_reservoires.pdf
downloaded paper: ./corpus/Electromagnetically_Induced_Transparency_and_Quantum_Heat_Engines.pdf
downloaded paper: ./corpus/The_thermodynamics_governing_'endoreversible'_engines.pdf
down

downloaded paper: ./corpus/A_Business_Maturity_Model_of_Software_Product_Line_Engineering.pdf
downloaded paper: ./corpus/An_Architecture_Process_Maturity_Model_of_Software_Product_Line_Engineering.pdf
downloaded paper: ./corpus/Variability_and_Evolution_in_Systems_of_Systems.pdf
downloaded paper: ./corpus/Towards_a_Systems_Engineering_based_Automotive_Product_Engineering_Process.pdf
downloaded paper: ./corpus/Towards_a_Systems_Engineering_Essence.pdf
downloaded paper: ./corpus/Mind_the_Gap:_On_the_Relationship_Between_Automatically_Measured_and_Self-Reported_Productivity.pdf
downloaded paper: ./corpus/Resolving_code_smells_in_software_product_line_using_refactoring_and_reverse_engineering.pdf
downloaded paper: ./corpus/Fault-Tolerant_Dot-Product_Engines.pdf
downloaded paper: ./corpus/Data_Engineering_for_the_Analysis_of_Semiconductor_Manufacturing_Data.pdf
downloaded paper: ./corpus/Optimization_analysis_of_an_endoreversible_quantum_heat_engine_with_efficient_power_function.pdf
downloa

downloaded paper: ./corpus/Completions_of_Countable_Excellent_Domains_and_Countable_Noncatenary_Domains.pdf
downloaded paper: ./corpus/Generalized_existential_completions_and_their_regular_and_exact_completions.pdf
downloaded paper: ./corpus/Uniqueness_of_Instantaneously_Complete_Ricci_flows.pdf
downloaded paper: ./corpus/A_completeness_result_for_implicit_justification_stit_logic.pdf
downloaded paper: ./corpus/A_note_on_the_statistical_view_of_matrix_completion.pdf
downloaded paper: ./corpus/On_well-dominated_direct,_Cartesian_and_strong_product_graphs.pdf
downloaded paper: ./corpus/Dedekind_complete_and_order_continuous_Banach_$C(K)$-modules.pdf
downloaded paper: ./corpus/Smooth_prime_Fano_complete_intersections_in_toric_varieties.pdf
downloaded paper: ./corpus/Profinite_completions_and_MacNeille_completions_of_MV-algebras.pdf
downloaded paper: ./corpus/On_Stoltenberg's_quasi-uniform_completion.pdf
downloaded paper: ./corpus/Action_Completion:_A_Temporal_Model_for_Moment_Detection.pd

downloaded paper: ./corpus/Dynamic_and_Multi-functional_Labeling_Schemes.pdf
downloaded paper: ./corpus/Disciplined_Geometric_Programming.pdf
downloaded paper: ./corpus/Random_input_helps_searching_predecessors.pdf
downloaded paper: ./corpus/The_Log_Product_Formula.pdf
downloaded paper: ./corpus/Hyperbolicity_for_log_smooth_families_with_maximal_variation.pdf
downloaded paper: ./corpus/Large_gaps_between_primes.pdf
downloaded paper: ./corpus/Higher_order_corrections_for_anisotropic_bootstrap_percolation.pdf
downloaded paper: ./corpus/Improved_Parallel_Construction_of_Wavelet_Trees_and_RankSelect_Structures.pdf
downloaded paper: ./corpus/On_the_Profile_of_Multiplicities_of_Complete_Subgraphs.pdf
downloaded paper: ./corpus/Near-Optimal_(Euclidean)_Metric_Compression.pdf
downloaded paper: ./corpus/HyperLogLogLog:_Cardinality_Estimation_With_One_Log_More.pdf
downloaded paper: ./corpus/Anonymization_of_System_Logs_for_Privacy_and_Storage_Benefits.pdf
downloaded paper: ./corpus/Stringy_invar

downloaded paper: ./corpus/Geology_of_symmetric_grounds.pdf
downloaded paper: ./corpus/Geology_prediction_based_on_operation_data_of_TBM:_comparison_between_deep_neural_network_and_statistical_learning_methods.pdf
downloaded paper: ./corpus/From_Cosmic_Explosions_to_Terrestrial_Fires?:_A_Reply.pdf
downloaded paper: ./corpus/Semi-Automated_Segmentation_of_Geoscientific_Data_Using_Superpixels.pdf
downloaded paper: ./corpus/The_Non-Organic_Theory_of_the_Genesis_of_Petroleum.pdf
downloaded paper: ./corpus/Geophysical_tomography_in_engineering_geology:_an_overview.pdf
downloaded paper: ./corpus/Generating_Realistic_Geology_Conditioned_on_Physical_Measurements_with_Generative_Adversarial_Networks.pdf
downloaded paper: ./corpus/Universal_Graphs_at_$aleph_{ω_1+1}$_and_Set-theoretic_Geology.pdf
downloaded paper: ./corpus/Teaching_Waves_with_Google_Earth.pdf
downloaded paper: ./corpus/Density_vs_distance_for_the_DUNE_beam_from_two_recent_geology_density_maps.pdf
downloaded paper: ./corpus/Fast_r

downloaded paper: ./corpus/Feasibility_and_applications_of_the_spin-echo_modulation_option_for_a_small_angle_neutron_scattering_instrument_at_the_European_Spallation_Source.pdf
downloaded paper: ./corpus/Multiaxis_atom_interferometry_with_a_single_diode_laser_and_a_pyramidal_magneto-optical_trap.pdf
downloaded paper: ./corpus/Parametrization_and_generation_of_geological_models_with_generative_adversarial_networks.pdf
downloaded paper: ./corpus/First_Principles_Free-Energy_Theory_of_Solvation_with_Atomic_Scale_Liquid_Structure.pdf
downloaded paper: ./corpus/The_Sanford_Underground_Research_Facility.pdf
downloading paper: ./corpus/Exact_enumeration_approach_to_first-passage_time_distribution_of_non-Markov_random_walks.pdf
downloading paper: ./corpus/Atmospheric_muons_as_an_imaging_tool.pdf
downloading paper: ./corpus/On_the_Self-Similarity_of_Natural_Stochastic_Textures.pdf
downloaded paper: ./corpus/Microscopic_dynamics_and_failure_precursors_of_a_gel_under_mechanical_load.pdf
downloade

downloaded paper: ./corpus/Developing_a_seismic_pattern_interpretation_network_(SpiNet)_for_automated_seismic_interpretation.pdf
downloaded paper: ./corpus/Seismic_Negative_Belt_of_Acoustic_Metamaterials.pdf
downloaded paper: ./corpus/Quantitative_and_Qualitative_Seismic_Imaging_and_Seismic_Inversion.pdf
downloaded paper: ./corpus/Seismic_analysis_of_two_solar-type_stars_observed_by_Kepler.pdf
downloaded paper: ./corpus/Blind_Curvelet_based_Denoising_of_Seismic_Surveys_in_Coherent_and_Incoherent_Noise_Environments.pdf
downloaded paper: ./corpus/Toward_Creating_Subsurface_Camera.pdf
downloaded paper: ./corpus/Numerical_analysis_of_a_deep_learning_formulation_of_elastic_full_waveform_inversion_with_high_order_total_variation_regularization_in_different_parameterization.pdf
downloaded paper: ./corpus/A_convolutional_neural_network_for_prestack_fracture_detection.pdf
downloading paper: ./corpus/Acoustic_Full_Waveform_Inversion_with_Hamiltonian_Monte_Carlo_Method.pdf
downloaded paper: ./cor

downloaded paper: ./corpus/A_Dynamic_Sustainable_Competitive_Petroleum_Supply_Chain_Model_for_Various_Stakeholders_with_Shared_Facilities.pdf
downloaded paper: ./corpus/Petroleum_Refinery_Multi-Antenna_Propagation_Measurements.pdf
downloaded paper: ./corpus/Optimal_Economic_Operation_of_Liquid_Petroleum_Products_Pipeline_Systems.pdf
downloaded paper: ./corpus/Phase_behavior_in_petroleum_fluids_(A_Detailed_Descriptive_and_Illustrative_Account).pdf
downloaded paper: ./corpus/A_priori_error_analysis_for_transient_problems_using_Enhanced_Velocity_approach_in_the_discrete-time_setting.pdf
downloaded paper: ./corpus/The_Non-Organic_Theory_of_the_Genesis_of_Petroleum.pdf
downloaded paper: ./corpus/Impact_of_Recent_Discoveries_on_Petroleum_and_Natural_Gas_Exploration:_Emphasis_on_India.pdf
downloaded paper: ./corpus/Petroleum_prices_prediction_using_data_mining_techniques_--_A_Review.pdf
downloaded paper: ./corpus/Using_machine_learning_to_reduce_ensembles_of_geological_models_for_oil_and_gas_

downloaded paper: ./corpus/Intermittent_gravity-driven_flow_of_grains_through_narrow_pipes.pdf
downloaded paper: ./corpus/Critical_bursts_in_filtration.pdf
downloaded paper: ./corpus/Night_sky_brightness_above_Zagreb_2012.-2017.pdf
downloaded paper: ./corpus/Should_Engineers_be_Concerned_about_Vulnerability_of_Highway_Bridges_to_Potentially-Induced_Seismic_Hazards?.pdf
downloaded paper: ./corpus/Support_Vector_Machine_Application_for_Multiphase_Flow_Pattern_Prediction.pdf
downloaded paper: ./corpus/Asphaltene_aggregation_due_to_waterflooding_(A_molecular_dynamics_study).pdf
downloaded paper: ./corpus/Microfluidic_study_of_effects_of_flow_velocity_and_nutrient_concentration_on_biofilm_accumulation_and_adhesive_strength_in_a_microchannel.pdf
downloaded paper: ./corpus/Tensor-based_subspace_learning_for_tracking_salt-dome_boundaries.pdf
downloaded paper: ./corpus/Hydrocarbons_under_pressure:_phase_diagrams_and_surprising_new_compounds_in_the_C-H_system.pdf
downloaded paper: ./corpus/ALMA_

downloaded paper: ./corpus/Robust_Oil-spill_Forensics_and_Petroleum_Source_Differentiation_using_Quantized_Peak_Topography_Maps.pdf
downloaded paper: ./corpus/How_to_use_Big_Data_technologies_to_optimize_operations_in_Upstream_Petroleum_Industry.pdf
downloaded paper: ./corpus/Phase_behavior_in_petroleum_fluids_(A_Detailed_Descriptive_and_Illustrative_Account).pdf
downloaded paper: ./corpus/Multi-task_learning_for_virtual_flow_metering.pdf
downloaded paper: ./corpus/Vortex_flow_in_the_technology_of_radiation_wave_cracking_(RWC).pdf
downloaded paper: ./corpus/User_Association_in_User-Centric_Hybrid_VLCRF_Cell-Free_Massive_MIMO_Systems.pdf
downloaded paper: ./corpus/Industrial_applications_of_digital_rock_technology.pdf
downloaded paper: ./corpus/The_Non-Organic_Theory_of_the_Genesis_of_Petroleum.pdf
downloaded paper: ./corpus/Impact_of_Recent_Discoveries_on_Petroleum_and_Natural_Gas_Exploration:_Emphasis_on_India.pdf
downloaded paper: ./corpus/Petroleum_prices_prediction_using_data_minin

downloaded paper: ./corpus/Phase_behavior_in_petroleum_fluids_(A_Detailed_Descriptive_and_Illustrative_Account).pdf
downloaded paper: ./corpus/The_Evolution_of_Multicomponent_Systems_at_High_Pressures:_VI._The_Thermodynamic_Stability_of_the_Hydrogen-Carbon_System:_The_Genesis_of_Hydrocarbons_and_the_Origin_of_Petroleum.pdf
downloaded paper: ./corpus/Evaluation_of_Gaussian_approximations_for_data_assimilation_in_reservoir_models.pdf
downloaded paper: ./corpus/Critical_bursts_in_filtration.pdf
downloaded paper: ./corpus/The_Non-Organic_Theory_of_the_Genesis_of_Petroleum.pdf
downloaded paper: ./corpus/Impact_of_Recent_Discoveries_on_Petroleum_and_Natural_Gas_Exploration:_Emphasis_on_India.pdf
downloaded paper: ./corpus/Petroleum_prices_prediction_using_data_mining_techniques_--_A_Review.pdf
downloaded paper: ./corpus/Central_Schemes_for_Porous_Media_Flows.pdf
downloaded paper: ./corpus/On_the_upstream_mobility_scheme_for_two-phase_flow_in_porous_media.pdf
downloaded paper: ./corpus/A_nume

downloaded paper: ./corpus/Phase_behavior_in_petroleum_fluids_(A_Detailed_Descriptive_and_Illustrative_Account).pdf
downloaded paper: ./corpus/Thermo-kinetic_explosions:_safety_first_or_safety_last?.pdf
downloaded paper: ./corpus/The_Non-Organic_Theory_of_the_Genesis_of_Petroleum.pdf
downloaded paper: ./corpus/Impact_of_Recent_Discoveries_on_Petroleum_and_Natural_Gas_Exploration:_Emphasis_on_India.pdf
downloaded paper: ./corpus/Petroleum_prices_prediction_using_data_mining_techniques_--_A_Review.pdf
downloaded paper: ./corpus/The_application_of_high-resolution_3D_seismic_data_to_model_the_distribution_of_mechanical_and_hydrogeological_properties_of_a_potential_host_rock_for_the_deep_storage_of_radioactive_waste_in_France.pdf
downloaded paper: ./corpus/Deghosting_of_Ocean_Bottom_Cable_Data:Two_approaches.pdf
downloaded paper: ./corpus/A_Dynamic_Sustainable_Competitive_Petroleum_Supply_Chain_Model_for_Various_Stakeholders_with_Shared_Facilities.pdf
downloaded paper: ./corpus/Analytical_I

downloading paper: ./corpus/Learning-based_Initialization_Strategy_for_Safety_of_Multi-Vehicle_Systems.pdf
downloading paper: ./corpus/Joint_Synthesis_of_Safety_Certificate_and_Safe_Control_Policy_using_Constrained_Reinforcement_Learning.pdf
downloading paper: ./corpus/Strategy_to_Increase_the_Safety_of_a_DNN-based_Perception_for_HAD_Systems.pdf
downloading paper: ./corpus/Small-Gain_Theorem_for_Safety_Verification_of_Interconnected_Systems.pdf
downloading paper: ./corpus/Product_safety_idioms:_a_method_for_building_causal_Bayesian_networks_for_product_safety_and_risk_assessment.pdf
downloading paper: ./corpus/Quantifying_the_Safety_of_Trajectories_using_Peak-Minimizing_Control.pdf
downloaded paper: ./corpus/Safety_Practice_and_its_Practitioners:_Exploring_a_Diverse_Profession.pdf
downloaded paper: ./corpus/On_the_efficacy_of_safety-related_software_standards.pdf
downloading paper: ./corpus/Do_you_feel_safe_with_your_robot?_Factors_Influencing_Perceived_Safety_in_Human-Robot_Interactio

downloading paper: ./corpus/Domain-partitioned_element_management_systems_employing_mobile_agents_for_distributed_network_management.pdf
downloaded paper: ./corpus/Machine_Learning-Based_Soft_Sensors_for_Vacuum_Distillation_Unit.pdf
downloaded paper: ./corpus/Long-Term_Optimal_Delivery_Planning_for_Replacing_the_Liquefied_Petroleum_Gas_Cylinder.pdf
downloading paper: ./corpus/Datom:_Towards_modular_data_management.pdf
downloaded paper: ./corpus/Central_Schemes_for_Porous_Media_Flows.pdf
downloaded paper: ./corpus/On_the_upstream_mobility_scheme_for_two-phase_flow_in_porous_media.pdf
downloaded paper: ./corpus/Polysiloxane_surfactants_for_the_dispersion_of_carbon_nanotubes_in_non-polar_organic_solvents.pdf
downloaded paper: ./corpus/Dynamic_Decision_Making_for_Graphical_Models_Applied_to_Oil_Exploration.pdf
downloaded paper: ./corpus/Distributed_computing_of_Seismic_Imaging_Algorithms.pdf
downloaded paper: ./corpus/Titania_Prepared_by_Ball_Milling:_Its_Characterization_and_Application_a

In [9]:
# get a list of files downloaded to the directory to process
pdf_file_list = get_typed_files_in_directory(corpus_directory, "pdf")

In [10]:
# extract the text from the pdf files
for file in pdf_file_list:
    try:
        print(f"processing file {file}")
        text = extract_text_from_pdf(file)
        if text is not None and len(text) > 10:
            write_or_append_file(train_filename, preprocess_text(text))
        else:
            print(f"non text extract from file {file}")
    except:
        print(f"error with text extract from file {file}")

processing file ./corpus/Three-level_laser_heat_engine_at_optimal_performance_with_ecological_function.pdf
processing file ./corpus/Convergence_and_completeness_for_square-well_Stark_resonant_state_expansions.pdf
processing file ./corpus/Cellular_Memristive-Output_Reservoir_(CMOR).pdf
processing file ./corpus/A_new_design_strategy_based_on_a_deterministic_definition_of_the_seismic_input_to_overcome_the_limits_of_design_procedures_based_on_probabilistic_approaches.pdf
processing file ./corpus/Distributed_Kerr_Nonlinearity_in_a_Coherent_All-Optical_Fiber-Ring_Reservoir_Computer.pdf
processing file ./corpus/Completion_Time_in_Multi-Access_Channel:_An_Information_Theoretic_Perspective.pdf
processing file ./corpus/Machine_Learning_for_Software_Engineering:_A_Tertiary_Study.pdf
processing file ./corpus/Noncommutative_Schur-type_products_and_their_Schoenberg_theorem.pdf
processing file ./corpus/INSiDER:_Incorporation_of_system_and_safety_analysis_models_using_a_dedicated_reference_model.pdf
p

processing file ./corpus/Dissolved_gas_monitoring_probe_without_liquid-gas_separation_under_strong_electromagnetic_interference.pdf
processing file ./corpus/A_100%_Renewable_Energy_System:_Enabling_Zero_CO2_Emission_Offshore_Platforms.pdf
processing file ./corpus/Environmental_Factors_Influencing_Individual_Decision-Making_Behavior_in_Software_Project:_A_Systematic_Literature_Review.pdf
processing file ./corpus/Parametrisation_in_electrostatic_DPD_Dynamics_and_Applications.pdf
processing file ./corpus/Work_producing_reservoirs:_Stochastic_thermodynamics_with_generalized_Gibbs_ensembles.pdf
processing file ./corpus/Analyzing_the_effect_of_local_rounding_error_propagation_on_the_maximal_attainable_accuracy_of_the_pipelined_Conjugate_Gradient_method.pdf
processing file ./corpus/Dynamics_of_entangled_states_in_squeezed_reservoirs.pdf
processing file ./corpus/Fracture_Propagation_Driven_by_Fluid_Outflow_from_a_Low-permeability_Aquifer.pdf
processing file ./corpus/On_gray-box_modeling_for_vi

processing file ./corpus/Normative_Engineering_Risk_Management_Systems.pdf
processing file ./corpus/Discrete-time_signatures_and_randomness_in_reservoir_computing.pdf
processing file ./corpus/Critical_behavior_for_mixed_site-bond_directed_percolation.pdf
processing file ./corpus/The_Galactic_origin_for_the_borders_in_the_Earth_history.pdf
processing file ./corpus/The_Power_of_Language:_Understanding_Sentiment_Towards_the_Climate_Emergency_using_Twitter_Data.pdf
processing file ./corpus/Modeling_the_impact_of_dilution_on_the_microbial_degradation_time_of_dispersed_oil_in_marine_environments.pdf
processing file ./corpus/Meet-completions_and_ordered_domain_algebras.pdf
processing file ./corpus/Assessing_the_Impact_of_Offshore_Wind_Siting_Strategies_on_the_Design_of_the_European_Power_System.pdf
processing file ./corpus/Wormhole_formation_in_dissolving_fractures.pdf
processing file ./corpus/Numerical_computation_of_stress-permeability_relationships_of_fracture_networks_in_a_shale_rock.pdf


processing file ./corpus/Products_of_manifolds_with_fibered_corners.pdf
processing file ./corpus/Universal_Graphs_at_$aleph_{ω_1+1}$_and_Set-theoretic_Geology.pdf
processing file ./corpus/Forecasting_the_term_structure_of_crude_oil_futures_prices_with_neural_networks.pdf
processing file ./corpus/Extended_source_imaging,_a_unifying_framework_for_seismic_&_medical_imaging.pdf
processing file ./corpus/Review_of_medical_data_analysis_based_on_spiking_neural_networks.pdf
processing file ./corpus/Mutual_Information_and_the_Edge_of_Chaos_in_Reservoir_Computers.pdf
processing file ./corpus/On_the_Truncated_Pareto_Distribution_with_applications.pdf
processing file ./corpus/Task-agnostic_Exploration_in_Reinforcement_Learning.pdf
processing file ./corpus/A_Non-standard_Model_for_Microbial_Enhanced_Oil_Recovery_Including_the_Oil-water_Interfacial_Area.pdf
processing file ./corpus/Efficient_depth_extrapolation_of_waves_in_elastic_isotropic_media.pdf
processing file ./corpus/Do_Software_Languages_En

processing file ./corpus/A_Separated_Cohomologically_Complete_Module_is_Complete.pdf
processing file ./corpus/Generalizing_Interactive_Backpropagating_Refinement_for_Dense_Prediction.pdf
processing file ./corpus/Square_ice_in_graphene_nanocapillaries.pdf
processing file ./corpus/Wave_durationpersistence_statistics,_recording_interval,_and_fractal_dimension.pdf
processing file ./corpus/Learning_Exploration_Policies_for_Navigation.pdf
processing file ./corpus/New_characterizations_of_the_region_of_complete_localization_for_random_Schrödinger_operators.pdf
processing file ./corpus/ReinBo:_Machine_Learning_pipeline_search_and_configuration_with_Bayesian_Optimization_embedded_Reinforcement_Learning.pdf
processing file ./corpus/Sparse_SPN:_Depth_Completion_from_Sparse_Keypoints.pdf
processing file ./corpus/Singular_degenerate_problems_and_applications.pdf
processing file ./corpus/The_Non-Organic_Theory_of_the_Genesis_of_Petroleum.pdf
processing file ./corpus/A_Predictive_Model_for_Oil_Market

processing file ./corpus/Complete_internal_categories.pdf
processing file ./corpus/Influence_of_pore_pressure_to_the_development_of_a_hydraulic_fracture_in_poroelastic_medium.pdf
processing file ./corpus/Three-dimensional_seismic_characterization_and_imaging_of_the_Soda_Lake_geothermal_field.pdf
processing file ./corpus/Sign_of_the_Casimir-Polder_interaction_between_atoms_and_oil-water_interfaces:_Subtle_dependence_on_dielectric_properties.pdf
processing file ./corpus/Time-variant_Seismic_Resilience_Analysis_Model_for_Water_Distribution_Systems.pdf
processing file ./corpus/Progressive_Structure_from_Motion.pdf
processing file ./corpus/Study_on_performance_improvement_of_oil_paint_image_filter_algorithm_using_parallel_pattern_library.pdf
processing file ./corpus/METER-ML:_A_Multi-Sensor_Earth_Observation_Benchmark_for_Automated_Methane_Source_Mapping.pdf
processing file ./corpus/Model-Based_Compensation_of_Moving_Tissue_for_State_Recognition_in_Robotic-Assisted_Pedicle_Drilling.pdf
proc

processing file ./corpus/BYOL-Explore:_Exploration_by_Bootstrapped_Prediction.pdf
processing file ./corpus/From_cosmic_explosions_to_terrestrial_fires?.pdf
processing file ./corpus/Re-recognition_of_the_ideal_gas_and_real_gas.pdf
processing file ./corpus/Efficient_quantum_tensor_product_expanders_and_unitary_t-designs_via_the_zigzag_product.pdf
processing file ./corpus/On_the_distribution_of_monochromatic_complete_subgraphs_and_arithmetic_progressions.pdf
processing file ./corpus/Reservoir_Computing_Using_Complex_Systems.pdf
processing file ./corpus/On_doubly_warped_product_immersions.pdf
processing file ./corpus/Self-Exploration_in_Complex_Unknown_Environments_using_Hybrid_Map_Representation.pdf
processing file ./corpus/Deep_Reinforcement_Learning_with_Enhanced_Safety_for_Autonomous_Highway_Driving.pdf
processing file ./corpus/Non-Identity_Check_Remains_QMA-Complete_for_Short_Circuits.pdf
processing file ./corpus/Leveraging_Vision_Reconstruction_Pipelines_for_Satellite_Imagery.pdf
pro

processing file ./corpus/Implementing_Efficient_Message_Logging_Protocols_as_MPI_Application_Extensions.pdf
processing file ./corpus/Lower_bounds_for_piercing_and_coloring_boxes.pdf
processing file ./corpus/Deriving_environmental_contours_from_highest_density_regions.pdf
processing file ./corpus/A_Dirichlet_Regression_Model_for_Compositional_Data_with_Zeros.pdf
processing file ./corpus/Product_number_counting_statistics_from_stochastic_bursting_birth-death_processes.pdf
processing file ./corpus/Learning_to_Label_Seismic_Structures_with_Deconvolution_Networks_and_Weak_Labels.pdf
processing file ./corpus/A_novel_energy_factorization_approach_for_the_diffuse-interface_model_with_Peng-Robinson_equation_of_state.pdf
processing file ./corpus/LSM-DFN_Modeling_for_Seismic_Responses_in_Complex_Fractured_Media:_Comparison_of_Static_and_Dynamic_Elastic_Moduli.pdf
processing file ./corpus/Animal_inspired_Application_of_a_Variant_of_Mel_Spectrogram_for_Seismic_Data_Processing.pdf
processing file ./

processing file ./corpus/Completeness_in_Polylogarithmic_Time_and_Space.pdf
processing file ./corpus/Fractures_in_complex_fluids:_the_case_of_transient_networks.pdf
processing file ./corpus/A_Principal-Agent_Model_of_Systems_Engineering_Processes_with_Application_to_Satellite_Design.pdf
processing file ./corpus/Soliton_generation_by_local_resonance_interaction.pdf
processing file ./corpus/Focused_blind_deconvolution.pdf
processing file ./corpus/DSm_Vector_Spaces_of_Refined_Labels.pdf
processing file ./corpus/On_poroelastic_strain_energy_degradation_in_the_variational_phase--field_models_for_hydraulic_fracture.pdf
processing file ./corpus/Adaptive_Feed_Rate_Policies_for_Spiral_Drilling_Using_Markov_Decision_Process.pdf
processing file ./corpus/A_Decision_Support_System_for_Multi-target_Geosteering.pdf
processing file ./corpus/Multiresolution_Analysis_and_Learning_for_Computational_Seismic_Interpretation.pdf
processing file ./corpus/Moving_Fast_With_Broken_Data.pdf
processing file ./corp

processing file ./corpus/Estimating_oil_recovery_factor_using_machine_learning:_Applications_of_XGBoost_classification.pdf
processing file ./corpus/Virtual_Element_Method_for_geomechanics_on_reservoir_grids.pdf
processing file ./corpus/Software_Engineering_at_Google.pdf
processing file ./corpus/Understanding_the_Impact_of_Open-Framework_Conglomerates_on_Water-Oil_Displacements:_Victor_Interval_of_the_Ivishak_Reservoir,_Prudhoe_Bay_Field,_Alaska.pdf
processing file ./corpus/Physical_picture_for_fractures_in_stratified_materials:_viscoelastic_effects_in_large_cracks.pdf
processing file ./corpus/Eco_-_No(?)_-_Physics_-_comments_and_reflexions_-.pdf
processing file ./corpus/Performance_of_Reservoir_Discretizations_in_Quantum_Transport_Simulations.pdf
processing file ./corpus/Tools_and_Benchmarks_for_Automated_Log_Parsing.pdf
processing file ./corpus/Summarizing_Unstructured_Logs_in_Online_Services.pdf
processing file ./corpus/A_Finite-Volume_Discretization_for_Deformation_of_Fractured_Medi

processing file ./corpus/The_k-NN_algorithm_for_compositional_data:_a_revised_approach_with_and_without_zero_values_present.pdf
processing file ./corpus/Hydrodynamics_of_a_particle_model_in_contact_with_stochastic_reservoirs.pdf
processing file ./corpus/Infinite_Growth:_A_Curse_or_a_Blessing?.pdf
processing file ./corpus/Photometric_classification_of_HSC_transients_using_machine_learning.pdf
processing file ./corpus/On_the_existence_of_non-trivial_steady-state_size-distributions_for_a_class_of_flocculation_equations.pdf
processing file ./corpus/Semi-Automated_Segmentation_of_Geoscientific_Data_Using_Superpixels.pdf
processing file ./corpus/Vibration_Suppression_for_Coupled_Wave_PDEs_in_Deep-Sea_Construction.pdf
processing file ./corpus/Framed_duality_and_mirror_symmetry_for_toric_complete_intersections.pdf
processing file ./corpus/Experimental_study_on_the_gradual_fracture_of_layers_in_multi-layer_laminated_glass_plates_under_low-velocity_impact.pdf
processing file ./corpus/Granular_dy

processing file ./corpus/The_multi-objective_optimisation_of_breakwaters_using_evolutionary_approach.pdf
processing file ./corpus/Detection_of_vertebral_fractures_in_CT_using_3D_Convolutional_Neural_Networks.pdf
processing file ./corpus/Spatiotemporal_Modeling_of_Seismic_Images_for_Acoustic_Impedance_Estimation.pdf
processing file ./corpus/A_Regional_Oil_Extraction_and_Consumption_Model._Part_II:_Predicting_the_declines_in_regional_oil_consumption.pdf
processing file ./corpus/Nanoemulsions_obtained_via_bubble_bursting_at_a_compound_interface.pdf
processing file ./corpus/Theoretical_simulation_and_experimental_verification_of_dynamic_caustic_manipulation_using_a_deformable_mirror_for_laser_material_processing.pdf
processing file ./corpus/Fractional_Quantum_Heat_Engine.pdf
processing file ./corpus/Continuous_Cadence_Acquisition_of_the_LSST_Deep_Drilling_Fields.pdf
processing file ./corpus/Refined_Hopf_Link_Revisited.pdf
processing file ./corpus/Asymmetric_evolutionary_games_with_environm

processing file ./corpus/Shallow_Water_Bathymetry_Mapping_from_UAV_Imagery_based_on_Machine_Learning.pdf
processing file ./corpus/Prospects_for_Multi-omics_in_the_Microbial_Ecology_of_Water_Engineering.pdf
processing file ./corpus/Can_Artificial_Neural_Networks_be_Applied_in_Seismic_Predicition?_Preliminary_Analysis_Applying_Radial_Topology._Case:_Mexico.pdf
processing file ./corpus/Safety-Critical_Adaptation_in_Self-Adaptive_Systems.pdf
processing file ./corpus/The_Local_Hamiltonian_problem_on_a_line_with_eight_states_is_QMA-complete.pdf
processing file ./corpus/A_physically_based_model_for_the_electrical_conductivity_of_water-saturated_porous_media.pdf
processing file ./corpus/How_do_mobility_restrictions_and_social_distancing_during_COVID-19_affect_the_crude_oil_price?.pdf
processing file ./corpus/On_completeness_of_logic_programs.pdf
processing file ./corpus/On_refined_Young_inequalities.pdf
error with text extract from file ./corpus/On_refined_Young_inequalities.pdf
processing fil

processing file ./corpus/Mergers_of_Systems_Containing_Gas.pdf
processing file ./corpus/Additivity_of_multiple_heat_reservoirs_in_Langevin_equation.pdf
processing file ./corpus/Sanction_or_Financial_Crisis?_An_Artificial_Neural_Network-Based_Approach_to_model_the_impact_of_oil_price_volatility_on_Stock_and_industry_indices.pdf
processing file ./corpus/Gas_transport_in_partially-saturated_sand_packs.pdf
processing file ./corpus/Deep-learning_inversion:_a_next_generation_seismic_velocity-model_building_method.pdf
processing file ./corpus/Quantitative_Assessment_of_Adulteration_and_Reuse_of_Coconut_Oil_Using_Transmittance_Multispectral_Imaging.pdf
processing file ./corpus/Littelmann's_Refined_Demazure_Character_Formula_Revisited.pdf
processing file ./corpus/Psychological_Safety_and_Norm_Clarity_in_Software_Engineering_Teams.pdf
processing file ./corpus/Bucket-brigade_inspired_power_line_network_protocol_for_sensed_quantity_profile_acquisition_with_smart_sensors_deployed_as_a_queue_in_hars

processing file ./corpus/Topological_characteristics_of_oil_and_gas_reservoirs_and_their_applications.pdf
processing file ./corpus/Stability_of_the_Heisenberg_Product_on_Symmetric_Functions.pdf
processing file ./corpus/GPT-2C:_A_GPT-2_parser_for_Cowrie_honeypot_logs.pdf
processing file ./corpus/Learning_and_Visualizing_Localized_Geometric_Features_Using_3D-CNN:_An_Application_to_Manufacturability_Analysis_of_Drilled_Holes.pdf
processing file ./corpus/COVID-19_societal_response_captured_by_seismic_noise_in_China_and_Italy.pdf
processing file ./corpus/Software_Engineering_und_Software_Engineering_Forschung_im_Zeitalter_der_Digitalisierung.pdf
processing file ./corpus/Direct_current_resistivity_with_steel-cased_wells.pdf
processing file ./corpus/Experimental_Investigation_of_Proppant_Flow_and_Transport_Dynamics_Through_Fracture_Intersections.pdf
processing file ./corpus/Swimming_statistics_of_cargo-loaded_single_bacteria.pdf
processing file ./corpus/Large-scale_detection_and_categorizatio

processing file ./corpus/Mixed_problems_for_degenerate_abstract_parabolic_equations_and_applications.pdf
processing file ./corpus/Projection-based_resolved_interface_mixed-dimension_method_for_embedded_tubular_network_systems.pdf
processing file ./corpus/Multi-AGV's_Temporal_Memory-based_RRT_Exploration_in_Unknown_Environment.pdf
processing file ./corpus/A_Framework_for_the_Automated_Parameterization_of_a_Sensorless_Bearing_Fault_Detection_Pipeline.pdf
processing file ./corpus/On_the_completion_of_Skorokhod_space.pdf
processing file ./corpus/The_influence_of_density_of_modes_on_dark_lines_in_spontaneous_emission.pdf
processing file ./corpus/Incorporation_of_Strong_Motion_Duration_in_Incremental-based_Seismic_Assessments.pdf
processing file ./corpus/A_logico-linguistic_inquiry_into_the_foundations_of_physics:_Part_I.pdf
processing file ./corpus/Vertical_mixing_in_oil_spill_modelling.pdf
processing file ./corpus/Completion_of_skew_completable_unimodular_rows.pdf
processing file ./corpus/

processing file ./corpus/Management_of_Social_and_Economic_Development_of_Municipalities.pdf
processing file ./corpus/Almost_Optimal_Inapproximability_of_Multidimensional_Packing_Problems.pdf
processing file ./corpus/Enabling_the_self-contained_refrigerator_to_work_beyond_its_limits_by_filtering_the_reservoir.pdf
processing file ./corpus/Exploiting_Reuse_in_Pipeline-Aware_Hyperparameter_Tuning.pdf
processing file ./corpus/Automated_seismic-to-well_ties?.pdf
processing file ./corpus/Eastern_Arctic_ambient_noise_on_a_drifting_vertical_array.pdf
processing file ./corpus/Selecting_Optimal_Trace_Clustering_Pipelines_with_AutoML.pdf
processing file ./corpus/Combining_STPA_and_BDD_for_Safety_Analysis_and_Verification_in_Agile_Development:_A_Controlled_Experiment.pdf
processing file ./corpus/Enhanced_dielectric_breakdown_performance_of_anatase_and_rutile_titania_based_nano-oils.pdf
processing file ./corpus/Seizing_Opportunity:_Maintenance_Optimization_in_Offshore_Wind_Farms_Considering_Accessi

processing file ./corpus/State-of-the-art_SPH_solver_DualSPHysics:_from_fluid_dynamics_to_multiphysics_problems.pdf
processing file ./corpus/Model-Based_Safety-Cases_for_Software-Intensive_Systems.pdf
processing file ./corpus/Narratives_in_economics.pdf
processing file ./corpus/Equivalent_Classification_Mapping_for_Weakly_Supervised_Temporal_Action_Localization.pdf
processing file ./corpus/Joint_Synthesis_of_Safety_Certificate_and_Safe_Control_Policy_using_Constrained_Reinforcement_Learning.pdf
processing file ./corpus/Classification_of_complete_N=2_supersymmetric_theories_in_4_dimensions.pdf
processing file ./corpus/Deciding_Regularity_of_Hairpin_Completions_of_Regular_Languages_in_Polynomial_Time.pdf
processing file ./corpus/A_new_safety-guided_design_methodology_to_complement_model-based_safety_analysis_for_safety_assurance.pdf
processing file ./corpus/Seismic_Signatures_of_Stellar_Magnetic_Activity_--_What_Can_We_Expect_from_TESS?.pdf
processing file ./corpus/The_large_scale_impact

processing file ./corpus/Coverage_based_testing_for_V&V_and_Safety_Assurance_of_Self-driving_Autonomous_Vehicles:_A_Systematic_Literature_Review.pdf
processing file ./corpus/Products_of_rough_finite_state_machines.pdf
processing file ./corpus/GSM_based_CommSense_system_to_measure_and_estimate_environmental_changes.pdf
processing file ./corpus/Plumber:_Diagnosing_and_Removing_Performance_Bottlenecks_in_Machine_Learning_Data_Pipelines.pdf
processing file ./corpus/Polysiloxane_surfactants_for_the_dispersion_of_carbon_nanotubes_in_non-polar_organic_solvents.pdf
processing file ./corpus/Geophysical_inversions_to_delineate_rocks_with_CO$_2$_sequestration_potential_through_carbon_mineralization.pdf
processing file ./corpus/Microwave_probe_sensing_location_for_Venturi-based_real-time_multiphase_flowmeter.pdf
processing file ./corpus/Environmental_engineering_for_quantum_energy_transport.pdf
processing file ./corpus/Environmental_engineering_is_an_emergent_feature_of_diverse_ecosystems_and_driv

processing file ./corpus/Architecting_Safer_Autonomous_Aviation_Systems.pdf
processing file ./corpus/Fluid_flow_through_anisotropic_and_deformable_double_porosity_media_with_ultra-low_matrix_permeability:_A_continuum_framework.pdf
processing file ./corpus/Neolithic_stone_settlements_as_locally_resonant_metasurfaces.pdf
processing file ./corpus/Lignocellulosic_biomass:_a_sustainable_platform_for_the_production_of_bio-based_chemicals_and_polymers.pdf
processing file ./corpus/Evaluation_of_tsunami_wave_energy_generated_by_earthquakes_in_the_Makran_subduction_zone.pdf
processing file ./corpus/Business_Cycles_as_Collective_Risk_Fluctuations.pdf
processing file ./corpus/One-particle_engine_with_a_porous_piston.pdf
processing file ./corpus/Hydraulic_Fracture_Propagation_in_Naturally_Fractured_Reservoirs:_Complex_Fracture_or_Fracture_Networks.pdf
processing file ./corpus/Microdroplets_nucleation_by_dissolution_of_a_multicomponent_drop_in_a_host_liquid.pdf
processing file ./corpus/Scene_Complet

processing file ./corpus/Information_generating,_sharing_and_manipulating_Source-Reservoir-Sink_model_of_self-organizing_dissipative_structures.pdf
processing file ./corpus/Using_word_embedding_for_environmental_violation_analysis:_Evidence_from_Pennsylvania_unconventional_oil_and_gas_compliance_reports.pdf
processing file ./corpus/Fundamental_Limits_of_Stochastic_Shared_Caches_Networks.pdf
processing file ./corpus/A_Multi-Stencil_Fast_Marching_Method_with_Path_Correction_for_Efficient_Reservoir_Simulation_and_Automated_History_Matching.pdf
processing file ./corpus/Industrial_applications_of_digital_rock_technology.pdf
processing file ./corpus/Fast_approximate_simulation_of_seismic_waves_with_deep_learning.pdf
processing file ./corpus/A_Caporaso-Harris_type_Formula_for_relative_refined_invariants.pdf
processing file ./corpus/A_multiscale_flux_basis_for_mortar_mixed_discretizations_of_reduced_Darcy-Forchheimer_fracture_models.pdf
processing file ./corpus/Balance_Scene_Learning_Mechanism

processing file ./corpus/Completely_positive_tensor_decomposition.pdf
processing file ./corpus/A_lower_limit_for_Newtonian-noise_models_of_the_Einstein_Telescope.pdf
processing file ./corpus/Stochastic_solutions_of_Stefan_problems.pdf
processing file ./corpus/A_nano_heat_engine_beyond_the_Carnot_limit.pdf
processing file ./corpus/Non-Analyticity_and_the_van_der_Waals_Limit.pdf
processing file ./corpus/Bringing_AI_pipelines_onto_cloud-HPC:_setting_a_baseline_for_accuracy_of_COVID-19_AI_diagnosis.pdf
processing file ./corpus/Solver-Free_Heuristics_to_Retrieve_Feasible_Points_for_Offshore_Wind_Farm_Collection_System.pdf
processing file ./corpus/Towards_Better_Driver_Safety:_Empowering_Personal_Navigation_Technologies_with_Road_Safety_Awareness.pdf
processing file ./corpus/Refining_Inductive_Types.pdf
processing file ./corpus/Fractional_radial-cylindrical_diffusivity_model_for_levels_of_heterogeneity_in_petroleum_reservoirs.pdf
processing file ./corpus/Exploration_of_Self-Propelling_Drople

processing file ./corpus/Probabilistic_characterization_of_the_effect_of_transient_stochastic_loads_on_the_fatigue-crack_nucleation_time.pdf
processing file ./corpus/Completeness_for_vector_lattices.pdf
processing file ./corpus/Modeling_Big_Data-based_Systems_through_Ontological_Trading.pdf
processing file ./corpus/A_complete_Heyting_algebra_whose_Scott_space_is_non-sober.pdf
processing file ./corpus/Self-regulating_jets_during_the_Common_Envelope_phase.pdf
processing file ./corpus/Experimental_Correlation-Boosted_Quantum_Engine.pdf
processing file ./corpus/Existence_of_minimizers_in_the_geometrically_non-linear_6-parameter_resultant_shell_theory_with_drilling_rotations.pdf
processing file ./corpus/Global_weak_solutions_of_PDEs_for_compressible_media:_A_compactness_criterion_to_cover_new_physical_situations.pdf
processing file ./corpus/On_generalized_Ramsey_numbers_for_3-uniform_hypergraphs.pdf
processing file ./corpus/The_Effect_of_Oil_Price_on_United_Arab_Emirates_Goods_Trade_Deficit

processing file ./corpus/Blade-shaped_(PKN)_Hydraulic_Fracture_Driven_By_A_Turbulent_Fluid_In_An_Impermeable_Rock.pdf
processing file ./corpus/Refined_Topological_Branes.pdf
processing file ./corpus/Optimizing_Thermodynamic_Cycles_with_Two_Finite-Sized_Reservoirs.pdf
processing file ./corpus/3D_and_4D_printing_in_dentistry_and_maxillofacial_surgery:_Recent_advances_and_future_perspectives.pdf
processing file ./corpus/Spatial-temporal_water_area_monitoring_of_Miyun_Reservoir_using_remote_sensing_imagery_from_1984_to_2020.pdf
processing file ./corpus/Attainability_of_maximum_work_and_the_reversible_efficiency_from_minimally_nonlinear_irreversible_heat_engines.pdf
processing file ./corpus/Toe-Heal-Air-Injection_Thermal_Recovery_Production_Prediction_and_Modelling_Using_Quadratic_Poisson_Polynomial_Regression.pdf
processing file ./corpus/Numerical_study_on_thermal_transpiration_flows_through_a_rectangular_channel.pdf
processing file ./corpus/Skyrmion_based_energy_efficient_straintronic_phy

processing file ./corpus/Frequency-compensated_PINNs_for_Fluid-dynamic_Design_Problems.pdf
processing file ./corpus/A_Simple_Software_Application_for_Simulating_Commercially_Available_Solar_Panels.pdf
processing file ./corpus/A_random_model_for_the_Paley_graph.pdf
processing file ./corpus/EmptyHeaded:_A_Relational_Engine_for_Graph_Processing.pdf
processing file ./corpus/Viscous_fingering_in_fractured_porous_media.pdf
processing file ./corpus/A_Liouvile-type_theorems_for_some_classes_of_complete_Riemannian_almost_product_manifolds_and_for_special_mappings_of_complete_Riemannian_manifolds.pdf
processing file ./corpus/Seismic_indices_--_a_deep_look_inside_evolved_stars.pdf
processing file ./corpus/Flexible_Production_Systems:_Automated_Generation_of_Operations_Plans_Based_on_ISA-95_and_PDDL.pdf
processing file ./corpus/Star_Products_for_Relativistic_Quantum_Mechanics.pdf
processing file ./corpus/Development_of_a_fracture_capture_simulator_to_quantify_the_instability_evolution_in_porous_me

processing file ./corpus/Anti-fatigue-fracture_hydrogels.pdf
processing file ./corpus/Transient_Synchronization_Stability_Analysis_of_Wind_Farms_with_MMC-HVDC_Integration_Under_Offshore_AC_Grid_Fault.pdf
processing file ./corpus/Acceleration_of_the_NVT-flash_calculation_for_multicomponent_mixtures_using_deep_neural_network_models.pdf
processing file ./corpus/Breadth-First_Pipeline_Parallelism.pdf
processing file ./corpus/Self_Organizing_Maps_to_efficiently_cluster_and_functionally_interpret_protein_conformational_ensembles.pdf
processing file ./corpus/Statistical_Modeling_of_Pipeline_Delay_and_Design_of_Pipeline_under_Process_Variation_to_Enhance_Yield_in_sub-100nm_Technologies.pdf
processing file ./corpus/Thermal_Conductivity_from_Core_and_Well_log_Data.pdf
processing file ./corpus/Intermittent_gravity-driven_flow_of_grains_through_narrow_pipes.pdf
processing file ./corpus/Collaborative_Pipeline_Using_Opportunistic_Mobile_Resources_via_D2D_for_Computation-Intensive_Tasks.pdf
processin

processing file ./corpus/Complemented_subspaces_of_products_of_Banach_spaces.pdf
processing file ./corpus/A_framework_for_subsurface_monitoring_by_integrating_reservoir_simulation_with_time-lapse_seismic_surveys.pdf
processing file ./corpus/Adaptive_Safety_Margin_Estimation_for_Safe_Real-Time_Replanning_under_Time-Varying_Disturbance.pdf
processing file ./corpus/Refiner:_Refining_Self-attention_for_Vision_Transformers.pdf
processing file ./corpus/Joint_Optimization_of_Wind_Farm_Layout_Considering_Optimal_Control.pdf
processing file ./corpus/On_well-covered_direct_products.pdf
processing file ./corpus/Performance_Analysis_Of_Neuro_Genetic_Algorithm_Applied_On_Detecting_Proportion_Of_Components_In_Manhole_Gas_Mixture.pdf
processing file ./corpus/How_does_bad_and_good_volatility_spill_over_across_petroleum_markets?.pdf
processing file ./corpus/Ring_Exploration_with_Myopic_Luminous_Robots.pdf
processing file ./corpus/Control_Co-design_of_a_Hydrokinetic_Turbine_with_Open-loop_Optimal_Contro

processing file ./corpus/A_coupled_model_of_episodic_warming,_oxidation_and_geochemical_transitions_on_early_Mars.pdf
processing file ./corpus/Efficiency_and_robustness_in_Monte_Carlo_sampling_of_3-D_geophysical_inversions_with_Obsidian_v0.1.2:_Setting_up_for_success.pdf
processing file ./corpus/The_Quadratic_Wasserstein_Metric_With_Squaring_Scaling_For_Seismic_Velocity_Inversion.pdf
processing file ./corpus/Proceedings_Third_International_Workshop_on_Engineering_Safety_and_Security_Systems.pdf
error with text extract from file ./corpus/Proceedings_Third_International_Workshop_on_Engineering_Safety_and_Security_Systems.pdf
processing file ./corpus/Cavity-based_reservoir_engineering_for_Floquet-engineered_superconducting_circuits.pdf
processing file ./corpus/Development_of_A_Scalable_Platform_for_Large-scale_Reservoir_Simulations_on_Parallel_computers.pdf
processing file ./corpus/Refined_analytic_torsion:_comparison_theorems_and_examples.pdf
processing file ./corpus/Investigating_studen

processing file ./corpus/Cultural_Barriers_to_Software_Productivity_Practices_at_Los_Alamos.pdf
processing file ./corpus/A_concise_review_on_THGEM_detectors.pdf
processing file ./corpus/Recognizing_well-dominated_graphs_is_coNP-complete.pdf
processing file ./corpus/Young_alpha-enriched_giant_stars_in_the_solar_neighbourhood.pdf
processing file ./corpus/The_classification_of_smooth_well-formed_Fano_weighted_complete_intersections.pdf
processing file ./corpus/Complete_sets_and_completion_of_sets_in_Banach_spaces.pdf
processing file ./corpus/Modification_of_the_trapped_field_in_bulk_high-temperature_superconductors_as_a_result_of_the_drilling_of_a_pattern_of_artificial_columnar_holes.pdf
processing file ./corpus/An_End-to-End_Two-Phase_Deep_Learning-Based_workflow_to_Segment_Man-made_Objects_Around_Reservoirs.pdf
processing file ./corpus/Oil-US_Stock_Market_Nexus:_Some_insights_about_the_New_Coronavirus_Crisis.pdf
processing file ./corpus/The_squeezed_thermal_reservoir_as_a_generalized_eq

processing file ./corpus/Efficient_p-multigrid_spectral_element_model_for_water_waves_and_marine_offshore_structures.pdf
processing file ./corpus/Characterizing_Uncertainty_in_the_Visual_Text_Analysis_Pipeline.pdf
processing file ./corpus/Investigations_of_the_pi_N_total_cross_sections_at_high_energies_using_new_FESR:_log_nu_or_(log_nu)^2.pdf
processing file ./corpus/Covariant-Contravariant_Refinement_Modal_$μ$-calculus.pdf
processing file ./corpus/Software_Uncertainty_in_Integrated_Environmental_Modelling:_the_role_of_Semantics_and_Open_Science.pdf
processing file ./corpus/The_refined_BPS_index_from_stable_pair_invariants.pdf
processing file ./corpus/Towards_Personalized_Preprocessing_Pipeline_Search.pdf
processing file ./corpus/Unsupervised_seismic_facies_classification_using_deep_convolutional_autoencoder.pdf
processing file ./corpus/Sets_completely_separated_by_functions_in_Bishop_Set_Theory.pdf
processing file ./corpus/Multiscale_simulation_of_injection-induced_fracture_slip_and_w

processing file ./corpus/Optimizing_Quantum_Noise-induced_Reservoir_Computing_for_Nonlinear_and_Chaotic_Time_Series_Prediction.pdf
processing file ./corpus/Solving_non-Markovian_open_quantum_systems_with_multi-channel_reservoir_coupling.pdf
processing file ./corpus/Decoherence_suppression_of_a_dissipative_qubit_by_the_non-Markovian_effect.pdf
processing file ./corpus/The_inverse_problem_in_Seismology._Seismic_moment_and_energy_of_earthquakes._Seismic_hyperbola.pdf
processing file ./corpus/Exploring_the_near-surface_at_the_lunar_South_Pole_with_geophysical_tools.pdf
processing file ./corpus/On_the_Spaces_and_Dimensions_of_Geographical_Systems.pdf
processing file ./corpus/Existence_of_complete_Lyapunov_functions_for_semiflows_on_separable_metric_spaces.pdf
processing file ./corpus/On_completely_regular_codes_with_minimum_eigenvalue_in_geometric_graphs.pdf
processing file ./corpus/Macroeconomic_performance_of_oil_price_shocks_in_Russia.pdf
processing file ./corpus/Polarization_Modeling_an

processing file ./corpus/A_numerical_exploration_of_signal_detector_arrangement_in_a_spin-wave_reservoir_computing_device.pdf
processing file ./corpus/Memory_Safety_Preservation_for_WebAssembly.pdf
processing file ./corpus/On_the_Correlation_Between_Atomic_Gas_and_Bars_in_Galaxies.pdf
processing file ./corpus/Does_Global_seismic_energy_release_increase?_An_analysis_based_on_the_Lithospheric_Seismic_Energy_Flow_Model_(LSEFM)._The_case_of_mega_-_earthquakes_(M_>_9).pdf
processing file ./corpus/Repair_Pipelining_for_Erasure-Coded_Storage:_Algorithms_and_Evaluation.pdf
processing file ./corpus/Thompson_Sampling_for_(Combinatorial)_Pure_Exploration.pdf
processing file ./corpus/Beyond_NGS_data_sharing_and_towards_open_science.pdf
processing file ./corpus/Capillary_fracture_of_ultrasoft_gels:_heterogeneity_and_delayed_nucleation.pdf
processing file ./corpus/Probing_the_surface_of_synthetic_opals_with_the_vanadyl-containing_crude_oil_by_using_EPR_and_ENDOR_techniques.pdf
processing file ./corp

processing file ./corpus/On_the_Containment_Problem_for_Linear_Sets.pdf
processing file ./corpus/Exact_3D_seismic_data_reconstruction_using_Tubal-Alt-Min_algorithm.pdf
processing file ./corpus/Deep_recommender_engine_based_on_efficient_product_embeddings_neural_pipeline.pdf
processing file ./corpus/Self-Supervised_Exploration_via_Disagreement.pdf
processing file ./corpus/Critical_bursts_in_filtration.pdf
processing file ./corpus/Model-driven_Engineering_of_Safety_and_Security_Systems:_A_Systematic_Mapping_Study.pdf
processing file ./corpus/Designing_biodegradable_surfactants_and_effective_biomolecules_with_dissipative_particle_dynamics.pdf
processing file ./corpus/The_MKID_Science_Data_Pipeline.pdf
processing file ./corpus/Pseudo-2D_RANS:_A_LiDAR-driven_mid-fidelity_model_for_simulations_of_wind_farm_flows.pdf
processing file ./corpus/On_Edge-Partitioning_of_Complete_Geometric_Graphs_into_Plane_Trees.pdf
processing file ./corpus/Perspectives_of_cross_correlation_in_seismic_monitoring_a

In [6]:
# Need to convert the .bin file to a .vec file
# this takes a while
isExist = os.path.exists(VECTORS_FILEPATH)
if not isExist:
    fasttext.util.download_model('en', if_exists='ignore')  # English
    ft = fasttext.load_model('cc.en.300.bin')
    if dimensions != 300:
        ft = fasttext.util.reduce_model(ft, dimensions)
    
    lines=[]
    
    # get all words from model
    words = ft.get_words()
    
    with open(VECTORS_FILEPATH,'w') as file_out:
        
        # the first line must contain number of total words and vector dimension
        file_out.write(str(len(words)) + " " + str(ft.get_dimension()) + "\n")
    
        # line by line, you append vectors to VEC file
        for w in words:
            v = ft.get_word_vector(w)
            vstr = ""
            for vi in v:
                vstr += " " + str(vi)
            try:
                file_out.write(w + vstr+'\n')
            except:
                pass

In [7]:
# train (fine-tune) the model
model_ft = fasttext.train_unsupervised(input=train_filename, lr=.1, epoch=200, bucket=200000,
                             minn=3, maxn=6, wordNgrams=max_ngrams, dim=embedding_size, 
                             pretrainedVectors=VECTORS_FILEPATH, thread=6)

Read 9M words
Number of words:  54842
Number of labels: 0
Progress:  62.3% words/sec/thread:   49261 lr:  0.037652 avg.loss:  0.128338 ETA:   0h47m41s  4.7% words/sec/thread:   52199 lr:  0.095320 avg.loss:  0.567002 ETA:   1h53m56sm24s  7.5% words/sec/thread:   49259 lr:  0.092519 avg.loss:  0.520770 ETA:   1h57m11sh57m48s 10.6% words/sec/thread:   48833 lr:  0.089372 avg.loss:  0.423819 ETA:   1h54m11s  49297 lr:  0.082986 avg.loss:  0.332546 ETA:   1h45m 2s  49223 lr:  0.082000 avg.loss:  0.323680 ETA:   1h43m56s 18.5% words/sec/thread:   49147 lr:  0.081516 avg.loss:  0.319638 ETA:   1h43m29s 18.5% words/sec/thread:   49139 lr:  0.081450 avg.loss:  0.319041 ETA:   1h43m25s 19.3% words/sec/thread:   49158 lr:  0.080654 avg.loss:  0.312827 ETA:   1h42m22s 21.9% words/sec/thread:   49295 lr:  0.078063 avg.loss:  0.294919 ETA:   1h38m48s  49292 lr:  0.077334 avg.loss:  0.290491 ETA:   1h37m53s ETA:   1h37m40s  1h37m34s 23.1% words/sec/thread:   49216 lr:  0.076857 avg.loss:  0.287712 E

In [8]:
model_ft.save_model(MODEL_NAME)

In [9]:
model_ft = fasttext.load_model(MODEL_NAME)
words = model_ft.get_words()
# Get the word vectors
vectors = [model_ft.get_word_vector(word) for word in words]

In [10]:
# Reduce dimensions using UMAP
umap_reducer = umap.UMAP(n_components=2, random_state=42)
reduced_vectors = umap_reducer.fit_transform(vectors)

In [13]:
# Plot the words using the 2-dimensional vectors from UMAP
plt.figure(figsize=(12, 12))

for i, word in enumerate(words):
    plt.scatter(reduced_vectors[i, 0], reduced_vectors[i, 1])
    plt.annotate(word, (reduced_vectors[i, 0], reduced_vectors[i, 1]), fontsize=10)

plt.title('2D Visualization of Word Vectors using UMAP')
plt.xlabel('UMAP X')
plt.ylabel('UMAP Y')
plt.show()

In [14]:
# Create a dataframe for Datashader
data = pd.DataFrame(reduced_vectors, columns=['x', 'y'])
data['word'] = words

# Create a Datashader image
def create_image(x_range, y_range, w, h):
    cvs = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
    agg = cvs.points(data, 'x', 'y', ds.count())
    img = tf.shade(agg, cmap=['lightblue', 'darkblue'], how='eq_hist')
    return tf.dynspread(img, threshold=0.5, max_px=4)

# Create a Bokeh plot
output_notebook()
p = figure(width=800, height=800, x_range=(-10, 10), y_range=(-10, 10),
           toolbar_location="above", tools="pan,wheel_zoom,box_zoom,reset")
p.axis.visible = False
InteractiveImage(p, create_image)
show(p)