# Synthesize a research paper
## Workflow
+ Define domain<br>
+ Build corpus<br>
+ Create embeddings model<br>
+ Perform WhiteSpace Analysis<br>
+ Identify best area for analysis<br>
+ Suggest topics for research<br>
+ Synthesize research paper content on selected topic

## Import Libraries

In [1]:
import os
import re
import string
import time

from arxiv import SortCriterion, SortOrder, Client, Search
import csv

import openai

import pdfplumber
import pytesseract
import fitz #pip install pymupdf
from PIL import Image
import io
import cv2
import numpy as np

from gensim.models.fasttext import FastText
import fasttext.util

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer

from sklearn.decomposition import PCA
# pip install umap-learn
import umap

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
2023-05-02 22:05:45.762100: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Inputs

In [2]:
# Define domain
domain = "petroleum engineering"
# misc definitions
corpus_directory = "./corpus"
max_papers = 100

In [3]:
 # Azure Instance
openai.api_key = ""
# your endpoint should look like the following https://YOUR_RESOURCE_NAME.openai.azure.com/
openai.api_base =  "" 
openai.api_version = '2022-12-01' # this may change in the future
openai.api_type = 'azure'
#This will correspond to the custom name you chose for your deployment when you deployed a model. 
deployment_name='' 

In [4]:
# Set the path to the tesseract executable
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract' 
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
stemmer = WordNetLemmatizer()
# required data and model downloads
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
en_stop = set(nltk.corpus.stopwords.words('english'))

# define model training parameters
# the number of dimensions is the number of vectors to create that represent a word
dimensions = 100
embedding_size = dimensions
window_size = 40
min_word = 3
down_sampling = 1e-2
min_ngrams = 3
max_ngrams = 6
enrich_vectors = 1

train_filename = domain.replace(" ", "_")+"_train.txt"
VECTORS_FILEPATH = "cc.en."+str(dimensions)+".vec"
MODEL_NAME = domain.replace(" ", "_")+"_model.bin"
PRETRAINED_MODEL = "cc.en."+str(dimensions)+".bin"

[nltk_data] Downloading package wordnet to /home/titan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/titan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/titan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/titan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/titan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/titan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Helper functions

In [5]:
# create a function to generate key words for research in the defined domain
def generate_research_keywords_for_domain(domain:str)->str:
    prompt = "Create comma seperated key words for research queries in the "+domain+" domain:"
    response = openai.Completion.create(
            engine=deployment_name,
            prompt=prompt,
            temperature=0.20,
            max_tokens=500
        )
    result = response.choices[0].text.strip()
    return result

def string_to_list(text:str)->str:
    """
    Converts a string of comma-separated values into a list of individual elements.
    """
    # Split the string into individual values using the comma separator
    values = text.split(',')
    
    # Remove any whitespace from the values
    values = [value.strip().replace(".", "") for value in values]
    
    # Return the list of values
    return values

def create_directory_if_not_exists(path:str):
    """
    Creates a directory at the specified path if it does not already exist.
    """
    if not os.path.exists(path):
        os.makedirs(path)
    return

# create a function to save papers by removing special characters from the file name
def clean_title(title:str):
    title = title.replace(" ", "_")
    title = title.replace("\\", "")
    title = title.replace("/", "")
    return title

def file_exists(path:str):
    """
    Checks if a file exists at the specified path.
    Returns True if the file exists, False otherwise.
    """
    return os.path.exists(path) and os.path.isfile(path)

def get_typed_files_in_directory(directory:str, file_type:str) -> list:
    """
    Returns a list of all files in the specified directory.
    """
    # Initialize an empty list to hold the PDF files
    typed_files = []

    # Loop over all files in the directory
    for filename in os.listdir(directory):
        # Check if the file has a .pdf extension
        if filename.lower().endswith(f".{file_type.lower()}"):
            # If it's a PDF file, add the full path to the list
            typed_files.append(os.path.join(directory, filename))

    # Return the list of PDF files
    return typed_files

def write_or_append_file(filepath, text):
    """
    Writes the specified text to a file, creating the file if it does not exist,
    or appends the text to the end of the file if it already exists.
    """
    mode = "a" if os.path.exists(filepath) else "w"
    with open(filepath, mode) as f:
        f.write(text)
        
def extract_text_from_pdf(filepath:str)->str:
    """
    Extracts text from a PDF file, using OCR if the PDF contains images.
    """
    # Open the PDF file using pdfplumber
    with pdfplumber.open(filepath) as pdf:
        # Initialize an empty string to hold the extracted text
        text = ""

        # Loop over all pages in the PDF
        for page in pdf.pages:
            # Extract the page text using pdfplumber
            page_text = page.extract_text()

            # If the page text is None, the page may contain images, so use OCR to extract the text
            if page_text is None:
                # Convert the page image to grayscale and apply thresholding to remove noise
                page_image = page.to_image(resolution=150)
                page_image = page_image.convert('L')
                page_image = page_image.point(lambda x: 0 if x < 180 else 255, '1')

                # Use pytesseract to extract the text from the image
                page_text = pytesseract.image_to_string(page_image)

            # Add the page text to the overall text
            text += page_text

    # Return the extracted text
    return text

def rotate_image(image, angle):
    """
    Rotate an image by a given angle.

    Args:
        image (np.array): The image to rotate.
        angle (float): The angle to rotate the image by.

    Returns:
        np.array: The rotated image.
    """
    rows, cols = image.shape[:2]
    rotation_matrix = cv2.getRotationMatrix2D((cols / 2, rows / 2), angle, 1)
    return cv2.warpAffine(image, rotation_matrix, (cols, rows))

def correct_skew(image):
    """
    Correct skew in an image.

    Args:
        image (np.array): The image to correct skew for.

    Returns:
        np.array: The image with corrected skew.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gray = cv2.bitwise_not(gray)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    coords = np.column_stack(np.where(thresh > 0))
    angle = cv2.minAreaRect(coords)[-1]

    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle

    return rotate_image(image, angle)

def extract_text_from_pdf(pdf_path):
    """
    Extract text from an image-based PDF using the pymupdf library.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The extracted text from the PDF.
    """
    # Open the PDF file
    doc = fitz.open(pdf_path)

    # Initialize an empty string to store the extracted text
    extracted_text = ""

    # Loop through each page in the PDF
    for page in doc:
        # Extract text from the page and append it to the extracted_text string
        extracted_text += page.get_text("text")

    # Close the PDF file
    doc.close()

    return extracted_text

def preprocess_text(document):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))
        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)
        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)
        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)
        # standardize white space
        document = re.sub(r'\s+', ' ', document)
        # Removing linebreaks '\n'
        document = re.sub(r'[\r\n]+', '', document)
        
        ## remove digits with regular expression
        document = re.sub(r'\d', ' ', document)
        #MM remove any patterns matching standard url format
        url_pattern = r'((http|ftp|https):\/\/)?[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?'
        document = re.sub(url_pattern, ' ', document)
        #MM remove all non-ascii characters
        document = ''.join(character for character in document if ord(character)<128)
        #MM filter funny characters, if any.
        printable = set(string.printable)
        document = filter(lambda x: x in printable, document) #filter funny characters, if any.
        document = "".join(list(document))
        

        # Converting to Lowercase
        document = document.lower().translate(remove_punct_dict)
        
        

        # Lemmatization
        tokens = document.split()
        tokens = [stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in en_stop]
        #Remove all short words that have a length < 3 character
        tokens = [word for word in tokens if len(word) > 3]
        # remove remaining tokens that are not alphabetic
        tokens = [word for word in tokens if word.isalpha()]

        preprocessed_text = ' '.join(tokens)

        return preprocessed_text
    
def loadModel(modelName):
    model = FastText.load(modelName)
    return model

## Build corpus
Here we will build a corpus of documents from ARXIV papers<br>
To search papers, generate a list of keywords of research areas in the defined domain

In [6]:
keywords = generate_research_keywords_for_domain(domain)
keywords = string_to_list(keywords)

In [7]:
keywords

['petroleum engineering',
 'oil exploration',
 'drilling',
 'reservoir engineering',
 'production engineering',
 'well completion',
 'well logging',
 'geology',
 'seismic surveying',
 'petroleum economics',
 'petroleum refining',
 'petroleum geology',
 'petroleum technology',
 'petroleum law',
 'petroleum safety',
 'petroleum management']

In [None]:
# download papers in each research area
create_directory_if_not_exists(corpus_directory)
for searchterm in keywords:
    # create arxiv client
    big_slow_client = Client(
      page_size = 1000,
      delay_seconds = 10,
      num_retries = 5
    )
    for result in big_slow_client.results(Search(query=searchterm, max_results = max_papers, sort_by = SortCriterion.Relevance, sort_order = SortOrder.Ascending)):
        filename = clean_title(result.title)+".pdf"
        filepath = os.path.join(corpus_directory, filename)
        try:
            if file_exists(filepath):
                print("downloaded paper: "+filepath)
            else:
                print("downloading paper: "+filepath)
                result.download_pdf(dirpath=corpus_directory, filename=filename)
                time.sleep(2)
        except:
            print("Error downloading paper: "+filename)
            time.sleep(10)
            continue

downloaded paper: ./corpus/Optimal_Economic_Operation_of_Liquid_Petroleum_Products_Pipeline_Systems.pdf
downloaded paper: ./corpus/Phase_behavior_in_petroleum_fluids_(A_Detailed_Descriptive_and_Illustrative_Account).pdf
downloaded paper: ./corpus/On_the_Determination_of_the_Solar_Rotation_Elements_i,_Ω_and_Period_using_Sunspot_Observations_by_Ruđer_Bošković_in_1777.pdf
downloaded paper: ./corpus/On_gray-box_modeling_for_virtual_flow_metering.pdf
downloaded paper: ./corpus/Reaction_Mechanisms_in_Petroleum:_From_Experimentation_to_Upgrading_and_Geological_Conditions.pdf
downloaded paper: ./corpus/Simulation_of_incompressible_two-phase_flow_in_porous_media_with_large_timesteps.pdf
downloaded paper: ./corpus/Multi-task_learning_for_virtual_flow_metering.pdf
downloaded paper: ./corpus/Forecasting_the_production_of_Distillate_Fuel_Oil_Refinery_and_Propane_Blender_net_production_by_using_Time_Series_Algorithms.pdf
downloaded paper: ./corpus/A_Decision_Support_System_for_Multi-target_Geosteeri

downloaded paper: ./corpus/Application_of_Probabilistic_Graphical_Models_in_Forecasting_Crude_Oil_Price.pdf
downloaded paper: ./corpus/Experimental_evaluation_of_a_silicone_oil_as_an_oxidation_inhibitor_for_magnesium_alloy_under_contact_sliding_at_elevated_temperatures.pdf
downloaded paper: ./corpus/Exploring_the_use_of_Transition_Path_Theory_in_building_an_oil_spill_prediction_scheme.pdf
downloaded paper: ./corpus/Exploration_of_Spanish_Olive_Oil_Quality_with_a_Miniaturized_Low-Cost_Fluorescence_Sensor_and_Machine_Learning_Techniques.pdf
downloaded paper: ./corpus/Olive_Oil_is_Made_of_Olives,_Baby_Oil_is_Made_for_Babies:_Interpreting_Noun_Compounds_using_Paraphrases_in_a_Neural_Model.pdf
downloaded paper: ./corpus/Stability_of_additive-free_water-in-oil_emulsions.pdf
downloaded paper: ./corpus/Pickering_emulsions_with_alpha-cyclodextrin_inclusions:_Structure_and_thermal_stability.pdf
downloaded paper: ./corpus/Physico-chemical_properties_extraction_from_the_fluorescence_spectrum_with_

downloaded paper: ./corpus/Probing_the_surface_of_synthetic_opals_with_the_vanadyl-containing_crude_oil_by_using_EPR_and_ENDOR_techniques.pdf
downloaded paper: ./corpus/Enhanced_dielectric_breakdown_performance_of_anatase_and_rutile_titania_based_nano-oils.pdf
downloaded paper: ./corpus/Dust_evolution,_a_global_view:_II._Top-down_branching,_nano-particle_fragmentation_and_the_mystery_of_the_diffuse_interstellar_band_carriers.pdf
downloaded paper: ./corpus/Single_beam_acoustical_tweezers_based_on_focused_beams:_A_numerical_analysis_of_2D_and_3D_trapping_capabilities.pdf
downloaded paper: ./corpus/Forecasting_the_abnormal_events_at_well_drilling_with_machine_learning.pdf
downloaded paper: ./corpus/Violent_music_vs_violence_and_music:_Drill_rap_and_violent_crime_in_London.pdf
downloaded paper: ./corpus/DREAMS:_Drilling_and_Extraction_Automated_System.pdf
downloaded paper: ./corpus/On_the_characterization_of_drilling_rotation_in_the_6-parameter_resultant_shell_theory.pdf
downloaded paper: 

downloaded paper: ./corpus/Maximal_Steered_Coherence_Protection_by_Quantum_Reservoir_Engineering.pdf
downloaded paper: ./corpus/Extracting_work_from_random_collisions:_A_model_of_a_quantum_heat_engine.pdf
downloaded paper: ./corpus/Designing_reservoirs_for_1t_decoherence_of_a_qubit.pdf
downloaded paper: ./corpus/Carnot's_theorem_for_nonthermal_stationary_reservoirs.pdf
downloaded paper: ./corpus/A_Micrometer-sized_Heat_Engine_Operating_Between_Bacterial_Reservoirs.pdf
downloaded paper: ./corpus/Efficiency_of_heat_engines_coupled_to_nonequilibrium_reservoirs.pdf
downloaded paper: ./corpus/Optimization_performance_of_quantum_Otto_heat_engines_and_refrigerators_with_squeezed_thermal_reservoirs.pdf
downloaded paper: ./corpus/Quantitative_supply_security_related_significance_measures_for_gas_reservoires.pdf
downloaded paper: ./corpus/Electromagnetically_Induced_Transparency_and_Quantum_Heat_Engines.pdf
downloaded paper: ./corpus/The_thermodynamics_governing_'endoreversible'_engines.pdf
down

downloaded paper: ./corpus/A_Business_Maturity_Model_of_Software_Product_Line_Engineering.pdf
downloaded paper: ./corpus/An_Architecture_Process_Maturity_Model_of_Software_Product_Line_Engineering.pdf
downloaded paper: ./corpus/Variability_and_Evolution_in_Systems_of_Systems.pdf
downloaded paper: ./corpus/Towards_a_Systems_Engineering_based_Automotive_Product_Engineering_Process.pdf
downloaded paper: ./corpus/Towards_a_Systems_Engineering_Essence.pdf
downloaded paper: ./corpus/Mind_the_Gap:_On_the_Relationship_Between_Automatically_Measured_and_Self-Reported_Productivity.pdf
downloaded paper: ./corpus/Resolving_code_smells_in_software_product_line_using_refactoring_and_reverse_engineering.pdf
downloaded paper: ./corpus/Fault-Tolerant_Dot-Product_Engines.pdf
downloaded paper: ./corpus/Data_Engineering_for_the_Analysis_of_Semiconductor_Manufacturing_Data.pdf
downloaded paper: ./corpus/Optimization_analysis_of_an_endoreversible_quantum_heat_engine_with_efficient_power_function.pdf
downloa

downloaded paper: ./corpus/Completions_of_Countable_Excellent_Domains_and_Countable_Noncatenary_Domains.pdf
downloaded paper: ./corpus/Generalized_existential_completions_and_their_regular_and_exact_completions.pdf
downloaded paper: ./corpus/Uniqueness_of_Instantaneously_Complete_Ricci_flows.pdf
downloaded paper: ./corpus/A_completeness_result_for_implicit_justification_stit_logic.pdf
downloaded paper: ./corpus/A_note_on_the_statistical_view_of_matrix_completion.pdf
downloaded paper: ./corpus/On_well-dominated_direct,_Cartesian_and_strong_product_graphs.pdf
downloaded paper: ./corpus/Dedekind_complete_and_order_continuous_Banach_$C(K)$-modules.pdf
downloaded paper: ./corpus/Smooth_prime_Fano_complete_intersections_in_toric_varieties.pdf
downloaded paper: ./corpus/Profinite_completions_and_MacNeille_completions_of_MV-algebras.pdf
downloaded paper: ./corpus/On_Stoltenberg's_quasi-uniform_completion.pdf
downloaded paper: ./corpus/Action_Completion:_A_Temporal_Model_for_Moment_Detection.pd

downloaded paper: ./corpus/Dynamic_and_Multi-functional_Labeling_Schemes.pdf
downloaded paper: ./corpus/Disciplined_Geometric_Programming.pdf
downloaded paper: ./corpus/Random_input_helps_searching_predecessors.pdf
downloaded paper: ./corpus/The_Log_Product_Formula.pdf
downloaded paper: ./corpus/Hyperbolicity_for_log_smooth_families_with_maximal_variation.pdf
downloaded paper: ./corpus/Large_gaps_between_primes.pdf
downloaded paper: ./corpus/Higher_order_corrections_for_anisotropic_bootstrap_percolation.pdf
downloaded paper: ./corpus/Improved_Parallel_Construction_of_Wavelet_Trees_and_RankSelect_Structures.pdf
downloaded paper: ./corpus/On_the_Profile_of_Multiplicities_of_Complete_Subgraphs.pdf
downloaded paper: ./corpus/Near-Optimal_(Euclidean)_Metric_Compression.pdf
downloaded paper: ./corpus/HyperLogLogLog:_Cardinality_Estimation_With_One_Log_More.pdf
downloaded paper: ./corpus/Anonymization_of_System_Logs_for_Privacy_and_Storage_Benefits.pdf
downloaded paper: ./corpus/Stringy_invar

downloaded paper: ./corpus/Geology_of_symmetric_grounds.pdf
downloaded paper: ./corpus/Geology_prediction_based_on_operation_data_of_TBM:_comparison_between_deep_neural_network_and_statistical_learning_methods.pdf
downloaded paper: ./corpus/From_Cosmic_Explosions_to_Terrestrial_Fires?:_A_Reply.pdf
downloaded paper: ./corpus/Semi-Automated_Segmentation_of_Geoscientific_Data_Using_Superpixels.pdf
downloaded paper: ./corpus/The_Non-Organic_Theory_of_the_Genesis_of_Petroleum.pdf
downloaded paper: ./corpus/Geophysical_tomography_in_engineering_geology:_an_overview.pdf
downloaded paper: ./corpus/Generating_Realistic_Geology_Conditioned_on_Physical_Measurements_with_Generative_Adversarial_Networks.pdf
downloaded paper: ./corpus/Universal_Graphs_at_$aleph_{ω_1+1}$_and_Set-theoretic_Geology.pdf
downloaded paper: ./corpus/Teaching_Waves_with_Google_Earth.pdf
downloaded paper: ./corpus/Density_vs_distance_for_the_DUNE_beam_from_two_recent_geology_density_maps.pdf
downloaded paper: ./corpus/Fast_r

downloaded paper: ./corpus/Feasibility_and_applications_of_the_spin-echo_modulation_option_for_a_small_angle_neutron_scattering_instrument_at_the_European_Spallation_Source.pdf
downloaded paper: ./corpus/Multiaxis_atom_interferometry_with_a_single_diode_laser_and_a_pyramidal_magneto-optical_trap.pdf
downloaded paper: ./corpus/Parametrization_and_generation_of_geological_models_with_generative_adversarial_networks.pdf
downloaded paper: ./corpus/First_Principles_Free-Energy_Theory_of_Solvation_with_Atomic_Scale_Liquid_Structure.pdf
downloaded paper: ./corpus/The_Sanford_Underground_Research_Facility.pdf
downloading paper: ./corpus/Exact_enumeration_approach_to_first-passage_time_distribution_of_non-Markov_random_walks.pdf
downloading paper: ./corpus/Atmospheric_muons_as_an_imaging_tool.pdf
downloading paper: ./corpus/On_the_Self-Similarity_of_Natural_Stochastic_Textures.pdf
downloaded paper: ./corpus/Microscopic_dynamics_and_failure_precursors_of_a_gel_under_mechanical_load.pdf
downloade

In [None]:
# get a list of files downloaded to the directory to process
pdf_file_list = get_typed_files_in_directory(corpus_directory, "pdf")

In [None]:
# extract the text from the pdf files
for file in pdf_file_list:
    try:
        print(f"processing file {file}")
        text = extract_text_from_pdf(file)
        if text is not None and len(text) > 10:
            write_or_append_file(train_filename, preprocess_text(text))
        else:
            print(f"non text extract from file {file}")
    except:
        print(f"error with text extract from file {file}")

In [None]:
# Need to convert the .bin file to a .vec file
# this takes a while
isExist = os.path.exists(VECTORS_FILEPATH)
if not isExist:
    fasttext.util.download_model('en', if_exists='ignore')  # English
    ft = fasttext.load_model('cc.en.300.bin')
    
    lines=[]
    
    # get all words from model
    words = ft.get_words()
    
    with open(VECTORS_FILEPATH,'w') as file_out:
        
        # the first line must contain number of total words and vector dimension
        file_out.write(str(len(words)) + " " + str(ft.get_dimension()) + "\n")
    
        # line by line, you append vectors to VEC file
        for w in words:
            v = ft.get_word_vector(w)
            vstr = ""
            for vi in v:
                vstr += " " + str(vi)
            try:
                file_out.write(w + vstr+'\n')
            except:
                pass

In [None]:
# train (fine-tune) the model
model_ft = fasttext.train_supervised(input=train_filename, lr=.1, epoch=200,
                             wordNgrams=max_ngrams, bucket=200000, dim=embedding_size, 
                             pretrainedVectors=VECTORS_FILEPATH)

In [None]:
model_ft.save_model(MODEL_NAME)

In [None]:
model_ft = fasttext.load_model(MODEL_NAME)

In [None]:
words = model_ft.get_words()

In [None]:
# Get the word vectors
vectors = [model_ft.get_word_vector(word) for word in words]

In [None]:
# Reduce dimensions using UMAP
umap_reducer = umap.UMAP(n_components=2, random_state=42)
reduced_vectors = umap_reducer.fit_transform(vectors)

In [None]:
# Plot the words using the 2-dimensional vectors from UMAP
plt.figure(figsize=(12, 12))

for i, word in enumerate(words):
    plt.scatter(reduced_vectors[i, 0], reduced_vectors[i, 1])
    plt.annotate(word, (reduced_vectors[i, 0], reduced_vectors[i, 1]), fontsize=10)

plt.title('2D Visualization of Word Vectors using UMAP')
plt.xlabel('UMAP X')
plt.ylabel('UMAP Y')
plt.show()