In [1]:
import os
import string

import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

from gensim.models import Word2Vec
import umap

import holoviews as hv
import datashader as ds
from holoviews.operation.datashader import datashade, spread
from bokeh.models import HoverTool
hv.extension('bokeh')

import plotly.graph_objs as go

d:\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
d:\Anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shutt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Parameters
    ----------
    file_path : str or path-like
        Location of the PDF file; it is opened in binary mode.

    Returns
    -------
    str
        The per-page results of ``extract_text()`` joined with no separator.
        An empty string is returned for a document with no pages.
    """
    # The context manager guarantees the handle is closed even when parsing
    # fails; the text is fully materialised before the file is released.
    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` keeps the join safe if a page yields a falsy result (e.g. None)
        # instead of a string.
        return "".join(page.extract_text() or "" for page in reader.pages)

def get_sentences_from_dir(directory):
    """Collect the sentences of every ``.pdf`` file directly inside a folder.

    Parameters
    ----------
    directory : str
        Folder to scan. Only its immediate entries are examined, and only
        names ending in the lower-case suffix ``.pdf`` are read.

    Returns
    -------
    list
        Sentences produced by ``sent_tokenize`` for each PDF's extracted text,
        concatenated in the order ``os.listdir`` yields the files.
    """
    pdf_paths = (
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith(".pdf")
    )
    return [
        sentence
        for pdf_path in pdf_paths
        for sentence in sent_tokenize(extract_text_from_pdf(pdf_path))
    ]

def process_text(sentences):
    """Turn sentences into lists of lower-cased, purely alphabetic tokens.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to tokenise with ``word_tokenize``.

    Returns
    -------
    list of list of str
        One token list per input sentence, in input order. A sentence with
        no alphabetic tokens contributes an empty list.
    """
    strip_punct = str.maketrans('', '', string.punctuation)

    def clean(sentence):
        # lower-case -> drop punctuation characters -> keep alphabetic tokens only
        lowered = (token.lower() for token in word_tokenize(sentence))
        bare = (token.translate(strip_punct) for token in lowered)
        return [token for token in bare if token.isalpha()]

    return [clean(sentence) for sentence in sentences]

def remove_punctuation(sentence):
    """Return ``sentence`` with every character of ``string.punctuation`` removed.

    Characters outside ``string.punctuation`` (letters, digits, whitespace,
    non-ASCII symbols) are left untouched.
    """
    # Deletion-only translation table: each punctuation character maps to None
    return sentence.translate(str.maketrans('', '', string.punctuation))

In [5]:
# Folder scanned for PDF files (presumably the URTeC 2023 papers, going by the path).
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
directory_path = "D:\\repos\\urtec\\2023"
# Raw sentences from every .pdf found directly in that folder
sentences = get_sentences_from_dir(directory_path)
# One list of lower-cased, alphabetic-only tokens per sentence
processed_text = process_text(sentences)
# Last expression: displays the entire nested token list as the cell output
processed_text

[['urtec',
  'gas',
  'hydrate',
  'structure',
  'evolution',
  'revealed',
  'by',
  'coarsening',
  'molecular',
  'dynamics',
  'simulation',
  'hao',
  'xiong',
  'hao',
  'jing'],
 [],
 ['yale', 'university'],
 ['hohai'],
 ['colorado',
  'school',
  'of',
  'mines',
  'copyright',
  'unconventional',
  'resources',
  'technology',
  'conference',
  'urtec',
  'doi',
  'this',
  'paper',
  'was',
  'prepared',
  'for',
  'presentation',
  'at',
  'the',
  'unconventional',
  'resources',
  'technology',
  'conference',
  'held',
  'in',
  'denver',
  'colorado',
  'usa',
  'june'],
 ['the',
  'urtec',
  'technical',
  'program',
  'committee',
  'accepted',
  'this',
  'presentation',
  'on',
  'the',
  'basis',
  'of',
  'information',
  'contained',
  'in',
  'an',
  'abstract',
  'submitted',
  'by',
  'the',
  'author',
  's'],
 ['the',
  'contents',
  'of',
  'this',
  'paper',
  'have',
  'not',
  'been',
  'reviewed',
  'by',
  'urtec',
  'and',
  'urtec',
  'does',
  'not'

In [6]:
# Build the Word2Vec model from the tokenised sentences in processed_text.
# NOTE(review): no seed is passed and 10 workers are used, so the learned
# vectors presumably vary between runs — confirm whether that matters.
custom_model = Word2Vec(
    processed_text,
    vector_size=300,
    window=7,
    min_count=3,
    epochs=200,
    workers=10,
)

In [14]:
# Write the model to 'urtec.model' (relative to the current working directory)
# so it can be loaded again without repeating the training cell.
custom_model.save('urtec.model')

In [15]:
# Sanity check: the 10 entries most similar to 'frac' in the trained embedding
custom_model.wv.most_similar('frac', topn=10)

[('fracture', 0.37705597281455994),
 ('well', 0.3348378539085388),
 ('completion', 0.3298324644565582),
 ('prefrac', 0.3199969530105591),
 ('stimulation', 0.31596657633781433),
 ('diminish', 0.2915516793727875),
 ('rac', 0.2768298387527466),
 ('fracturing', 0.27543920278549194),
 ('parent', 0.27295905351638794),
 ('fractures', 0.2691037952899933)]

In [3]:
# Rebind custom_model to the model stored in 'urtec.model', which lets a fresh
# session skip the training cell as long as that file already exists.
custom_model = Word2Vec.load('urtec.model')

In [4]:
# Same 'frac' query as after training, now run against the reloaded model;
# matching results indicate the save/load round trip preserved the vectors.
custom_model.wv.most_similar('frac', topn=10)

[('fracture', 0.37705597281455994),
 ('well', 0.3348378539085388),
 ('completion', 0.3298324644565582),
 ('prefrac', 0.3199969530105591),
 ('stimulation', 0.31596657633781433),
 ('diminish', 0.2915516793727875),
 ('rac', 0.2768298387527466),
 ('fracturing', 0.27543920278549194),
 ('parent', 0.27295905351638794),
 ('fractures', 0.2691037952899933)]

In [5]:
# Embedding matrix of the model: one row per vocabulary entry
word_vectors = custom_model.wv.vectors

# Project the vectors to 2-D with UMAP. UMAP is stochastic, so random_state is
# pinned to make the layout (and the scatter plot built from it) reproducible
# when the notebook is re-run.
umap_data = umap.UMAP(
    n_neighbors=5,
    min_dist=0.3,
    n_components=2,
    random_state=42,
).fit_transform(word_vectors)

In [8]:
# Figure heading and axis titles
layout = go.Layout(
    title='Word Embeddings',
    xaxis={'title': 'UMAP Dimension 1'},
    yaxis={'title': 'UMAP Dimension 2'},
)

# One marker per row of the 2-D projection. The hover label comes from
# index_to_key, presumably in the same row order as the projected vectors.
trace = go.Scatter(
    x=umap_data[:, 0],
    y=umap_data[:, 1],
    mode='markers',
    text=custom_model.wv.index_to_key,
    marker={'color': '#FFBAD2', 'line': {'width': 1}},
)

# Assemble and render the figure
fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [14]:
# Query words whose neighbourhoods are compared in the plot
words = ['feynman', 'physics', 'data', 'intelligence']

# Top-20 most similar entries for each query word, as (word, similarity) pairs
similar_words = [custom_model.wv.most_similar(word, topn=20) for word in words]

# Flat label list: all neighbours first (grouped by query word, in query order),
# then the query words themselves at the very end
all_words = [word for similar in similar_words for word, _ in similar] + words

# Embedding vectors in the same order as all_words
all_vectors = custom_model.wv[all_words]

# Reduce to 2-D; random_state pins the stochastic layout so the figure is
# reproducible on re-run.
# NOTE(review): this rebinds umap_data, replacing the full-vocabulary projection
# computed in the earlier cell.
umap_data = umap.UMAP(
    n_neighbors=5,
    min_dist=0.3,
    n_components=2,
    random_state=42,
).fit_transform(all_vectors)

# One trace per query word, holding that word's neighbours plus the word itself.
# Rows are looked up from the real layout of all_words (neighbour blocks first,
# query words last); slicing fixed 21-row windows would mix neighbours of
# different query words and put every query word in the last trace.
traces = []
first_query_row = len(all_words) - len(words)
group_start = 0
for i, (word, similar) in enumerate(zip(words, similar_words)):
    group_stop = group_start + len(similar)
    rows = list(range(group_start, group_stop)) + [first_query_row + i]
    group_start = group_stop

    trace = go.Scatter(
        x = umap_data[rows, 0],
        y = umap_data[rows, 1],
        mode = 'markers',
        name = word,
        text = [all_words[row] for row in rows],  # This will be displayed when a point is hovered over
        marker = dict(
            size = 10,
            line = dict(width = 1)
        )
    )
    traces.append(trace)

# Define the layout
layout = go.Layout(
    title='Word Embeddings',
    xaxis=dict(title='UMAP Dimension 1'),
    yaxis=dict(title='UMAP Dimension 2')
)

# Define the figure
fig = go.Figure(data=traces, layout=layout)

# Render the plot
fig.show()

In [13]:
# Vector-arithmetic query: entries closest to 'machine' + 'learning' once
# 'black' and 'box' are subtracted; the 20 best matches are kept.
similar_words = custom_model.wv.most_similar(
    positive=['machine', 'learning'],
    negative=['black', 'box'],
    topn=20,
)

# Print each match as "<word> <similarity>" on its own line
for match in similar_words:
    print(*match)

datadriven 0.26629921793937683
intelligence 0.26172393560409546
forecasting 0.2561148405075073
recognition 0.2520157992839813
neural 0.24715037643909454
automatic 0.24380120635032654
multivariate 0.241044819355011
realtime 0.24044063687324524
yuewei 0.2363181710243225
mathematica 0.23421600461006165
ml 0.2318514734506607
engineering 0.22944073379039764
novel 0.22687287628650665
physics 0.22576506435871124
traditional 0.22433049976825714
automation 0.22026363015174866
ness 0.21803973615169525
tried 0.21603770554065704
empirical 0.21584351360797882
optimization 0.21554790437221527
