In [1]:
import numpy as np
import pandas as pd
import selenium
from selenium import webdriver
import bs4
from bs4 import BeautifulSoup
import time
import re
import nltk
from gensim.models import Word2Vec
from collections import defaultdict

In [3]:
# Extraction of abstracts from urls and writing to a single text file
def extractor(url,driver,wait_time):
    """
    Loads a url in the supplied Selenium driver and returns the text of its abstract.

    Arguments:
    url - address of the article page to open
    driver - an already-started Selenium webdriver; it is reused for the request
    wait_time - seconds to sleep after driver.get() before the page source is read

    Returns:
    The text of the first <div class="Abstracts u-font-serif"> element found in the
    page source, or an empty string when the page contains no such element.
    """
    # Reuse the caller's browser. Creating a new webdriver.Chrome() here would open
    # an extra browser for every url and never quit it.
    driver.get(url)
    time.sleep(wait_time) # important: pause before the HTML is captured

    html_doc = driver.page_source # HTML of the page currently loaded in the driver
    soup = BeautifulSoup(html_doc, 'html.parser')

    # find() returns None when nothing matches; guard so one page without an
    # abstract does not abort a whole scraping run with an AttributeError.
    abstract_div = soup.find('div', {'class':"Abstracts u-font-serif"})
    if abstract_div is None:
        return ''

    return abstract_div.text

def parse_all(max_iters,driver,data,file=None):
    """
    Extracts the abstract of each scraped url and writes it to a text file,
    one abstract per line.

    Arguments:
    max_iters - total number of scraped urls to be parsed (rows 0..max_iters-1 of data)
    driver - desired webdriver, passed on to extractor() for every url
    data - DataFrame whose first column holds the scraped urls
    file - open, writable text file to receive the abstracts. Optional: when
           omitted, the module-level variable named `file` is used, which keeps
           existing three-argument calls working.

    Raises:
    NameError - if no file is passed and no module-level `file` exists
    """
    if file is None:
        # The parameter shadows the global of the same name, so look it up explicitly.
        file = globals().get('file')
        if file is None:
            raise NameError("parse_all needs an open output file: pass file=... "
                            "or define a module-level variable named 'file'")

    for i in range(0,max_iters):
        print('On url ',i)
        driver.refresh()
        time.sleep(2)
        # str() guards against extractor returning a non-string value
        abstract = str(extractor(data.iloc[i,0],driver,3))
        file.write(abstract)
        file.write('\n')
  

data = pd.read_csv('corrosion_inhib_2s.txt',header=None,names=['url']) # The local file containing a list of scraped urls
max_iters = len(data)
print("The parser will parse: " + str(max_iters) + " urls.")
file_name = input("Input the file name with .txt extension you wish to store extracted data in: ")

# Start the browser only once the inputs are known, and always shut it down,
# even if a page fails to parse part-way through the run.
driver = webdriver.Chrome()
try:
    # The with-block closes (and therefore flushes) the output file; without it
    # the last abstracts can remain in the write buffer and never reach disk.
    # parse_all picks up the module-level name `file` bound here.
    with open(file_name,'w') as file:
        parse_all(max_iters,driver,data)
finally:
    driver.quit()

The parser will parse: 20 urls.
Input the file name with .txt extension you wish to store extracted data in: sd_abstracts.txt
On url  0
On url  1
On url  2
On url  3
On url  4
On url  5
On url  6
On url  7
On url  8
On url  9
On url  10
On url  11
On url  12
On url  13
On url  14
On url  15
On url  16
On url  17
On url  18
On url  19


In [8]:
# Read the abstracts (one per line) and normalise each line:
# lower-case it, turn every non-letter into a space, then collapse whitespace runs.
with open("sd_abstracts.txt") as file:
    corpus = file.readlines()

processed_abstracts = [
    re.sub(r'\s+', ' ', re.sub('[^a-zA-Z]', ' ', line.lower()))
    for line in corpus
]
    
def tokenize(sentences):
    """
    Splits each sentence into a list of word tokens with nltk.word_tokenize.

    Arguments:
    sentences - iterable of strings to tokenize

    Returns:
    A list with one token list per input string, in the same order.
    """
    # Tokenize the argument itself rather than a module-level variable, so the
    # result depends only on what the caller passes in.
    tokens = [nltk.word_tokenize(sent) for sent in sentences]
    return tokens

# Removal of unnecessary stopwords, like "of", "and", "the", etc.
from nltk.corpus import stopwords

# Build the token lists in this cell so it does not rely on a `tokens` variable
# left over from an earlier kernel session.
tokens = tokenize(processed_abstracts)

# Load the stopword list once into a set instead of calling stopwords.words()
# for every single token; set membership tests are also constant-time.
english_stopwords = set(stopwords.words('english'))
for i in range(len(tokens)):
    tokens[i] = [w for w in tokens[i] if w not in english_stopwords]
print(tokens)

[['highlights', 'ppm', 'amp', 'marginal', 'effect', 'corrosion', 'x', 'steel', 'corrosion', 'rate', 'decreased', 'amp', 'concentration', 'increased', 'flow', 'found', 'accelerate', 'propagation', 'pits', 'combination', 'pz', 'amp', 'could', 'inhibit', 'pitting', 'corrosion', 'efficiently', 'amp', 'pz', 'inhibited', 'corrosion', 'owing', 'neutralization', 'adsorption', 'effects', 'abstractthe', 'corrosion', 'steels', 'supercritical', 'co', 'environment', 'impurities', 'carbon', 'capture', 'utilization', 'storage', 'system', 'attracted', 'great', 'interests', 'recent', 'years', 'work', 'corrosion', 'inhibition', 'mechanisms', 'residual', 'amino', 'methyl', 'propanol', 'piperazine', 'co', 'capture', 'process', 'x', 'steel', 'within', 'impure', 'supercritical', 'co', 'environment', 'investigated', 'weight', 'loss', 'method', 'surface', 'analysis', 'techniques', 'results', 'showed', 'small', 'amount', 'amino', 'methyl', 'propanol', 'marginal', 'effect', 'corrosion', 'steel', 'corrosion', 'r

In [14]:
# Passes all tokens to Word2Vec
# The number of training epochs is left at the library default of 5. Its keyword
# is `iter` in gensim 3.x but `epochs` in gensim 4.x, so spelling it out would
# tie this cell to one major version.
word2vec = Word2Vec(tokens, window=5, min_count=2, negative=5)

# gensim 4.x exposes the vocabulary as wv.key_to_index, gensim 3.x as wv.vocab.
# Both are dicts keyed by token, which is all the following cell relies on.
if hasattr(word2vec.wv, 'key_to_index'):
    vocabulary = word2vec.wv.key_to_index
else:
    vocabulary = word2vec.wv.vocab

In [15]:
# Scores each vocabulary token by the SUM of its cosine similarities to the search terms
search_term = ['corrosion','inhibition'] #eventually user input

# A search term that is not in the vocabulary (never seen, or dropped by min_count)
# makes wv.similarity raise KeyError. Skip such terms and report them instead.
known_terms = [word for word in search_term if word in vocabulary]
missing_terms = [word for word in search_term if word not in vocabulary]
if missing_terms:
    print("Search terms not in the Word2Vec vocabulary (skipped):", missing_terms)

store = defaultdict(int)
for word in known_terms:
    for vocab_word in vocabulary:
        store[vocab_word] += word2vec.wv.similarity(word,vocab_word)

# Orders (token, score) pairs from highest to lowest summed similarity.
# Note that the search terms themselves are part of the vocabulary and are ranked too.
cos_scores = sorted(store.items() , reverse=True, key=lambda x: x[1])

# Extracts top 20 most similar tokens
cos_scores[:20]

[('corrosion', 0.9540901556611061),
 ('inhibition', 0.9540901556611061),
 ('show', 0.49105095863342285),
 ('investigated', 0.47183704376220703),
 ('acid', 0.46097661554813385),
 ('nccm', 0.4189692437648773),
 ('mrow', 0.39715880155563354),
 ('studied', 0.38048961758613586),
 ('efficiency', 0.3727501630783081),
 ('better', 0.3636122830212116),
 ('sam', 0.36352553963661194),
 ('performance', 0.3593912348151207),
 ('amine', 0.35909971594810486),
 ('surface', 0.3480038493871689),
 ('electron', 0.3480012118816376),
 ('temperature', 0.34464122354984283),
 ('investigate', 0.3423885554075241),
 ('decrease', 0.34132278710603714),
 ('improved', 0.340078204870224),
 ('true', 0.330965980887413)]