In [1]:
import numpy as np
import pandas as pd
import selenium
from selenium import webdriver
import bs4
from bs4 import BeautifulSoup
import time
import re
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from collections import defaultdict

In [3]:
# Extraction of abstracts from urls and writing to a single text file
def extractor(url,driver,wait_time):
    """
    Loads a url in the supplied webdriver and returns the text of its abstract.

    Arguments:
    url - address of the article page to load
    driver - an already-started selenium webdriver; it is reused, not replaced,
             so the caller stays responsible for calling driver.quit()
    wait_time - seconds to sleep after requesting the page, before reading its HTML

    Returns:
    The text content of the first <div> whose class is "Abstracts u-font-serif".

    Raises:
    ValueError - if the loaded page has no such <div>
    """
    # Reuse the caller's browser session. Launching a new Chrome instance here
    # (as was done previously) shadowed the argument and leaked one browser
    # process per url, because the new instance was never quit.
    driver.get(url)
    time.sleep(wait_time) # important: the page is read only after this pause

    html_doc = driver.page_source # HTML of the page as currently held by the driver
    soup = BeautifulSoup(html_doc, 'html.parser')
    abstract_div = soup.find('div', {'class':"Abstracts u-font-serif"})
    if abstract_div is None:
        # soup.find returns None when nothing matches; fail with the url in the
        # message instead of an opaque AttributeError on None.text
        raise ValueError("No abstract <div> found on page: " + str(url))

    return abstract_div.text

def parse_all(max_iters,driver,data,file=None):
    """
    Parses the first max_iters urls in data and writes each extracted abstract,
    followed by a newline, to an output text file.

    Arguments:
    max_iters - total number of scraped urls to be parsed
    driver - desired webdriver
    data - table of scraped urls; the url is read from column 0 of row i via data.iloc[i,0]
    file - optional writable text file object to store the abstracts in. When
           omitted, the module-level variable named `file` is used, which keeps
           existing three-argument calls working.

    Raises:
    NameError - if no file is passed and no module-level `file` exists
    """
    out_file = file if file is not None else globals().get('file')
    if out_file is None:
        raise NameError("parse_all needs an output file: pass file=... or define a module-level `file`")

    for i in range(0,max_iters):
        print('On url ',i)
        driver.refresh()
        time.sleep(2)
        # extractor returns the abstract text for the url in row i
        abstract = str(extractor(data.iloc[i,0],driver,3))
        out_file.write(abstract)
        out_file.write('\n')
  

# Read the url list and ask for the output name first, so that a missing CSV or
# an aborted prompt does not leave a browser window behind.
data = pd.read_csv('corrosion_inhib_2s.txt',header=None,names=['url']) # The local file containing a list of scraped urls
max_iters = len(data)
print("The parser will parse: " + str(max_iters) + " urls.")
file_name = input("Input the file name with .txt extension you wish to store extracted data in: ")

driver = webdriver.Chrome()
try:
    # `file` must keep this exact module-level name: parse_all writes to it.
    # The with-block guarantees the abstracts are flushed and the handle closed,
    # even if a url fails part-way through.
    with open(file_name,'w') as file:
        parse_all(max_iters,driver,data)
finally:
    # Always shut the browser down, including when parsing raised.
    driver.quit()

The parser will parse: 20 urls.
Input the file name with .txt extension you wish to store extracted data in: sd_abstracts.txt
On url  0
On url  1
On url  2
On url  3
On url  4
On url  5
On url  6
On url  7
On url  8
On url  9
On url  10
On url  11
On url  12
On url  13
On url  14
On url  15
On url  16
On url  17
On url  18
On url  19


In [3]:
# Reformats/cleans text: one abstract per line of the corpus file
with open("562_corrosion_abstracts.txt") as corpus_file:
    corpus = corpus_file.readlines()

# Lowercase, replace every non-letter with a space, then collapse runs of whitespace
processed_abstracts = [w.lower() for w in corpus]
processed_abstracts = [re.sub('[^a-zA-Z]', ' ', w) for w in processed_abstracts]
processed_abstracts = [re.sub(r'\s+', ' ', w) for w in processed_abstracts]

tokens = [nltk.word_tokenize(sent) for sent in processed_abstracts]

# Removal of unnecessary stopwords, like "of", "and", "the", etc.
# The stopword list is built once, as a set, instead of being re-evaluated
# for every single token; the filtered result is unchanged.
english_stopwords = set(stopwords.words('english'))
tokens = [[w for w in abstract_tokens if w not in english_stopwords] for abstract_tokens in tokens]
print(tokens)

[['abstractinvestigations', 'carried', 'properties', 'coatings', 'differing', 'pigmentation', 'binder', 'applied', 'different', 'chemical', 'pre', 'treatments', 'steel', 'surface', 'paints', 'based', 'alkyd', 'alkyd', 'melamine', 'binders', 'pigmented', 'zinc', 'phosphate', 'modified', 'basic', 'zinc', 'phosphate', 'applied', 'amorphous', 'crystalline', 'phosphated', 'steel', 'surface', 'comparison', 'purpose', 'degreased', 'steel', 'surface', 'effect', 'binder', 'pigment', 'pre', 'treatment', 'steel', 'surface', 'protective', 'properties', 'coatings', 'determined', 'measurements', 'adhesion', 'water', 'absorption', 'water', 'permeability', 'results', 'obtained', 'salt', 'spray', 'prohesion', 'tests', 'coatings', 'based', 'alkyd', 'binder', 'show', 'lower', 'damage', 'degree', 'good', 'retention', 'adhesion', 'corrosion', 'conditions', 'spite', 'higher', 'water', 'absorption', 'water', 'permeability', 'lower', 'initial', 'adhesive', 'strength', 'protective', 'properties', 'coatings', '

In [11]:
# Trains Word2Vec on the tokenised abstracts
w2v_options = {'size': 100, 'min_count': 2, 'iter': 10}
model = Word2Vec(tokens, **w2v_options)
# Handle on the trained model's vocabulary, used by the similarity search
vocabulary = model.wv.vocab

In [12]:
# For every token in the vocabulary, adds up its similarity to each search term
search_term = ['corrosion','inhibition'] #eventually user input
store = defaultdict(int)
for vocab_word in vocabulary:
    for word in search_term:
        store[vocab_word] += model.wv.similarity(word,vocab_word)

# Ranks the (token, summed score) pairs from highest to lowest score
cos_scores = list(store.items())
cos_scores.sort(key=lambda pair: pair[1], reverse=True)

In [14]:
# Test comparing one abstract to trained word2vec
testurl = 'https://www-sciencedirect-com.offcampus.lib.washington.edu/science/article/pii/S0300944005000809'
driver = webdriver.Chrome()
try:
    driver.get(testurl)
    time.sleep(3)
    html_doc2 = driver.page_source # HTML of the page as currently held by the driver
finally:
    # The browser is only needed to fetch the page; close it even if loading fails
    driver.quit()

soup = BeautifulSoup(html_doc2, 'html.parser')
abstract_div = soup.find('div', {'class':"Abstracts u-font-serif"})
if abstract_div is None:
    # soup.find returns None when nothing matches; report the url rather than
    # failing with an AttributeError on None.text
    raise ValueError("No abstract <div> found on page: " + testurl)
target_abstract = abstract_div.text

# Same cleaning as the corpus: lowercase, letters only, single spaces
test_abstract = target_abstract.lower()
test_abstract = re.sub('[^a-zA-Z]', ' ', test_abstract)
test_abstract = re.sub(r'\s+', ' ', test_abstract)

abstract_tokens = nltk.word_tokenize(test_abstract)

# Build the stopword set once instead of re-evaluating the list for every token
english_stopwords = set(stopwords.words('english'))
final_abstract_tokens = [tkn for tkn in abstract_tokens if tkn not in english_stopwords]
print(final_abstract_tokens)

['abstractinvestigations', 'carried', 'properties', 'coatings', 'differing', 'pigmentation', 'binder', 'applied', 'different', 'chemical', 'pre', 'treatments', 'steel', 'surface', 'paints', 'based', 'alkyd', 'alkyd', 'melamine', 'binders', 'pigmented', 'zinc', 'phosphate', 'modified', 'basic', 'zinc', 'phosphate', 'applied', 'amorphous', 'crystalline', 'phosphated', 'steel', 'surface', 'comparison', 'purpose', 'degreased', 'steel', 'surface', 'effect', 'binder', 'pigment', 'pre', 'treatment', 'steel', 'surface', 'protective', 'properties', 'coatings', 'determined', 'measurements', 'adhesion', 'water', 'absorption', 'water', 'permeability', 'results', 'obtained', 'salt', 'spray', 'prohesion', 'tests', 'coatings', 'based', 'alkyd', 'binder', 'show', 'lower', 'damage', 'degree', 'good', 'retention', 'adhesion', 'corrosion', 'conditions', 'spite', 'higher', 'water', 'absorption', 'water', 'permeability', 'lower', 'initial', 'adhesive', 'strength', 'protective', 'properties', 'coatings', 'f

In [15]:
search_term = ['corrosion','inhibition']
store = defaultdict(int)

# Only tokens the trained model knows can be scored; model.wv.similarity raises
# KeyError for a word that is not in the model's vocabulary, so unknown tokens
# are skipped instead of aborting the cell.
known_tokens = [tkn for tkn in final_abstract_tokens if tkn in model.wv]

# Every occurrence of a token adds to its total, so a token that is repeated in
# the abstract accumulates a higher score (the score is frequency-weighted and
# can exceed the number of search terms).
for word in search_term:
    for tkn in known_tokens:
        store[tkn] += model.wv.similarity(word,tkn)
# Orders dictionary from highest to lowest cosine similarity score
cos_scores = sorted(store.items() , reverse=True, key=lambda x: x[1])
# Extracts top 20 most similar tokens
cos_scores[:20]

[('steel', 9.94486153125763),
 ('phosphate', 7.890325486660004),
 ('surface', 7.579369068145752),
 ('coatings', 7.461742877960205),
 ('properties', 6.5272040367126465),
 ('pre', 6.478586912155151),
 ('protective', 5.587923288345337),
 ('zinc', 5.472507119178772),
 ('binder', 5.123452126979828),
 ('alkyd', 4.3048460483551025),
 ('crystalline', 4.157286465167999),
 ('pigment', 3.448456048965454),
 ('substrate', 3.1605554819107056),
 ('chemical', 3.12239933013916),
 ('adhesion', 3.046516180038452),
 ('modified', 2.9222697019577026),
 ('lower', 2.896170735359192),
 ('pigmented', 2.846609115600586),
 ('applied', 2.841576099395752),
 ('permeability', 2.8201870918273926)]