## Creating Word Embedding Models for 4 decades

Computational Literature Review

Creator: Jaren Haber, PhD, Nancy Xu

Date created: February 15, 2022

Date last modified: November 11, 2022

This notebook preprocesses training texts, and creates word2vec embedding models for 4 decades (1970-1979,1980-1989,1990-1999,2000-2015)

## Load datasets

In [42]:
import pickle
import re
from tqdm import tqdm

import os
import random as rand

import numpy as np
import pandas as pd
# from clean_text import stopwords_make, punctstr_make, unicode_make, apache_tokenize, clean_sentence_apache 

import matplotlib.pyplot as plt

import numpy as np
import re
import random

from collections import Counter

## load training data -  these files keep stopwords because Longformer and BERT expect such inputs

# def open_test_data(path):
#     return open(path, 'rb')

# with open_test_data('/home/jovyan/work/models_storage/filtered_preprocessed_texts_65365_022621.pkl') as f:
#     full = pickle.load(f)



Modifies the jstor id to get consistent format.

In [2]:
def modify_jstor_id(x, col_name, https = False):
    '''
    modify jstor id to get the link in the form of www.jstor.org/stable/23057056
    '''
    
    good_parts = []
    if not https:
        for ii in x[col_name]:
            try: 
                good_parts.append(ii.split('http://')[1])
            except:
                good_parts.append(ii)
    else:
        for ii in x[col_name]:
            try: 
                good_parts.append(ii.split('https://')[1])
            except:
                good_parts.append(ii)
        
    return good_parts

Merge with the metadata files with the correct publish dates. 

In [3]:
# modify jstor id's

## combine the data for the correct article dates
dates = pd.read_csv('./sample/parts-1-3-metadata.csv')
date2 = pd.read_csv('./sample/part-4-metadata.csv')
date2.id = modify_jstor_id(date2, 'id')
dates.id = modify_jstor_id(dates,'id')

In [4]:
combo = pd.concat([dates, date2])

In [5]:
root = '/home/jovyan/work/'
meta_fp = root + '/dictionary_methods/code/metadata_combined.h5' 

df_meta = pd.read_hdf(meta_fp)
df_meta.reset_index(drop=False, inplace=True) # extract file name from index

# For merging purposes, get ID alone from file name, e.g. 'journal-article-10.2307_2065002' -> '10.2307_2065002'
df_meta['edited_filename'] = df_meta['file_name'].apply(lambda x: x[16:]) 
df_meta = df_meta[["edited_filename", "article_name", "jstor_url", "abstract", "journal_title", "given_names", "primary_subject", "year", "type"]] # keep only relevant columns

df_meta['id'] =  modify_jstor_id(df_meta,'jstor_url', True)


In [6]:
m = df_meta.merge(combo, on = 'id')

In [7]:
def get_doi(string):
    return string.split('-')[-1][:-4]

In [8]:
full['edited_filename'] = full['file_name'].apply(get_doi)

In [9]:
full.head()

Unnamed: 0,file_name,text,edited_filename
0,/vol_b/data/jstor_data/ocr/journal-article-10....,"[[research, note, church_membership, netherlan...",10.2307_1387034
1,/vol_b/data/jstor_data/ocr/journal-article-10....,"[[polish, io_oo, sociological_review, issn, co...",10.2307_41274754
2,/vol_b/data/jstor_data/ocr/journal-article-10....,"[[article, jjdlbsj, grapliy, compassionate, eg...",10.2307_24467156
3,/vol_b/data/jstor_data/ocr/journal-article-10....,"[[reply, allison, more, comparing, regression_...",10.2307_2782279
4,/vol_b/data/jstor_data/ocr/journal-article-10....,"[[determinants, spousal, interaction, marital,...",10.2307_351656


In [10]:
m.head()

Unnamed: 0,edited_filename,article_name,jstor_url,abstract,journal_title,given_names,primary_subject,year,type,id,...,url,creator,publisher,language,pageStart,pageEnd,placeOfPublication,wordCount,pageCount,file
0,10.2307_351312,Sex-Role Congruency and Marital Quality,https://www.jstor.org/stable/351312,Drawing upon a probability sample of 331 milit...,Journal of Marriage and Family,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,1976,research-article,www.jstor.org/stable/351312,...,http://www.jstor.org/stable/351312,Gary Lee Bowen; Dennis K. Orthner,Wiley,eng,223,230,,5495,8,part-2.jsonl.gz
1,10.2307_1171381,"Bosses, Machines, and Democratic Leadership: P...",https://www.jstor.org/stable/1171381,,Social Science History,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,1986,research-article,www.jstor.org/stable/1171381,...,http://www.jstor.org/stable/1171381,Philip R. Vandermeer,Cambridge University Press,eng,395,428,,12315,34,part-1.jsonl.gz
2,10.2307_20832283,"RECOGNIZING GENDER BIAS, REJECTING FEMINISM: A...",https://www.jstor.org/stable/20832283,This article explores the degree to which cler...,Sociological Focus,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,2006,research-article,www.jstor.org/stable/20832283,...,http://www.jstor.org/stable/20832283,SUSAN R. CODY,"Taylor & Francis, Ltd.",eng,37,53,,9241,17,part-1.jsonl.gz
3,10.2307_2096207,Survival Chances of Newly Founded Business Org...,https://www.jstor.org/stable/2096207,Human capital theory and organizational ecolog...,American Sociological Review,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,1991,research-article,www.jstor.org/stable/2096207,...,http://www.jstor.org/stable/2096207,Josef Brüderl; Peter Preisendörfer; Rolf Ziegler,American Sociological Association,eng,227,242,,10467,16,part-1.jsonl.gz
4,10.2307_2391724,Dimensions of Organizational Influence and The...,https://www.jstor.org/stable/2391724,"In this study participativeness, centralizatio...",Administrative Science Quarterly,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Management & Organizational Behavior,1971,research-article,www.jstor.org/stable/2391724,...,http://www.jstor.org/stable/2391724,Johannes M. Pennings,"Sage Publications, Inc.",eng,688,699,,6022,12,part-1.jsonl.gz


In [11]:
mm_full = full.merge(m, on = 'edited_filename', how = 'left')[['text', 'edited_filename', 'journal_title', 'publicationYear']]
mm_full = mm_full[~mm_full['publicationYear'].isna()]

In [12]:
mm_full.head()

Unnamed: 0,text,edited_filename,journal_title,publicationYear
0,"[[research, note, church_membership, netherlan...",10.2307_1387034,Journal for the Scientific Study of Religion,1990.0
1,"[[polish, io_oo, sociological_review, issn, co...",10.2307_41274754,Polish Sociological Review,2000.0
2,"[[article, jjdlbsj, grapliy, compassionate, eg...",10.2307_24467156,Ethnography,2014.0
3,"[[reply, allison, more, comparing, regression_...",10.2307_2782279,American Journal of Sociology,1995.0
4,"[[determinants, spousal, interaction, marital,...",10.2307_351656,Journal of Marriage and Family,1983.0


## Remove HTML

remove html tags in unstructured text data.

In [13]:
import itertools


def get_full_text(text):
    full_text=[]
    for i in text:
        joined = list(itertools.chain(*i))
        full_text.append(" ".join(joined))
    return full_text

mm_full['full_text'] = get_full_text(mm_full['text'])



def remove_tags(article):
    article = re.sub('<plain_text> <page sequence="1">', '', article)
    article = re.sub(r'</page>(\<.*?\>)', ' \n ', article)
    # xml tags
    article = re.sub(r'<.*?>', '', article)
    article = re.sub(r'<body.*\n\s*.*\s*.*>', '', article)
    return article

mm_full['text_no_tags'] = mm_full['full_text'].apply(remove_tags)


## Remove stop words, punctuations, lemmatize

In [14]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 5.3 MB/s eta 0:00:01
Collecting regex>=2021.8.3
  Downloading regex-2022.9.13-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (769 kB)
[K     |████████████████████████████████| 769 kB 100.7 MB/s eta 0:00:01
[?25hInstalling collected packages: regex, nltk
Successfully installed nltk-3.7 regex-2022.9.13


In [15]:
import nltk
nltk.download('words')

[nltk_data] Downloading package words to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [17]:
from nltk.corpus import stopwords
import datetime

In [18]:
# Prep dictionaries of English words
from nltk.corpus import words # Dictionary of 236K English words from NLTK
english_nltk = set(words.words()) # Make callable
english_long = set() # Dictionary of 467K English words from https://github.com/dwyl/english-words
# fname =  "english_words.txt" # Set file path to long english dictionary
# with open(fname, "r") as f:
#     for word in f:
#         english_long.add(word.strip())
        
def stopwords_make(vocab_path_old = "", extend_stopwords = False):
    """Create stopwords list. 
    If extend_stopwords is True, create larger stopword list by joining sklearn list to NLTK list."""
                                                     
    stop_word_list = list(set(stopwords.words("english"))) # list of english stopwords

    # Add dates to stopwords
    for i in range(1,13):
        stop_word_list.append(datetime.date(2008, i, 1).strftime('%B'))
    for i in range(1,13):
        stop_word_list.append((datetime.date(2008, i, 1).strftime('%B')).lower())
    for i in range(1, 2100):
        stop_word_list.append(str(i))

    # Add other common stopwords
    stop_word_list.append('00') 
    stop_word_list.extend(['mr', 'mrs', 'sa', 'fax', 'email', 'phone', 'am', 'pm', 'org', 'com', 
                           'Menu', 'Contact Us', 'Facebook', 'Calendar', 'Lunch', 'Breakfast', 
                           'facebook', 'FAQs', 'FAQ', 'faq', 'faqs']) # web stopwords
    stop_word_list.extend(['el', 'en', 'la', 'los', 'para', 'las', 'san']) # Spanish stopwords
    stop_word_list.extend(['angeles', 'diego', 'harlem', 'bronx', 'austin', 'antonio']) # cities with many charter schools

    # Add state names & abbreviations (both uppercase and lowercase) to stopwords
    states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 
              'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 
              'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 
              'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 
              'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WI', 'WV', 'WY', 
              'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 
              'Colorado', 'Connecticut', 'District of Columbia', 'Delaware', 'Florida', 
              'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 
              'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 
              'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 
              'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 
              'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 
              'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 
              'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 
              'Vermont', 'Virginia', 'Washington', 'Wisconsin', 'West Virginia', 'Wyoming' 
              'carolina', 'columbia', 'dakota', 'hampshire', 'mexico', 'rhode', 'york']
    for state in states:
        stop_word_list.append(state)
    for state in [state.lower() for state in states]:
        stop_word_list.append(state)
        
    # Add even more stop words:
    if extend_stopwords == True:
        stop_word_list = text.ENGLISH_STOP_WORDS.union(stop_word_list)
        
    # If path to old vocab not specified, skip last step and return stop word list thus far
    if vocab_path_old == "":
        return stop_word_list

    # Add to stopwords useless and hard-to-formalize words/chars from first chunk of previous model vocab (e.g., a3d0, \fs19)
    # First create whitelist of useful terms probably in that list, explicitly exclude from junk words list both these and words with underscores (common phrases)
    whitelist = ["Pre-K", "pre-k", "pre-K", "preK", "prek", 
                 "1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th", "11th", "12th", 
                 "1st-grade", "2nd-grade", "3rd-grade", "4th-grade", "5th-grade", "6th-grade", 
                 "7th-grade", "8th-grade", "9th-grade", "10th-grade", "11th-grade", "12th-grade", 
                 "1st-grader", "2nd-grader", "3rd-grader", "4th-grader", "5th-grader", "6th-grader", 
                 "7th-grader", "8th-grader", "9th-grader", "10th-grader", "11th-grader", "12th-grader", 
                 "1stgrade", "2ndgrade", "3rdgrade", "4thgrade", "5thgrade", "6thgrade", 
                 "7thgrade", "8thgrade", "9thgrade", "10thgrade", "11thgrade", "12thgrade", 
                 "1stgrader", "2ndgrader", "3rdgrader", "4thgrader", "5thgrader", "6thgrader", 
                 "7thgrader", "8thgrader", "9thgrader", "10thgrader", "11thgrader", "12thgrader"]
    with open(vocab_path_old) as f: # Load vocab from previous model
        junk_words = f.read().splitlines() 
    junk_words = [word for word in junk_words[:8511] if ((not "_" in word) 
                                                         and (not any(term in word for term in whitelist)))]
    stop_word_list.extend(junk_words)
                                                     
    return stop_word_list

In [20]:
stop_words = stopwords_make(vocab_path_old = "", extend_stopwords = False)

In [21]:
import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# stop_words = stopwords.words('english')
# stop_words.extend(['one', 'two', 'three', 'amp', 'may', 'can', 'new', 'also', 'and'])

import string
import re
import nltk

def word_process(tt):
    """
    helper function to lower text, remove stop words, numbers, and empty strings 
    """
    
    
    
    tt = tt.lower()
    
    punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~=\n'''
    # Removing punctuations in string 
    # Using loop + punctuation string 


    for ele in tt:  
        if ele in punc:  
            tt = tt.replace(ele, " ")  

    # read tokens
    tokens = tt.split()
    lst = [token.translate(punc).lower() for token in tokens ]
    
    #remove stop words
    filtered = []
    for i in lst:
        if i not in stop_words:
            filtered.append(i)
    
    # removing singular numbers and singular letters
    pattern = '[0-9]'
    filtered = [re.sub(pattern, '', i) for i in filtered] 
    new = []
    for inp in filtered:
        new.append(' '.join( [w for w in inp.split() if len(w)>1] ))
        
    # filter out empty strings 
    new = [i for i in new if i] 

    dt = [d.split() for d in new]
    
    return dt


In [22]:
from tqdm import tqdm
tqdm.pandas()

In [23]:
mm_full['processed'] =  mm_full['text_no_tags'].progress_apply(word_process)


mm_full['processed'] = [sum(i, []) for i in mm_full['processed']]

combo_train_df = mm_full

100%|██████████| 63038/63038 [2:36:12<00:00,  6.73it/s]   


## Split datasets into decades & save

split the dataset into 4 decades.

In [24]:
first_decade = combo_train_df[combo_train_df['publicationYear'] <= 1979]
second_decade = combo_train_df[(combo_train_df['publicationYear'] >= 1980) & (combo_train_df['publicationYear'] <= 1989) ]

third_decade = combo_train_df[(combo_train_df['publicationYear'] >= 1990) & (combo_train_df['publicationYear'] <= 1999) ]


fourth_decade = combo_train_df[(combo_train_df['publicationYear'] >= 2000) & (combo_train_df['publicationYear'] <= 2015) ]


first_decade.to_csv('first_decade.csv')
second_decade.to_csv('second_decade.csv')
third_decade.to_csv('third_decade.csv')
fourth_decade.to_csv('fourth_decade.csv')

## Preprocess text for each decade

### Remove surnames, use enchant library to filter out non-English words

In [25]:
os.getcwd()

'/home/jovyan/work/nancyxu'

In [26]:
first_decade = pd.read_csv('first_decade.csv')
second_decade = pd.read_csv('second_decade.csv')
third_decade = pd.read_csv('third_decade.csv')
fourth_decade = pd.read_csv('fourth_decade.csv')

In [27]:
import pandas as pd
import ast
import pickle

In [28]:
## remove surnames

my_file = open("surnames.txt", "r")
data = my_file.read()
surnames = data.split(",")
surnames = [i.replace("'", '').strip() for i in surnames]
my_file.close()

In [29]:
! pip install pyenchant

Collecting pyenchant
  Downloading pyenchant-3.2.2-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.9 MB/s eta 0:00:011
[?25hInstalling collected packages: pyenchant
Successfully installed pyenchant-3.2.2


In [30]:
import enchant
valid_d = enchant.Dict("en_US") 

In [31]:
def process_for_decade(decade_df, decade):
    """
    remove surnames, filter by enchant dictionary, and filter by word length
    """
    
    p = [ast.literal_eval(i) for i in decade_df['processed']]
    
    processed_again = []

    for i in tqdm(p):
        k = [j for j in i if j not in surnames]
        processed_again.append(k)
        
    processed_twice = []
    for i in tqdm(processed_again):
        k = [el for el in i if el.isalpha() and valid_d.check(el)]
        processed_twice.append(k)
    
#     for k in processed_twice:
#         processed_again2.append([i for i in k if len(i)>2])
    
    with open('processed_corp_enchant_' + decade +'.pkl', 'wb') as f:
        pickle.dump(processed_twice, f)
    
    return processed_twice

In [None]:
import pickle
with open('processed_corp_enchant.pkl', 'rb') as f:
    processed_first_decade = pickle.load(f)

In [None]:
import pickle
with open('processed_corp_enchant.pkl', 'rb') as f:
    processed_second_decade = pickle.load(f)

In [None]:
import pickle
with open('processed_corp_enchant.pkl', 'rb') as f:
    processed_second_decade = pickle.load(f)

In [33]:
processed_first_decade = process_for_decade(first_decade, 'first')

100%|██████████| 6677/6677 [2:13:59<00:00,  1.20s/it]  
100%|██████████| 6677/6677 [02:37<00:00, 42.34it/s]


In [34]:
processed_second_decade = process_for_decade(second_decade, 'second')


100%|██████████| 12222/12222 [4:39:49<00:00,  1.37s/it]  
100%|██████████| 12222/12222 [05:37<00:00, 36.26it/s]


In [None]:
processed_third_decade = process_for_decade(third_decade, 'third')


 60%|██████    | 9546/15832 [4:28:52<3:42:08,  2.12s/it] 

In [None]:
processed_fourth_decade = process_for_decade(fourth_decade, 'fourth')

## Train gensim phrased word2vec models

In [None]:
!pip install gensim

In [43]:
from gensim.models.phrases import Phrases

build bigrams to create phased w2v models.

In [44]:

from gensim.models.phrases import Phrases, Phraser
def build_phrases(sentences):
    phrases = Phrases(sentences, min_count = 5, threshold = 7, progress_per = 1000)
    return Phraser(phrases)

In [45]:
bigram1 = build_phrases(processed_first_decade)
# bigram2 = build_phrases(processed_second_decade)
# bigram3 = build_phrases(processed_third_decade)
# bigram4 = build_phrases(processed_fourth_decade)

In [46]:
processed_bigrams1 = [bigram1[i] for i in tqdm(processed_first_decade)]
# processed_bigrams2 = [bigram2[i] for i in processed_second_decade]
# processed_bigrams3 = [bigram3[i] for i in processed_third_decade]
# processed_bigrams4 = [bigram4[i] for i in processed_fourth_decade]

100%|██████████| 6677/6677 [00:20<00:00, 323.47it/s]


In [47]:
bigram2 = build_phrases(processed_second_decade)
bigram3 = build_phrases(processed_third_decade)
bigram4 = build_phrases(processed_fourth_decade)

In [48]:
processed_bigrams2 = [bigram2[i] for i in processed_second_decade]
processed_bigrams3 = [bigram3[i] for i in processed_third_decade]
processed_bigrams4 = [bigram4[i] for i in processed_fourth_decade]

In [49]:
processed_bigrams_final1 = []
processed_bigrams_final2 = []
processed_bigrams_final3 = []
processed_bigrams_final4 = []

## strip punctuations
for k in processed_bigrams1:
    processed_bigrams_final1.append([i.strip('!"“#$%&\'()*+,-./:;<=>?@[\\]^`{|}~…') for i in k if len(i)>2])

for k in processed_bigrams2:
    processed_bigrams_final2.append([i.strip('!"“#$%&\'()*+,-./:;<=>?@[\\]^`{|}~…') for i in k if len(i)>2])

for k in processed_bigrams3:
    processed_bigrams_final3.append([i.strip('!"“#$%&\'()*+,-./:;<=>?@[\\]^`{|}~…') for i in k if len(i)>2])

for k in processed_bigrams4:
    processed_bigrams_final4.append([i.strip('!"“#$%&\'()*+,-./:;<=>?@[\\]^`{|}~…') for i in k if len(i)>2])

In [50]:
import multiprocessing
from sklearn import utils
cores = multiprocessing.cpu_count()
import gensim
from gensim.test.utils import get_tmpfile

create one model for each decade.

In [51]:
model_decade_1 = gensim.models.Word2Vec(processed_bigrams_final1, vector_size = 300, window = 10,
                                       min_count = 5, sg = 1, alpha = 0.05, epochs = 50, 
                                       batch_words = 10000, workers = cores, seed = 0, negative = 5,ns_exponent = 0.75)



fname = "word2vec_phrased_filtered_enchant_300d_1970_1979_2022_oct30.bin"
model_decade_1.save(fname)
print("Model Saved!")

Model Saved!


In [52]:
model_decade_2 = gensim.models.Word2Vec(processed_bigrams_final2, vector_size = 300, window = 10,
                                       min_count = 5, sg = 1, alpha = 0.05, epochs = 50, 
                                       batch_words = 10000, workers = cores, seed = 0, negative = 5,ns_exponent = 0.75)



fname = "word2vec_phrased_filtered_enchant_300d_1980_1989_2022_oct30.bin"
model_decade_2.save(fname)
print("Model Saved!")

Model Saved!


In [53]:
model_decade_3 = gensim.models.Word2Vec(processed_bigrams_final3, vector_size = 300, window = 10,
                                       min_count = 5, sg = 1, alpha = 0.05, epochs = 50, 
                                       batch_words = 10000, workers = cores, seed = 0, negative = 5,ns_exponent = 0.75)



fname = "word2vec_phrased_filtered_enchant_300d_1990_1999_2022_oct30.bin"
model_decade_3.save(fname)
print("Model Saved!")

Model Saved!


In [54]:
model_decade_4 = gensim.models.Word2Vec(processed_bigrams_final4, vector_size = 300, window = 10,
                                       min_count = 5, sg = 1, alpha = 0.05, epochs = 50, 
                                       batch_words = 10000, workers = cores, seed = 0, negative = 5,ns_exponent = 0.75)



fname = "word2vec_phrased_filtered_enchant_300d_2000_2016_2022_oct30.bin"
model_decade_4.save(fname)
print("Model Saved!")

Model Saved!
