## Creating Word Embedding Models for 4 decades

Computational Literature Review

Creator: Jaren Haber, PhD, Nancy Xu

Date created: February 15, 2022

Date last modified: November 11, 2022

This notebook preprocesses training texts, and creates word2vec embedding models for 4 decades (1970-1979,1980-1989,1990-1999,2000-2015)

## Load datasets

In [1]:
import pickle
import re
from tqdm.notebook import trange, tqdm
tqdm.pandas()

import os
import random as rand

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import numpy as np
import re
import random

from collections import Counter

## Test orgs filter

In [2]:
orgs_dict = pd.read_csv('../../dictionary_methods/dictionaries/core/orgs.csv', delimiter = '\n', 
                        header=None)[0].tolist()
orgs_dict

['organization',
 'organizational',
 'organizations',
 'firm',
 'firms',
 'association',
 'associations',
 'employer',
 'employing',
 'employment',
 'bureaucracy',
 'bureaucracies',
 'bureaucratic',
 'office',
 'offices',
 'bureau',
 'bureaus',
 'department',
 'departments',
 'departmental',
 'subunit',
 'subunits']

In [3]:
def read_text(file_path, return_string = True, shell = False):
    """Loads text into memory, either as str or as list. Must be assigned to object.
    
    Args: 
        file_path: Path to file (str)
        return_string: boolean indicating whether to return as string format (instead of list)
        shell: boolean indicating if function is called from command line
    
    Returns: 
        str if return_string, else list
    """
    
    if shell: 
        
        with open(file_path, 'r') as file_handler:
            text = file_handler.read()
        
        return text
    
    if return_string:
        
        textstr = '' # empty string
        
        with open(file_path) as file_handler:
            line = file_handler.readline()
            while line:
                textstr += line
                line = file_handler.readline()

        return textstr
        
    else: # return list of text
        
        textlist = [] # empty list
    
        with open(file_path) as file_handler:
            line = file_handler.readline()
            while line:
                textlist.append(line)
                line = file_handler.readline()

        return textlist

In [4]:
# Load raw texts
article_paths_fp = '../../classification/data/filtered_length_article_paths.csv' # List of article file paths

cwd = os.getcwd()
root = str.replace(cwd, 'embeddings/word2vec', '')

# Read full list of articles for new sample selection
tqdm.pandas(desc='Correcting file paths')
#print('Correcting file paths...')
articles = (pd.read_csv(article_paths_fp, low_memory=False, header=None, names=['file_name']))
articles['file_name'] = articles['file_name'].progress_apply(lambda fp: re.sub('/home/jovyan/work/', root, fp))
articles['edited_filename'] = articles['file_name'].apply(lambda fname: fname.split('-')[-1][:-4])

# Read text data from files
tqdm.pandas(desc='Loading ALL text files')
#print('Loading text files...')
articles['text'] = articles['file_name'].progress_apply(lambda fp: read_text(fp, shell = True))

Correcting file paths:   0%|          | 0/65365 [00:00<?, ?it/s]

Loading ALL text files:   0%|          | 0/65365 [00:00<?, ?it/s]

In [5]:
print(articles.shape)
articles.head(10)

(65365, 3)


Unnamed: 0,file_name,edited_filename,text
0,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_1387034,"<plain_text><page sequence=""1"">Research Note C..."
1,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_41274754,"<plain_text><page sequence=""1"">polish 2(i3o),o..."
2,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_24467156,"<plain_text><page sequence=""1"">Article ■jjDlBS..."
3,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_2782279,"<plain_text><page sequence=""1"">REPLY TO ALLISO..."
4,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_351656,"<plain_text><page sequence=""1"">Determinants of..."
5,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_41064725,"<plain_text><page sequence=""1"">wSÊ ■ IH OMPANY..."
6,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_4122891,"<plain_text><page sequence=""1"">ANDREW CHRISTEN..."
7,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_3053111,"<plain_text><page sequence=""1"">LAWYERS AND CON..."
8,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_3005887,"<plain_text><page sequence=""1"">Establishing a ..."
9,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_20832073,"<plain_text><page sequence=""1"">GUESS WHO'S COM..."


In [6]:
sample_df = articles.sample(500)
sample_df.shape

(500, 3)

In [7]:
orgsfilter = sample_df['text'].apply(lambda text: any(term in text for term in orgs_dict))
orgsfilter.value_counts()

True     477
False     23
Name: text, dtype: int64

In [8]:
sample_filtered_df = sample_df[orgsfilter]
print(sample_filtered_df.shape)
sample_filtered_df.head(10)

(477, 3)


Unnamed: 0,file_name,edited_filename,text
38579,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_40586778,"<plain_text><page sequence=""1"">THROUGH A GLASS..."
24153,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_23619174,"<plain_text><page sequence=""1"">CASTE, AGRARIAN..."
22685,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_3518142,"<plain_text><page sequence=""1"">KUMKUM SANGARI*..."
54535,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_20486961,"<plain_text><page sequence=""1"">Gender, Marital..."
40189,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_greemanainte.52.47,"<plain_text> <page sequence=""1""> gmi52smith.qx..."
19268,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_2486702,"<plain_text><page sequence=""1"">Strategic Manag..."
13768,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_40970594,"<plain_text><page sequence=""1"">A New Look at O..."
12780,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_42863683,"<plain_text><page sequence=""1"">Risk Perception..."
50939,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_2657290,"<plain_text><page sequence=""1"">TRADE GLOBALIZA..."
57885,/home/jovyan/work/jstor_data/ocr/journal-artic...,10.2307_40971814,"<plain_text><page sequence=""1"">Yu Jianrong Con..."


## Merge datasets

Modifies the jstor id to get consistent format.

In [9]:
def modify_jstor_id(x, col_name, https = False):
    '''
    modify jstor id to get the link in the form of www.jstor.org/stable/23057056
    '''
    
    good_parts = []
    if not https:
        for ii in x[col_name]:
            try: 
                good_parts.append(ii.split('http://')[1])
            except:
                good_parts.append(ii)
    else:
        for ii in x[col_name]:
            try: 
                good_parts.append(ii.split('https://')[1])
            except:
                good_parts.append(ii)
        
    return good_parts

Merge with the metadata files with the correct publish dates. 

In [10]:
# modify jstor id's

## combine the data for the correct article dates
dates = pd.read_csv('../../models_storage/preprocessed_texts/parts-1-3-metadata.csv')
date2 = pd.read_csv('../../models_storage/preprocessed_texts/part-4-metadata.csv')
date2.id = modify_jstor_id(date2, 'id')
dates.id = modify_jstor_id(dates,'id')

In [11]:
combo = pd.concat([dates, date2])

In [12]:
root = '/home/jovyan/work/'
meta_fp = root + '/dictionary_methods/code/metadata_combined.h5' 

df_meta = pd.read_hdf(meta_fp)
df_meta.reset_index(drop=False, inplace=True) # extract file name from index

# For merging purposes, get ID alone from file name, e.g. 'journal-article-10.2307_2065002' -> '10.2307_2065002'
df_meta['edited_filename'] = df_meta['file_name'].apply(lambda x: x[16:]) 
df_meta = df_meta[["edited_filename", "article_name", "jstor_url", "abstract", "journal_title", "given_names", "primary_subject", "year", "type"]] # keep only relevant columns

df_meta['id'] =  modify_jstor_id(df_meta,'jstor_url', True)


In [13]:
m = df_meta.merge(combo, on = 'id')

In [14]:
def get_doi(string):
    return string.split('-')[-1][:-4]

In [15]:
## load training data -  these files keep stopwords because Longformer and BERT expect such inputs

def open_test_data(path):
    return open(path, 'rb')

with open_test_data('/home/jovyan/work/models_storage/filtered_preprocessed_texts_65365_022621.pkl') as f:
    full = pickle.load(f)

In [16]:
full['edited_filename'] = full['file_name'].apply(get_doi)

In [17]:
full.head()

Unnamed: 0,file_name,text,edited_filename
0,/vol_b/data/jstor_data/ocr/journal-article-10....,"[[research, note, church_membership, netherlan...",10.2307_1387034
1,/vol_b/data/jstor_data/ocr/journal-article-10....,"[[polish, io_oo, sociological_review, issn, co...",10.2307_41274754
2,/vol_b/data/jstor_data/ocr/journal-article-10....,"[[article, jjdlbsj, grapliy, compassionate, eg...",10.2307_24467156
3,/vol_b/data/jstor_data/ocr/journal-article-10....,"[[reply, allison, more, comparing, regression_...",10.2307_2782279
4,/vol_b/data/jstor_data/ocr/journal-article-10....,"[[determinants, spousal, interaction, marital,...",10.2307_351656


In [18]:
m.head()

Unnamed: 0,edited_filename,article_name,jstor_url,abstract,journal_title,given_names,primary_subject,year,type,id,...,url,creator,publisher,language,pageStart,pageEnd,placeOfPublication,wordCount,pageCount,file
0,10.2307_351312,Sex-Role Congruency and Marital Quality,https://www.jstor.org/stable/351312,Drawing upon a probability sample of 331 milit...,Journal of Marriage and Family,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,1976,research-article,www.jstor.org/stable/351312,...,http://www.jstor.org/stable/351312,Gary Lee Bowen; Dennis K. Orthner,Wiley,eng,223,230,,5495,8,part-2.jsonl.gz
1,10.2307_1171381,"Bosses, Machines, and Democratic Leadership: P...",https://www.jstor.org/stable/1171381,,Social Science History,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,1986,research-article,www.jstor.org/stable/1171381,...,http://www.jstor.org/stable/1171381,Philip R. Vandermeer,Cambridge University Press,eng,395,428,,12315,34,part-1.jsonl.gz
2,10.2307_20832283,"RECOGNIZING GENDER BIAS, REJECTING FEMINISM: A...",https://www.jstor.org/stable/20832283,This article explores the degree to which cler...,Sociological Focus,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,2006,research-article,www.jstor.org/stable/20832283,...,http://www.jstor.org/stable/20832283,SUSAN R. CODY,"Taylor & Francis, Ltd.",eng,37,53,,9241,17,part-1.jsonl.gz
3,10.2307_2096207,Survival Chances of Newly Founded Business Org...,https://www.jstor.org/stable/2096207,Human capital theory and organizational ecolog...,American Sociological Review,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Sociology,1991,research-article,www.jstor.org/stable/2096207,...,http://www.jstor.org/stable/2096207,Josef Brüderl; Peter Preisendörfer; Rolf Ziegler,American Sociological Association,eng,227,242,,10467,16,part-1.jsonl.gz
4,10.2307_2391724,Dimensions of Organizational Influence and The...,https://www.jstor.org/stable/2391724,"In this study participativeness, centralizatio...",Administrative Science Quarterly,"[Sidney, Hyman P., Riv-Ellen, Stephen, Thomas,...",Management & Organizational Behavior,1971,research-article,www.jstor.org/stable/2391724,...,http://www.jstor.org/stable/2391724,Johannes M. Pennings,"Sage Publications, Inc.",eng,688,699,,6022,12,part-1.jsonl.gz


In [19]:
mm_full = full.merge(m, on = 'edited_filename', how = 'left')[['text', 'edited_filename', 'journal_title', 'publicationYear']]
mm_full = mm_full[~mm_full['publicationYear'].isna()]

In [20]:
mm_full.head()

Unnamed: 0,text,edited_filename,journal_title,publicationYear
0,"[[research, note, church_membership, netherlan...",10.2307_1387034,Journal for the Scientific Study of Religion,1990.0
1,"[[polish, io_oo, sociological_review, issn, co...",10.2307_41274754,Polish Sociological Review,2000.0
2,"[[article, jjdlbsj, grapliy, compassionate, eg...",10.2307_24467156,Ethnography,2014.0
3,"[[reply, allison, more, comparing, regression_...",10.2307_2782279,American Journal of Sociology,1995.0
4,"[[determinants, spousal, interaction, marital,...",10.2307_351656,Journal of Marriage and Family,1983.0


## Test dict phrase recognition

In [21]:
sample_df = mm_full.sample(500)
print(sample_df.shape)
sample_df.text.iloc[0]

(500, 4)


[['humboldt_journal',
  'social',
  'relations_volume',
  'minority',
  'relations',
  'conflict',
  'emerging',
  'european',
  'community',
  'specifically',
  'germany_france',
  'great_britain',
  'lutz',
  'holzner',
  'university_wisconsin',
  'milwaukee_milwaukee',
  'wl',
  'abstract',
  'europe',
  'home',
  'more',
  'distinct',
  'ethnic',
  'national',
  'populations',
  'than',
  'any',
  'other',
  'world',
  'region',
  'comparable',
  'size',
  'almost',
  'all',
  'european',
  'ethnic',
  'national',
  'peoples',
  'have',
  'settled',
  'larger',
  'smaller',
  'more',
  'less',
  'geographically_contiguous',
  'ethnically_homogeneous',
  'homelands',
  'own',
  'territorial',
  'settlement',
  'areas',
  'which',
  'have',
  'fought',
  'throughout',
  'history',
  'secure',
  'themselves',
  'natives',
  'constitute',
  'majority',
  'demand',
  'authority',
  'over',
  'own',
  'core',
  'culture',
  'recent',
  'immigrations',
  'from',
  'third_world',
  'countr

In [22]:
dict_fp = root + 'dictionary_methods/dictionaries/'

In [23]:
# Load original dictionaries
cult_orig = pd.read_csv(dict_fp + 'original/cultural_original.csv', delimiter = '\n', 
                        header=None)[0].apply(lambda x: x.replace(',', ' '))
dem_orig = pd.read_csv(dict_fp + 'original/demographic_original.csv', delimiter = '\n', 
                       header=None)[0].apply(lambda x: x.replace(',', ' '))
relt_orig = pd.read_csv(dict_fp + 'original/relational_original.csv', delimiter = '\n', 
                        header=None)[0].apply(lambda x: x.replace(',', ' '))

In [24]:
# Filter dicts to MWEs/bigrams & trigrams
orig_dicts = (pd.concat((cult_orig, dem_orig, relt_orig))).tolist() # full list of dictionaries
orig_ngrams = set([term for term in orig_dicts if len(term.split()) > 1]) # filter to MWEs
print(len(orig_ngrams))
orig_ngrams

219


{'account rationally',
 'age dependencies',
 'age dependent',
 'avoidance inspection',
 'barrier to entry',
 'barrier to exit',
 'barriers to entry',
 'barriers to exit',
 'board directors',
 'boards directors',
 'business association',
 'business associations',
 'carrying capacity',
 'categorical end',
 'categorical ends',
 'categorical rule',
 'categorical rules',
 'centralization resources',
 'ceremonial action',
 'ceremonial actions',
 'ceremonial assessment',
 'ceremonial assessments',
 'ceremonial conformity',
 'ceremonial evaluation',
 'ceremonial evaluations',
 'ceremonial inspection',
 'ceremonial inspections',
 'ceremonial requirement',
 'ceremonial requirements',
 'ceremonial rule',
 'ceremonial rules',
 'chance survival',
 'chances survival',
 'co operate',
 'co operated',
 'co operates',
 'co operating',
 'co operation',
 'co opt',
 'co optation',
 'co opted',
 'co opting',
 'co opts',
 'coarse grained',
 'collective rationality',
 'competition coefficient',
 'competition 

In [25]:
def fix_ngrams(article, 
               ngrams_list, 
               delimiter = b'_'):
    '''
    Detects and fixes multi-word expressions (MWEs) from ngrams list input. 
    Works with phrases up to three words long (trigrams).
    Returns the input text (article) in same format as input: list of lists of str.
    
    Args:
        article (list): list of lists of words (each list is a sentence)
        ngrams_list (list): list of multi-word expressions, i.e. bigrams and trigrams
        delimiter (str): to join together the words in multi-word expressions
    Returns:
        article_fixed (list): modified list of lists of words (each list is a sentence), with ngrams from list joined with delimiter
    '''
    
    article_fixed = []
    
    for sent in article: # loop over sentences (each a list of words)
        for ngram in ngrams_list:
            sent = re.sub(ngram, ngram.replace(' ', '_'), ' '.join(sent)).split() # replace space in each ngram with delimiter 
        article_fixed.append(sent)
        
    return article_fixed

In [26]:
print(sample_df.shape)
sample_df.head(10)

(500, 4)


Unnamed: 0,text,edited_filename,journal_title,publicationYear
34553,"[[humboldt_journal, social, relations_volume, ...",10.2307_23262733,Humboldt Journal of Social Relations,1993.0
31407,"[[mws_issn, transatlantic, connections, cosmop...",10.2307_24580001,Max Weber Studies,2005.0
12348,"[[quality, outpatient, pediatric, care, influe...",10.2307_2136582,Journal of Health and Social Behavior,1978.0
7498,"[[sources, psychology, religion, journal, arti...",10.2307_1385382,Journal for the Scientific Study of Religion,1979.0
28009,"[[plain_text, page_sequence, book_reviews, str...",10.1086_210293,American Journal of Sociology,1999.0
13523,"[[hitotsubashi_journal, social, studies, ©_hit...",10.2307_43294448,Hitotsubashi Journal of Social Studies,2000.0
54328,"[[maternal_employment, during, northern, vietn...",10.2307_3598348,Social Forces,2004.0
23887,"[[sa, sociological_review, vol, tyranny, conce...",10.2307_44461144,South African Sociological Review,1989.0
61570,"[[strategic_management, journal, vol, national...",10.2307_2486770,Strategic Management Journal,1995.0
75,"[[velma_mcbride, murry, special, section, edit...",10.2307_3599803,Journal of Marriage and Family,2001.0


In [27]:
sample_df.text.iloc[0]

[['humboldt_journal',
  'social',
  'relations_volume',
  'minority',
  'relations',
  'conflict',
  'emerging',
  'european',
  'community',
  'specifically',
  'germany_france',
  'great_britain',
  'lutz',
  'holzner',
  'university_wisconsin',
  'milwaukee_milwaukee',
  'wl',
  'abstract',
  'europe',
  'home',
  'more',
  'distinct',
  'ethnic',
  'national',
  'populations',
  'than',
  'any',
  'other',
  'world',
  'region',
  'comparable',
  'size',
  'almost',
  'all',
  'european',
  'ethnic',
  'national',
  'peoples',
  'have',
  'settled',
  'larger',
  'smaller',
  'more',
  'less',
  'geographically_contiguous',
  'ethnically_homogeneous',
  'homelands',
  'own',
  'territorial',
  'settlement',
  'areas',
  'which',
  'have',
  'fought',
  'throughout',
  'history',
  'secure',
  'themselves',
  'natives',
  'constitute',
  'majority',
  'demand',
  'authority',
  'over',
  'own',
  'core',
  'culture',
  'recent',
  'immigrations',
  'from',
  'third_world',
  'countr

In [28]:
fake_text = [['great', 'ceremonial', 'assessment', 'highly'], ['lovely', 'business', 'association']]
fix_ngrams(fake_text, ngrams_list = orig_ngrams)

[['great', 'ceremonial_assessment', 'highly'],
 ['lovely', 'business_association']]

In [29]:
# Detect & fix MWEs
tqdm.pandas(desc='Fixing dict MWEs')
#print('Fixing dict MWEs...')
sample_df['text'] = sample_df['text'].progress_apply(
    lambda text: fix_ngrams(text, ngrams_list = orig_ngrams))

Fixing dict MWEs:   0%|          | 0/500 [00:00<?, ?it/s]

## Test phraser

In [30]:
def get_phrased(article, phrase_model):
    '''
    Parse phrases in article using phrase-finding model.
    
    Args:
        article: list of lists of words (each list is a sentence)
    Returns:
        article: same format, with phrases inserted where appropriate
    '''
    
    article = [phrase_model[sent] for sent in article] 
        
    return article

In [31]:
def stopwords_jstor(stop = True, junk = True):
    """Define JSTOR words to remove ("stop words"): those used by JSTOR and/or junk formatting words.
    
    Args:
        stop (boolean): whether to return stop words used by JSTOR when creating their ngram files
        junk (boolean): whether to return junk formatting words common in JSTOR's raw OCR text files
    
    Returns:
        combined_stop_words (list of str): words to avoid when dealing with jstor data
    """
    
    # define same stopwords used by JSTOR when creating ngram files
    jstor_stop_words = set(["a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]) 
    
    if stop and not junk: # don't combine
        return jstor_stop_words
        
    # define junk/formatting terms to avoid
    junk_words = ['colwidth', 'colname', 'char', 'rowsep', 'colsep', 
                  'oasis', 'pp', 'fn', 'sec', 'pi', 'sc', 'id', 
                  'cyr', 'extcyr', 'caption', 'newcommand', 
                  'normalfont', 'selectfont', 'documentclass', 'aastex', 
                  'declaremathsizes', 'declaretextfontcommand', 
                  'pagestyle', 'xlink:type', 'sub', 'sup', 'nameend', 'pgwide', 
                  'tbody', 'tgroup', 'sup', 'tbfna', 'morerows', 
                  'xlink:href', 'fg.tiff', 'tb.eps', 'df.eps', 'χ', 
                  'xmlns:oasis', 'dtd', 'drd', 'xmlns:oasis', 'http', 
                  'docs.oasis', 'open.org ns'] 
    
    if junk and not stop:
        return junk_words
    
    if stop and junk:
        combined_stop_words = set(list(jstor_stop_words) + junk_words) # simplest way to avoid stopwords and formatting words: combine them!
        return combined_stop_words
    
    
jstor_stopwords = stopwords_jstor(junk = False) # get stopwords for JSTOR (no junk formatting words)

In [32]:
# Add each sentence from each article to empty list, making long list of all sentences:
sent_list = []; sample_df['text'].apply(lambda article: sent_list.extend([sent for sent in article]))
sent_list[:3]

[['humboldt_journal',
  'social',
  'relations_volume',
  'minority',
  'relations',
  'conflict',
  'emerging',
  'european',
  'community',
  'specifically',
  'germany_france',
  'great_britain',
  'lutz',
  'holzner',
  'university_wisconsin',
  'milwaukee_milwaukee',
  'wl',
  'abstract',
  'europe',
  'home',
  'more',
  'distinct',
  'ethnic',
  'national',
  'populations',
  'than',
  'any',
  'other',
  'world',
  'region',
  'comparable',
  'size',
  'almost',
  'all',
  'european',
  'ethnic',
  'national',
  'peoples',
  'have',
  'settled',
  'larger',
  'smaller',
  'more',
  'less',
  'geographically_contiguous',
  'ethnically_homogeneous',
  'homelands',
  'own',
  'territorial',
  'settlement',
  'areas',
  'which',
  'have',
  'fought',
  'throughout',
  'history',
  'secure',
  'themselves',
  'natives',
  'constitute',
  'majority',
  'demand',
  'authority',
  'over',
  'own',
  'core',
  'culture',
  'recent',
  'immigrations',
  'from',
  'third_world',
  'countr

In [33]:
[sent for sent in sample_df['text'].iloc[0]]

[['humboldt_journal',
  'social',
  'relations_volume',
  'minority',
  'relations',
  'conflict',
  'emerging',
  'european',
  'community',
  'specifically',
  'germany_france',
  'great_britain',
  'lutz',
  'holzner',
  'university_wisconsin',
  'milwaukee_milwaukee',
  'wl',
  'abstract',
  'europe',
  'home',
  'more',
  'distinct',
  'ethnic',
  'national',
  'populations',
  'than',
  'any',
  'other',
  'world',
  'region',
  'comparable',
  'size',
  'almost',
  'all',
  'european',
  'ethnic',
  'national',
  'peoples',
  'have',
  'settled',
  'larger',
  'smaller',
  'more',
  'less',
  'geographically_contiguous',
  'ethnically_homogeneous',
  'homelands',
  'own',
  'territorial',
  'settlement',
  'areas',
  'which',
  'have',
  'fought',
  'throughout',
  'history',
  'secure',
  'themselves',
  'natives',
  'constitute',
  'majority',
  'demand',
  'authority',
  'over',
  'own',
  'core',
  'culture',
  'recent',
  'immigrations',
  'from',
  'third_world',
  'countr

In [34]:
from datetime import date # For working with dates & times
thisday = date.today().strftime("%m%d%y")

from gensim.models.phrases import Phrases # for gathering multi-word expressions
phrase_finder = Phrases(sent_list, min_count=5, delimiter='_', threshold=10) 

phraser_fp = f'TEST_phraser_{str(len(sent_list))}_sents_{str(thisday)}.pkl' # Set phraser filepath
phrase_finder.save(phraser_fp) # save dynamic model (can still be updated)
phrase_finder = phrase_finder.freeze() # Freeze model after saving; more efficient, no more updating

tqdm.pandas(desc='Parsing common phrases in texts')
#print(' Parsing common phrases in texts...')
sample_df['text'] = sample_df['text'].progress_apply(
    lambda text: get_phrased(text, phrase_finder))



Parsing common phrases in texts:   0%|          | 0/500 [00:00<?, ?it/s]

## Testing parallelized cleaning

In [56]:
# Import packages
import re, datetime
import string # for one method of eliminating punctuation
from nltk.corpus import stopwords # for eliminating stop words
from sklearn.feature_extraction import text
from nltk.stem.porter import PorterStemmer; ps = PorterStemmer() # approximate but effective (and common) method of normalizing words: stems words by implementing a hierarchy of linguistic rules that transform or cut off word endings
import os # for working with file trees
import numpy as np
from enchant import Dict; check_english_enchant = Dict("en_US")  # dictionary of english words for language filtering 
from nltk.tag import pos_tag #to look for proper nouns when cleaning text

In [57]:
def clean_sentence(sentence, 
                          unhyphenate = True, 
                          lowercase = True, 
                          remove_numbers = True, 
                          remove_acronyms = False, 
                          remove_stopwords = True, 
                          remove_propernouns = False, 
                          return_string = False):
    
    '''
    Cleans up articles by removing unicode formatting and extra whitespaces; 
    re-joining words split by (hyphenated at) end of line; 
    removing numbers (by default) and acronyms (not by default); 
    tokenizing sentences into words using the Apache Lucene Tokenizer (same as JSTOR); 
    lower-casing words; 
    removing stopwords (same as JSTOR), junk formatting words, junk sentence fragments, 
    and proper nouns (the last not by default).
    
    Args:
        sentence (str): sentence that possibly includes spaces and punctuation
        unhyphenate (binary): whether to join any lingering hyphens at end of line (i.e., words ending with '- ')
        lowercase (binary): whether to lower-case each word
        remove_numbers (binary): whether to remove any chars that are digits
        remove_acronyms (binary): whether t
        remove_stopwords (binary): whether to remove stopwords
        remove_propernouns (binary): boolean, removes nouns such as names, etc.
        return_string (binary): return string instead of list of tokens (useful for infersent)  

    Returns:
        list of str: each element of list is a word
    '''
    
    # Replace unicode spaces, tabs, and underscores with spaces, and remove whitespaces from start/end of sentence:
    sentence = sentence.encode('utf-8').decode('utf-8').replace(u"\xa0", u" ").replace(u"\\t", u" ").replace(u"_", u" ").strip(" ")

    if unhyphenate:              
        ls = re.findall(r"\w+-\s\w+", sentence)
        if len(ls) > 0:
            ls_new = [re.sub(r"- ", "", word) for word in ls]
            for i in range(len(ls)):
                sentence= sentence.replace(ls[i], ls_new[i])
                
    if remove_numbers:
        #sentence = re.sub(r"\b[0-9]+\b\s*", "", sentence) # remove words made up of numbers
        #sentence = re.sub(r"\b.*[0-9]+\S*\b\s*", "", sentence) # remove words containing numbers
        sentence = re.sub(r"\d+", "", sentence) # remove numbers from anywhere
        
    sentence = re.sub(r"\b[a-zA-Z]\b", "", sentence) #remove any single letter words
    
    if remove_acronyms:
        sentence = re.sub(r"\b[A-Z][A-Z]+\b\s+", "", sentence)
    
    # Tokenize
    sent_list = sentence.split() #apache_tokenize(sentence, lowercase = lowercase)
        
    # Remove same stopwords as JSTOR, also junk formatting words
    if remove_stopwords:
        stop_words = stopwords_jstor(stop = True, junk = True) # stopwords for JSTOR and junk formatting words
        sent_list = [word for word in sent_list if 
                     word not in stop_words and 
                     ("valign" not in word)] # one more meddlesome formatting word: "valign"
        
    if not remove_stopwords: # at least remove junk words/residual formatting
        junk_words = stopwords_jstor(stop = False, junk = True)
        sent_list = [word for word in sent_list if 
                     word not in junk_words and 
                     ("valign" not in word)] # one more meddlesome formatting word: "valign"
        
    # Remove common sentences made of formatting (junk) words
    blacklist_sents = ['valign bottom oasis entry oasis entry colname colsep rowsep align char char', 
                       'oasis entry oasis entry colname colsep rowsep align char char', 
                       'oasis entry colname colsep rowsep align char char', 
                       'valign bottom oasis entry colname colsep rowsep align char char', 
                       'colsep rowsep oasis entry align char char', 
                       'oasis entry oasis entry colsep rowsep align char char', 
                       'colsep rowsep oasis entry oasis entry align char char', 
                       'bottom entry', 'align center', 'align left', 
                       'colspec colnum', 'usepackage amsbsy', 'usepackage amsfonts', 
                       'usepackage amssymb', 'usepackage bm', 
                       'usepackage mathrsfs', 'usepackage pifont', 
                       'usepackage stmaryrd', 'usepackage textcomp', 
                       'position float', 'alt version', 'mimetype image', 
                       'italic italic', 'italic ij', 'begin document', 
                       'inline formula', 'entry namest', 'frame topbot', 
                       'orient port', 'list item', 'table wrap', 'tbody top', 
                       'disp formula', 'fig group', 'top entry', 
                       'tex math notation latex', 'usepackage amsmath amsxtra', 
                       'usepackage ot ot fontenc', 'renewcommand rmdefault wncyr', 
                       'renewcommand sfdefault wncyss', 'renewcommand encodingdefault ot', 
                       'end document tex math', 'entry align entry', 'entry align left top', 
                       'align right', 'table wrap foot', 'top break entry', 
                       'table xml exchange table model en', 
                       'exchange table', 'label table label', 'tgroup cols align left', 
                       'disp formula df', 'entry align left top entry', 
                       'fig position float fig type figure', 'fig group', 'graphic ', 
                       'bottom yes entry', 'bottom model entry', 'bottom sd entry', 
                       'entry align left top italic df  italic lt entry',
                       'graphic tb eps', 'bottom mean entry', 
                       'bottom mse entry', 'bottom total entry', 
                       'italic df italic two tailed entry', 'label fig label', 
                       'bottom configuration sets entry', 'bottom continuation rate entry', 
                       'bottom continuation rate other configurations entry', 
                       'italic white boys risk italic', 
                       'bottom italic df italic entry']
    if sent_list in blacklist_sents:
        return('')
        
    # If True, include the proper nouns in stop_words_list
    if remove_propernouns:              
#         doc = nlp(sentence) # Create a document object in spacy
#         proper_nouns = gather_propernouns(doc) # Creates a wordbank of proper nouns we should exclude
        #trying to gather proper nouns by passing in pure sentence in gather_propernouns
        proper_nouns = gather_propernouns(sentence)
        print(proper_nouns)
        # Remove each proper noun from sentence:
        sent_list = [word for word in sent_list if word not in proper_nouns]
        #for term in proper_nouns: # Loop over wordbank
        #    sentence = re.sub(term, "", sentence) # Less effective because removes characters from within terms
    
    if return_string:
        return ' '.join(sent_list) # Return clean, tokenized sentence (string)
    
    return sent_list

In [60]:
def preprocess_text(article, 
                    filter_english = False,
                    shorten = False, 
                    longest = 999999, 
                    shortest = 0, 
                    maxlen = 999999, 
                    minlen = 0):
    '''
    Cleans up articles by removing page marker junk, 
    unicode formatting, and extra whitespaces; 
    re-joining words split by (hyphenated at) end of line; 
    removing numbers (by default) and acronyms (not by default); 
    tokenizing sentences into words using the Apache Lucene Tokenizer (same as JSTOR); 
    lower-casing words; 
    removing stopwords (same as JSTOR), junk formatting words, junk sentence fragments, 
    and proper nouns (the last not by default).
    
    Args:
        article (str): lots of sentences with punctuation etc, often long
        filter_english (boolean): if True, keep only sentence words that match PyEnchant English dictionary
        shorten (boolean): if True, shorten sentences to at most maxlen words
        longest (int): number of words in longest article in corpus (get this elsewhere)
        shortest (int): number of words in shortest article in corpus (depends on filtering)
        maxlen (int): maximum number of words to return per article; default is huge number, set lower if shorten == True
        minlen (int): minimum number of words to return per article
        
    Returns:
        list of lists of str: each element of list is a sentence, each sentence is a list of words
    '''
            
    # Remove page marker junk
    article = article.replace('<plain_text><page sequence="1">', '')
    article = re.sub(r'</page>(\<.*?\>)', ' \n ', article)
    article = re.sub(r'<.*?>', '', article)
    article = re.sub(r'<body.*\n\s*.*\s*.*>', '', article)
    
    # Filter to English words, if set to do so
    if filter_english:
        '''
        article_filtered = []
        print(article)
        for sent in article:
            sent = [word for word in sent if check_english_enchant.check(word)]
            article_filtered.append(sent)
        article = article_filtered
        '''
        article = ' '.join([word for sent in article.split('\n') for word in sent if check_english_enchant.check(word)])
    
    # Compute maximum length for this article: from minlen to maxlen, gradated depending on longest
    if shorten:
        article_length = len(article.split()) # tokenize (split by spaces) then count # words in article
        
        if article_length > minlen: # if article is longer than minimum length to extract, decide how much to extract
            maxlen = get_maxlen(article_length, 
                                longest, 
                                shortest, 
                                maxlen, 
                                minlen)
        elif article_length <= minlen: # if article isn't longer than minimum length to extract, just take whole thing
            shorten = False # don't shorten
    
    doc = [] # list to hold tokenized sentences making up article
    numwords = 0 # initialize word counter
    
    if shorten:
        while numwords < maxlen: # continue adding words until reaching maxlen
            for sent in article.split('\n'):
                #sent = clean_sent(sent)
                sent = [word for word in clean_sentence(sent, 
                                                               unhyphenate=True, 
                                                               remove_numbers=True, 
                                                               remove_acronyms=False, 
                                                               remove_stopwords=True, 
                                                               remove_propernouns=False, 
                                                               return_string=False) if word != ''] # remove empty strings

                if numwords < maxlen and len(sent) > 0:
                    gap = int(maxlen - numwords)
                    if len(sent) > gap: # if sentence is bigger than gap between current numwords and max # words, shorten it
                        sent = sent[:gap] 
                    doc.append(sent)
                    numwords += len(sent)

                if len(sent) > 0:
                    doc.append(sent)
                    numwords += len(sent)
    
    else: # take whole sentence (don't shorten)
        for sent in article.split('\n'):
            #sent = clean_sent(sent)
            sent = [word for word in clean_sentence(sent, 
                                                           unhyphenate=True, 
                                                           remove_numbers=True, 
                                                           remove_acronyms=False, 
                                                           remove_stopwords=True, 
                                                           remove_propernouns=False, 
                                                           return_string=False) if word != ''] # remove empty strings
            
            if len(sent) > 0:
                doc.append(sent)

    return doc

In [61]:
from multiprocessing import Pool, cpu_count; cores = cpu_count() # count cores

def parallelize(data, func, numprocesses=8):
    '''
    Splits data into subsets and applies function to each part with apply via `run_on_subset` function in a different process.
    '''
    data_split = np.array_split(data, numprocesses)
    pool = Pool(numprocesses)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

def run_on_subset(func, data_subset, colname, desc=''):
    '''Applies func to colname of data_subset, with progress bar indicated by desc'''
    tqdm.pandas(desc=desc)
    return data_subset[colname].progress_apply(func)

def parallelize_on_rows(data, func, numprocesses=8, colname='', desc=''):
    '''
    Uses multiprocessing to apply func to a single column of a pandas dataframe. Splits the dataframe into subsets and applies function to each part with apply via `run_on_subset` function in a different process.
    So `df.apply(some_func, axis=1)` becomes `parallelize_on_rows(df, some_func)`.
    
    Args:
        data (DataFrame): data on which to operate
        numprocesses (int): how many processes to run/cpus to use
        colname (str): column on which to operate
        desc (str): what the progress bar will say (keep it short)
        func: the function to apply to the data
    Returns:
        data (DataFrame): modified DataFrame
    '''
    
    return parallelize(data, partial(run_on_subset, func, colname='text', desc=''), numprocesses)


print('Cleaning text files...')
tqdm.pandas(desc='Cleaning text files')
articles = parallelize_on_rows(articles,
                               func = partial(preprocess_text, 
                                              filter_english = True,
                                              shorten = False), #longest = 75000, shortest = 1000, maxlen = 1000, minlen = 500))
                               numprocesses = cores-6,
                               colname = 'text', 
                               desc = 'Cleaning text files')

Cleaning text files...


Process ForkPoolWorker-98:
Process ForkPoolWorker-83:
Process ForkPoolWorker-95:
Process ForkPoolWorker-103:
Process ForkPoolWorker-93:
Process ForkPoolWorker-92:
Process ForkPoolWorker-96:
Process ForkPoolWorker-88:
Process ForkPoolWorker-100:
Process ForkPoolWorker-81:
Process ForkPoolWorker-90:
Process ForkPoolWorker-104:
Process ForkPoolWorker-91:
Process ForkPoolWorker-99:
Process ForkPoolWorker-84:
Process ForkPoolWorker-79:


KeyboardInterrupt: 

Process ForkPoolWorker-80:
Process ForkPoolWorker-86:
Process ForkPoolWorker-82:
Process ForkPoolWorker-97:
Process ForkPoolWorker-102:
Process ForkPoolWorker-89:
Process ForkPoolWorker-85:
Process ForkPoolWorker-87:
Process ForkPoolWorker-101:
Process ForkPoolWorker-94:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call 

## Remove HTML

remove html tags in unstructured text data.

In [27]:
import itertools


def get_full_text(text):
    full_text=[]
    for i in text:
        joined = list(itertools.chain(*i))
        full_text.append(" ".join(joined))
    return full_text

mm_full['full_text'] = get_full_text(mm_full['text'])



def remove_tags(article):
    article = re.sub('<plain_text> <page sequence="1">', '', article)
    article = re.sub(r'</page>(\<.*?\>)', ' \n ', article)
    # xml tags
    article = re.sub(r'<.*?>', '', article)
    article = re.sub(r'<body.*\n\s*.*\s*.*>', '', article)
    return article

mm_full['text_no_tags'] = mm_full['full_text'].apply(remove_tags)


## Remove stop words, punctuations, lemmatize

In [28]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 5.8 MB/s eta 0:00:01
Collecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (769 kB)
[K     |████████████████████████████████| 769 kB 103.0 MB/s eta 0:00:01
Installing collected packages: regex, nltk
Successfully installed nltk-3.8 regex-2022.10.31


In [29]:
import nltk
nltk.download('words')

[nltk_data] Downloading package words to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [30]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [31]:
from nltk.corpus import stopwords
import datetime

In [32]:
# Prep dictionaries of English words
from nltk.corpus import words # Dictionary of 236K English words from NLTK
english_nltk = set(words.words()) # Make callable
english_long = set() # Dictionary of 467K English words from https://github.com/dwyl/english-words
# fname =  "english_words.txt" # Set file path to long english dictionary
# with open(fname, "r") as f:
#     for word in f:
#         english_long.add(word.strip())
        
def stopwords_make(vocab_path_old = "", extend_stopwords = False):
    """Create stopwords list. 
    If extend_stopwords is True, create larger stopword list by joining sklearn list to NLTK list."""
                                                     
    stop_word_list = list(set(stopwords.words("english"))) # list of english stopwords

    # Add dates to stopwords
    for i in range(1,13):
        stop_word_list.append(datetime.date(2008, i, 1).strftime('%B'))
    for i in range(1,13):
        stop_word_list.append((datetime.date(2008, i, 1).strftime('%B')).lower())
    for i in range(1, 2100):
        stop_word_list.append(str(i))

    # Add other common stopwords
    stop_word_list.append('00') 
    stop_word_list.extend(['mr', 'mrs', 'sa', 'fax', 'email', 'phone', 'am', 'pm', 'org', 'com', 
                           'Menu', 'Contact Us', 'Facebook', 'Calendar', 'Lunch', 'Breakfast', 
                           'facebook', 'FAQs', 'FAQ', 'faq', 'faqs']) # web stopwords
    stop_word_list.extend(['el', 'en', 'la', 'los', 'para', 'las', 'san']) # Spanish stopwords
    stop_word_list.extend(['angeles', 'diego', 'harlem', 'bronx', 'austin', 'antonio']) # cities with many charter schools

    # Add state names & abbreviations (both uppercase and lowercase) to stopwords
    states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 
              'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 
              'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 
              'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 
              'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WI', 'WV', 'WY', 
              'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 
              'Colorado', 'Connecticut', 'District of Columbia', 'Delaware', 'Florida', 
              'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 
              'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 
              'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 
              'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 
              'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 
              'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 
              'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 
              'Vermont', 'Virginia', 'Washington', 'Wisconsin', 'West Virginia', 'Wyoming' 
              'carolina', 'columbia', 'dakota', 'hampshire', 'mexico', 'rhode', 'york']
    for state in states:
        stop_word_list.append(state)
    for state in [state.lower() for state in states]:
        stop_word_list.append(state)
        
    # Add even more stop words:
    if extend_stopwords == True:
        stop_word_list = text.ENGLISH_STOP_WORDS.union(stop_word_list)
        
    # If path to old vocab not specified, skip last step and return stop word list thus far
    if vocab_path_old == "":
        return stop_word_list

    # Add to stopwords useless and hard-to-formalize words/chars from first chunk of previous model vocab (e.g., a3d0, \fs19)
    # First create whitelist of useful terms probably in that list, explicitly exclude from junk words list both these and words with underscores (common phrases)
    whitelist = ["Pre-K", "pre-k", "pre-K", "preK", "prek", 
                 "1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th", "11th", "12th", 
                 "1st-grade", "2nd-grade", "3rd-grade", "4th-grade", "5th-grade", "6th-grade", 
                 "7th-grade", "8th-grade", "9th-grade", "10th-grade", "11th-grade", "12th-grade", 
                 "1st-grader", "2nd-grader", "3rd-grader", "4th-grader", "5th-grader", "6th-grader", 
                 "7th-grader", "8th-grader", "9th-grader", "10th-grader", "11th-grader", "12th-grader", 
                 "1stgrade", "2ndgrade", "3rdgrade", "4thgrade", "5thgrade", "6thgrade", 
                 "7thgrade", "8thgrade", "9thgrade", "10thgrade", "11thgrade", "12thgrade", 
                 "1stgrader", "2ndgrader", "3rdgrader", "4thgrader", "5thgrader", "6thgrader", 
                 "7thgrader", "8thgrader", "9thgrader", "10thgrader", "11thgrader", "12thgrader"]
    with open(vocab_path_old) as f: # Load vocab from previous model
        junk_words = f.read().splitlines() 
    junk_words = [word for word in junk_words[:8511] if ((not "_" in word) 
                                                         and (not any(term in word for term in whitelist)))]
    stop_word_list.extend(junk_words)
                                                     
    return stop_word_list

In [33]:
stop_words = stopwords_make(vocab_path_old = "", extend_stopwords = False)

In [34]:
import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# stop_words = stopwords.words('english')
# stop_words.extend(['one', 'two', 'three', 'amp', 'may', 'can', 'new', 'also', 'and'])

import string
import re
import nltk

def word_process(tt):
    """
    helper function to lower text, remove stop words, numbers, and empty strings 
    """
    
    
    
    tt = tt.lower()
    
    punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~=\n'''
    # Removing punctuations in string 
    # Using loop + punctuation string 


    for ele in tt:  
        if ele in punc:  
            tt = tt.replace(ele, " ")  

    # read tokens
    tokens = tt.split()
    lst = [token.translate(punc).lower() for token in tokens ]
    
    #remove stop words
    filtered = []
    for i in lst:
        if i not in stop_words:
            filtered.append(i)
    
    # removing singular numbers and singular letters
    pattern = '[0-9]'
    filtered = [re.sub(pattern, '', i) for i in filtered] 
    new = []
    for inp in filtered:
        new.append(' '.join( [w for w in inp.split() if len(w)>1] ))
        
    # filter out empty strings 
    new = [i for i in new if i] 

    dt = [d.split() for d in new]
    
    return dt


In [35]:
from tqdm import tqdm
tqdm.pandas()

In [36]:
mm_full['processed'] =  mm_full['text_no_tags'].progress_apply(word_process)


mm_full['processed'] = [sum(i, []) for i in mm_full['processed']]

combo_train_df = mm_full

  0%|          | 102/63038 [00:19<3:23:58,  5.14it/s]


KeyboardInterrupt: 

## Split datasets into decades & save

split the dataset into 4 decades.

In [None]:
first_decade = combo_train_df[combo_train_df['publicationYear'] <= 1979]
second_decade = combo_train_df[(combo_train_df['publicationYear'] >= 1980) & (combo_train_df['publicationYear'] <= 1989) ]

third_decade = combo_train_df[(combo_train_df['publicationYear'] >= 1990) & (combo_train_df['publicationYear'] <= 1999) ]


fourth_decade = combo_train_df[(combo_train_df['publicationYear'] >= 2000) & (combo_train_df['publicationYear'] <= 2015) ]


first_decade.to_csv('first_decade.csv')
second_decade.to_csv('second_decade.csv')
third_decade.to_csv('third_decade.csv')
fourth_decade.to_csv('fourth_decade.csv')

## Preprocess text for each decade

### Remove surnames, use enchant library to filter out non-English words

In [None]:
os.getcwd()

In [None]:
first_decade = pd.read_csv('first_decade.csv')
second_decade = pd.read_csv('second_decade.csv')
third_decade = pd.read_csv('third_decade.csv')
fourth_decade = pd.read_csv('fourth_decade.csv')

In [None]:
import pandas as pd
import ast
import pickle

In [None]:
## remove surnames

my_file = open("surnames.txt", "r")
data = my_file.read()
surnames = data.split(",")
surnames = [i.replace("'", '').strip() for i in surnames]
my_file.close()

In [None]:
! pip install pyenchant

In [None]:
import enchant
valid_d = enchant.Dict("en_US") 

In [None]:
def process_for_decade(decade_df, decade):
    """
    remove surnames, filter by enchant dictionary, and filter by word length
    """
    
    p = [ast.literal_eval(i) for i in decade_df['processed']]
    
    processed_again = []

    for i in tqdm(p):
        k = [j for j in i if j not in surnames]
        processed_again.append(k)
        
    processed_twice = []
    for i in tqdm(processed_again):
        k = [el for el in i if el.isalpha() and valid_d.check(el)]
        processed_twice.append(k)
    
#     for k in processed_twice:
#         processed_again2.append([i for i in k if len(i)>2])
    
    with open('processed_corp_enchant_' + decade +'.pkl', 'wb') as f:
        pickle.dump(processed_twice, f)
    
    return processed_twice

In [None]:
import pickle
with open('processed_corp_enchant.pkl', 'rb') as f:
    processed_first_decade = pickle.load(f)

In [None]:
import pickle
with open('processed_corp_enchant.pkl', 'rb') as f:
    processed_second_decade = pickle.load(f)

In [None]:
import pickle
with open('processed_corp_enchant.pkl', 'rb') as f:
    processed_second_decade = pickle.load(f)

In [None]:
processed_first_decade = process_for_decade(first_decade, 'first')

In [None]:
processed_second_decade = process_for_decade(second_decade, 'second')


In [None]:
processed_third_decade = process_for_decade(third_decade, 'third')


In [None]:
processed_fourth_decade = process_for_decade(fourth_decade, 'fourth')

## Train gensim phrased word2vec models

In [None]:
!pip install gensim

In [None]:
from gensim.models.phrases import Phrases

build bigrams to create phased w2v models.

In [None]:

from gensim.models.phrases import Phrases, Phraser
def build_phrases(sentences):
    phrases = Phrases(sentences, min_count = 5, threshold = 7, progress_per = 1000)
    return Phraser(phrases)

In [None]:
bigram1 = build_phrases(processed_first_decade)
# bigram2 = build_phrases(processed_second_decade)
# bigram3 = build_phrases(processed_third_decade)
# bigram4 = build_phrases(processed_fourth_decade)

In [None]:
processed_bigrams1 = [bigram1[i] for i in tqdm(processed_first_decade)]
# processed_bigrams2 = [bigram2[i] for i in processed_second_decade]
# processed_bigrams3 = [bigram3[i] for i in processed_third_decade]
# processed_bigrams4 = [bigram4[i] for i in processed_fourth_decade]

In [None]:
bigram2 = build_phrases(processed_second_decade)
bigram3 = build_phrases(processed_third_decade)
bigram4 = build_phrases(processed_fourth_decade)

In [None]:
processed_bigrams2 = [bigram2[i] for i in processed_second_decade]
processed_bigrams3 = [bigram3[i] for i in processed_third_decade]
processed_bigrams4 = [bigram4[i] for i in processed_fourth_decade]

In [None]:
processed_bigrams_final1 = []
processed_bigrams_final2 = []
processed_bigrams_final3 = []
processed_bigrams_final4 = []

## strip punctuations
for k in processed_bigrams1:
    processed_bigrams_final1.append([i.strip('!"“#$%&\'()*+,-./:;<=>?@[\\]^`{|}~…') for i in k if len(i)>2])

for k in processed_bigrams2:
    processed_bigrams_final2.append([i.strip('!"“#$%&\'()*+,-./:;<=>?@[\\]^`{|}~…') for i in k if len(i)>2])

for k in processed_bigrams3:
    processed_bigrams_final3.append([i.strip('!"“#$%&\'()*+,-./:;<=>?@[\\]^`{|}~…') for i in k if len(i)>2])

for k in processed_bigrams4:
    processed_bigrams_final4.append([i.strip('!"“#$%&\'()*+,-./:;<=>?@[\\]^`{|}~…') for i in k if len(i)>2])

In [None]:
import multiprocessing
from sklearn import utils
cores = multiprocessing.cpu_count()
import gensim
from gensim.test.utils import get_tmpfile

create one model for each decade.

In [None]:
model_decade_1 = gensim.models.Word2Vec(processed_bigrams_final1, vector_size = 300, window = 10,
                                       min_count = 5, sg = 1, alpha = 0.05, epochs = 50, 
                                       batch_words = 10000, workers = cores, seed = 0, negative = 5,ns_exponent = 0.75)



fname = "word2vec_phrased_filtered_enchant_300d_1970_1979_2022_oct30.bin"
model_decade_1.save(fname)
print("Model Saved!")

In [None]:
model_decade_2 = gensim.models.Word2Vec(processed_bigrams_final2, vector_size = 300, window = 10,
                                       min_count = 5, sg = 1, alpha = 0.05, epochs = 50, 
                                       batch_words = 10000, workers = cores, seed = 0, negative = 5,ns_exponent = 0.75)



fname = "word2vec_phrased_filtered_enchant_300d_1980_1989_2022_oct30.bin"
model_decade_2.save(fname)
print("Model Saved!")

In [None]:
model_decade_3 = gensim.models.Word2Vec(processed_bigrams_final3, vector_size = 300, window = 10,
                                       min_count = 5, sg = 1, alpha = 0.05, epochs = 50, 
                                       batch_words = 10000, workers = cores, seed = 0, negative = 5,ns_exponent = 0.75)



fname = "word2vec_phrased_filtered_enchant_300d_1990_1999_2022_oct30.bin"
model_decade_3.save(fname)
print("Model Saved!")

In [None]:
model_decade_4 = gensim.models.Word2Vec(processed_bigrams_final4, vector_size = 300, window = 10,
                                       min_count = 5, sg = 1, alpha = 0.05, epochs = 50, 
                                       batch_words = 10000, workers = cores, seed = 0, negative = 5,ns_exponent = 0.75)



fname = "word2vec_phrased_filtered_enchant_300d_2000_2016_2022_oct30.bin"
model_decade_4.save(fname)
print("Model Saved!")