In [1]:
import pandas as pd
from ftfy import fix_text # Fix unicode issues
import csv
import re
import spacy
import gensim
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
from gensim.corpora import Dictionary

In [2]:
# Index of the input/output pickle pair handled by this run (interpolated into both paths below).
CURSOR = 4
# Input: pickled DataFrame with the selected articles.
ARTICES_SELECT = f'./output/select/SIOE_journals_1998_2019_{CURSOR}.pickle'
# Output: the DataFrame written at the end of the notebook, including the cleaned token column.
ARTICES_SELECT_CLEAN = f'./output/select/SIOE_journals_1998_2019_{CURSOR}_clean.pickle'
# Two-column CSV file: British spelling in the first column, American spelling in the second.
GB_US_synonyms = './sources/gb-us-synonyms.txt'
# Coarse part-of-speech tags (token.pos_) that is_noise() treats as noise.
# NOTE(review): "PROP" is not a Universal POS tag (the proper-noun tag is "PROPN"),
# so this entry presumably never matches and proper nouns are kept - confirm the intent.
noisy_pos_tags = ["PROP","DET","PART","CCONJ","ADP","PRON","VERB","ADJ"]
# DET   = determiner (definite or indefinite article)
# PART  = particle
# CCONJ = coordinating conjunction
# ADP   = adposition (preposition or postposition) => in
# PRON  = pronoun => I
# VERB, ADJ = verbs and adjectives
min_token_length = 3 # length threshold used by is_noise() to discard short tokens

In [3]:
# Load the selected articles.
# NOTE: unpickling can execute arbitrary code, so only read pickle files from a trusted source.
df = pd.read_pickle(ARTICES_SELECT)

In [4]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,id,review,year,title,keywords,abstract
1,4405,American Economic Review,2019,A Macroeconomic Model of Price Swings in the H...,Credit; Finance; Financial Market; Housing; Ma...,This paper shows that a macro model with segme...
3,4461,American Economic Review,2019,A Spatial Knowledge Economy,Cities; Skill; Spatial,Leading empiricists and theorists of cities ha...


In [5]:
# df.groupby(['year']).size()

In [6]:
# Merge title, abstract and keywords into one text field per article.
# Missing values are replaced by empty strings first: with plain string
# concatenation a single NaN (e.g. an article without keywords) turns the whole
# result into NaN, which later becomes the literal text 'nan' and makes the
# article fail the minimum-length filter even though its abstract is present.
df['title_abstract_keys'] = (
    df['title'].fillna('')
    + ' ' + df['abstract'].fillna('')
    + ' ' + df['keywords'].fillna('')
)

In [7]:
# df['title_abstract_keys'][1]

In [8]:
def is_noise(token, noisy_tags=None, min_length=None):
    '''
    Tell whether a spaCy token should be discarded during cleaning.

    A token is noise when its coarse part-of-speech tag is listed in
    `noisy_tags`, when it is a stop word, a digit, punctuation or whitespace,
    or when its text is shorter than `min_length` characters.

    The length is measured on `token.text`. The previous implementation used
    `token.string`, which includes the trailing whitespace (and no longer
    exists in spaCy 3): the same three-letter word was kept when followed by a
    space but dropped at the end of a text or in front of a hyphen.

    Parameters
    ----------
    token : object exposing `pos_`, `is_stop`, `is_digit`, `is_punct`,
        `is_space` and `text` (a spaCy Token).
    noisy_tags : container of str, optional
        POS tags to reject; defaults to the module-level `noisy_pos_tags`.
    min_length : int, optional
        Minimum number of characters a token needs to be kept; defaults to
        the module-level `min_token_length`.

    Returns
    -------
    bool
        True when the token must be dropped, False when it is kept.
    '''
    # Resolve the defaults at call time so later edits to the config cell are honoured.
    if noisy_tags is None:
        noisy_tags = noisy_pos_tags
    if min_length is None:
        min_length = min_token_length
    return bool(
        token.pos_ in noisy_tags
        or token.is_stop
        or token.is_digit
        or token.is_punct
        or token.is_space
        or len(token.text) < min_length
    )

In [9]:
# Load the small English pipeline. The cleaning below only reads token flags,
# POS tags and lemmas, so the dependency parser and the named-entity recogniser
# are disabled: they are not needed and account for much of the processing time.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [10]:
# Corpus-specific stop words: generic academic vocabulary and abbreviations
# that carry no topical information.
my_stop_words = [
    'abstract', 'article', 'paper', 'effect', 'find', 'iii',
    'e.g.', 'i.e.', 'al.', 'evidence',
    'result', 'results', 'author', 'authors', 'v.s.',
]
# Flag each term as a stop word in the pipeline vocabulary.
for stop_word in my_stop_words:
    lexeme = nlp.vocab[stop_word]
    lexeme.is_stop = True

In [11]:
def get_list(file):
    '''
    Read a two-column CSV file into a dictionary.

    The first column of every row becomes the key and the second column the
    value. Rows with fewer than two columns (blank lines, for instance) are
    skipped instead of raising an IndexError.

    Parameters
    ----------
    file : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    dict
        Mapping from first-column value to second-column value.
    '''
    # newline='' is what the csv module expects; the explicit encoding avoids
    # depending on the platform default.
    with open(file, mode='r', encoding='utf-8', newline='') as handle:
        return {row[0]: row[1] for row in csv.reader(handle) if len(row) >= 2}

In [12]:
# Parsed synonym tables, keyed by file path, so the file is read once instead
# of once per article.
_gb_us_cache = {}


def gb_to_us(words, gb_us=None):
    '''
    Replace British English with American English
    Important since it concerns an international conference
    e.g. both organisation and organization terms are used regularly
    source : https://github.com/7digital/synonym-list/

    Parameters
    ----------
    words : str
        Text to convert; it is split on whitespace.
    gb_us : dict, optional
        Mapping from British to American spelling. When omitted, the table is
        loaded from `GB_US_synonyms` with `get_list` on first use and reused
        on later calls.

    Returns
    -------
    str
        The words joined by single spaces, with every word found in the
        mapping replaced by its American spelling.
    '''
    if gb_us is None:
        if GB_US_synonyms not in _gb_us_cache:
            _gb_us_cache[GB_US_synonyms] = get_list(GB_US_synonyms)
        gb_us = _gb_us_cache[GB_US_synonyms]
    return ' '.join(gb_us.get(w, w) for w in words.split())

In [13]:
# One or more punctuation characters, together with any spaces that follow them.
_PUNCTUATION_RUN = re.compile(r"[(){}\"'’,.;@#?!&%$/\\]+\ *")


def punctuation_to_space(tweet):
    '''Replace every run of punctuation (and the spaces after it) with a single space.'''
    return _PUNCTUATION_RUN.sub(" ", tweet)

In [14]:
# Clean every article and store the lemmas of its informative tokens,
# keyed by article id.
texts = {}
for _, row in df.iterrows():
    cleaned = fix_text(str(row['title_abstract_keys'])).lower()
    cleaned = gb_to_us(punctuation_to_space(cleaned))
    cleaned = " ".join(cleaned.split())  # collapse repeated spaces and line breaks
    if len(cleaned.split()) < 30:
        # Too short: only records with at least 30 words are kept.
        continue
    texts[row.id] = [token.lemma_ for token in nlp(cleaned) if not is_noise(token)]

In [15]:
def insert_in_dataframe(idd):
    '''Return the cleaned tokens stored in `texts` for article id `idd`, or False when there are none.'''
    if idd in texts:
        return texts[idd]
    return False

In [16]:
# Attach the cleaned token list to each article; articles without one get False.
article_ids = df['id']
df['title_abstract_keys_clean'] = article_ids.apply(insert_in_dataframe)
len(df)

14495

In [17]:
# Keep only the articles that received a token list (False marks the skipped ones).
df = df.loc[df['title_abstract_keys_clean'] != False]
len(df)

13415

In [18]:
# Preview the first two rows, now including the combined and cleaned text columns.
df.head(2)

Unnamed: 0,id,review,year,title,keywords,abstract,title_abstract_keys,title_abstract_keys_clean
1,4405,American Economic Review,2019,A Macroeconomic Model of Price Swings in the H...,Credit; Finance; Financial Market; Housing; Ma...,This paper shows that a macro model with segme...,A Macroeconomic Model of Price Swings in the H...,"[model, price, swing, housing, market, model, ..."
3,4461,American Economic Review,2019,A Spatial Knowledge Economy,Cities; Skill; Spatial,Leading empiricists and theorists of cities ha...,A Spatial Knowledge Economy Leading empiricist...,"[knowledge, economy, empiricist, theorist, cit..."


In [19]:
# Token lists of the remaining articles, in row order.
texts = df['title_abstract_keys_clean'].tolist()

In [20]:
# Learn frequently co-occurring word pairs (bigrams) from the token lists.
bigram = gensim.models.Phrases(sentences=texts)

In [21]:
# Merge the detected bigrams into single tokens (e.g. 'percentage_point').
texts = [bigram[tokens] for tokens in texts]
texts[30:35]

[['effect',
  'policy',
  'change',
  'area',
  'eligibility',
  'criterion',
  'program',
  'job',
  'investment',
  'subsidy',
  'rule',
  'area',
  'subsidy',
  'variable',
  'area',
  'eligibility',
  'parameter',
  'rule',
  'area',
  'subsidy',
  'significantly',
  'job',
  'unemployment',
  'percentage_point',
  'increase',
  'investment',
  'subsidy',
  'increase',
  'employment',
  'solely',
  'firm',
  'company',
  'subsidy',
  'activity',
  'effect',
  'investment',
  'employment',
  'firm',
  'factor_productivity',
  'employment',
  'firm',
  'firm',
  'policy',
  'investment',
  'subsidy',
  'unemployment'],
 ['strategy',
  'choice',
  'infinitely_prisoner',
  'dilemma',
  'design',
  'reliably',
  'subject',
  'strategy',
  'infinitely_prisoner',
  'dilemma',
  'experiment',
  'monitoring',
  'strategy',
  'majority',
  'strategy',
  'tat',
  'addition',
  'strategy',
  'systematically',
  'parameter',
  'game',
  'finally',
  'strategy',
  'ability',
  'strategy',
  'met

In [22]:
# Store the bigram-merged token lists back in the DataFrame.
# `texts` is in row order, so it is aligned with the frame through df.index.
# The previous loop used the position from enumerate() as a row label in a
# chained assignment: after filtering, the labels have gaps, so token lists
# ended up on the wrong rows or on a temporary copy.
assert len(texts) == len(df), "texts and df must describe the same articles"
df = df.assign(
    title_abstract_keys_clean=pd.Series(texts, index=df.index, dtype=object)
)

In [23]:
# Preview the first five rows before saving.
df.head(5)

Unnamed: 0,id,review,year,title,keywords,abstract,title_abstract_keys,title_abstract_keys_clean
1,4405,American Economic Review,2019,A Macroeconomic Model of Price Swings in the H...,Credit; Finance; Financial Market; Housing; Ma...,This paper shows that a macro model with segme...,A Macroeconomic Model of Price Swings in the H...,"[model, price, swing, housing, market, model, ..."
3,4461,American Economic Review,2019,A Spatial Knowledge Economy,Cities; Skill; Spatial,Leading empiricists and theorists of cities ha...,A Spatial Knowledge Economy Leading empiricist...,"[knowledge, economy, empiricist, theorist, cit..."
4,4428,American Economic Review,2019,Alcohol and Self-Control: A Field Experiment i...,Earnings; Experiment; Experiments; Field Exper...,This paper studies alcohol consumption among l...,Alcohol and Self-Control: A Field Experiment i...,"[alcohol, self, control, field, experiment, in..."
7,4440,American Economic Review,2019,Auctions with Limited Commitment,Auction; Equilibrium,We study the role of limited commitment in a s...,Auctions with Limited Commitment We study the ...,"[auction, commitment, role, commitment, auctio..."
10,4562,American Economic Review,2019,Bayesian Identification: A Theory for State-De...,Belief; Hiring; Preference; Revealed Preferenc...,We provide a revealed preference methodology f...,Bayesian Identification: A Theory for State-De...,"[bayesian, identification, theory, state, util..."


In [24]:
# Write the frame, including the cleaned token column, to the output pickle.
df.to_pickle(ARTICES_SELECT_CLEAN)