In [None]:
#libraries for importing and parsing jstor dfr files
import pandas as pd
import glob 
import re
import numpy as np
import csv

In [None]:
# Read the article metadata produced by the R step.
# na_filter=False turns off pandas' missing-value detection, so empty cells
# are not converted to NaN.
df_R = pd.read_csv(
    'part1_journal_article_jst_get_article-1.csv',
    encoding='utf-8',
    na_filter=False,
)
df_R.head()

In [None]:
# Sanity check: the metadata must contain exactly as many rows as there are
# n-gram text files on disk.
starting_article_count = len(df_R)
filesystem_article_count = len(glob.glob("ngrams/*.txt"))
assert filesystem_article_count == starting_article_count

In [None]:
#remove "articles" of front and back matter from corpus
#drop all rows whose article_title is exactly "Front Matter", "Back Matter", or "Volume Information",
#or whose article_title starts with "The Cover" or "Cover Design"

def row_count_for_column_value(df_R, column, value):
    """Return how many rows of ``df_R`` have ``column`` exactly equal to ``value``."""
    matches = df_R[column] == value
    return int(matches.sum())

def row_count_for_column_value_regex(df_R, column, regex):
    """Return how many rows of ``df_R`` have a ``column`` value containing a match for ``regex``.

    Missing values are treated as non-matching (``na=False``).
    """
    matches = df_R[column].str.contains(regex, regex=True, na=False)
    return int(matches.sum())

# Remove rows whose title is exactly one of the known non-article titles,
# verifying after each removal that none are left.
for article_title in ['Front Matter', 'Back Matter', 'Volume Information']:
    if row_count_for_column_value(df_R, 'article_title', article_title) != 0:
        df_R = df_R[df_R['article_title'] != article_title]
        assert row_count_for_column_value(df_R, 'article_title', article_title) == 0
    else:
        print(f'no articles with title "{article_title}"')
    
# Remove rows whose title starts with one of the cover-related prefixes,
# verifying after each removal that no matching rows are left.
# The patterns are raw strings so that \s reaches the regex engine intact; in a
# plain string literal "\s" is an invalid escape sequence that newer Python
# versions warn about.
for article_title_regex in [r'^The\sCover', r'^Cover\sDesign']:
    if row_count_for_column_value_regex(df_R, 'article_title', article_title_regex) == 0:
        # Report the pattern that matched nothing.
        print(f'no articles with title matching "{article_title_regex}"')
        continue
    df_R = df_R[~(df_R['article_title'].str.contains(article_title_regex, regex=True, na=False))]
    assert row_count_for_column_value_regex(df_R, 'article_title', article_title_regex) == 0
    
# Number of rows left after the front/back-matter cleanup.
# TODO: also report how many articles were removed?
df_R.shape[0]

In [None]:
# The R import mapped 'libraryq' into journal_pub_id for some rows instead of
# journal_jcode; copy it across so those rows carry the journal code too.
starting_journal_jcode_count = row_count_for_column_value(df_R, 'journal_jcode', 'libraryq')  # previously noted: 7477
journal_pub_id_count = row_count_for_column_value(df_R, 'journal_pub_id', 'libraryq')  # previously noted: 608
df_R.loc[df_R['journal_pub_id'].eq('libraryq'), 'journal_jcode'] = 'libraryq'
# The jcode count must have grown by exactly the number of re-mapped rows.
assert starting_journal_jcode_count + journal_pub_id_count == row_count_for_column_value(df_R, 'journal_jcode', 'libraryq')

In [None]:
# Build a parent-journal id column: start from journal_jcode, then rewrite the
# former title code 'LQ' to 'libraryq'.
df_R['jid_combined'] = df_R['journal_jcode']
# Previously noted as 0 -- possibly a holdover from a larger corpus. The cell is
# kept because other code uses the jid_combined column.
starting_LQ_count = row_count_for_column_value(df_R, 'jid_combined', 'LQ')
starting_libraryq_count = row_count_for_column_value(df_R, 'jid_combined', 'libraryq')  # previously noted: 8085
# Literal (regex=False) substring replacement of 'LQ' within each value.
df_R['jid_combined'] = df_R['jid_combined'].str.replace('LQ', 'libraryq', regex=False)
assert starting_libraryq_count + starting_LQ_count == row_count_for_column_value(df_R, 'jid_combined', 'libraryq')

In [None]:
# Turn file_name into the bare id used to match against the n-gram ids by
# removing the leading 'journal-article-'.
# Before: at least one row must carry the prefix.
assert 0 < row_count_for_column_value_regex(df_R, 'file_name', r'^journal-article-')
df_R['file_name'] = df_R['file_name'].str.replace(
    r'^journal-article-',
    '',
    regex=True,
)
# After: no row may still carry it.
assert 0 == row_count_for_column_value_regex(df_R, 'file_name', r'^journal-article-')

In [None]:
# Write the cleaned metadata to disk (with a header row, without the index) so
# it can be combined with the n-grams in the next notebook.
df_R.to_csv(
    'output/df-R-cleaned.csv',
    encoding='utf-8',
    index=False,
    header=True,
)

In [None]:
import nltk
from nltk.stem import SnowballStemmer
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Rebuild a "bag of words" text for every n-gram file so it can be fed to
# CountVectorizer: each word is stemmed and then repeated as many times as it
# was counted. (Admittedly inefficient, but it keeps the vectorizer input simple.)
n_list = []

stemmer = SnowballStemmer("english", ignore_stopwords=True)

# Sorted so rows are produced in the same order on every run; glob returns
# paths in arbitrary order.
ngrams = sorted(glob.glob("ngrams/*.txt"))

for ngram in ngrams:
    # Derive n_id from the file name only (either path separator is accepted).
    file_name = re.split(r'[\\/]', ngram)[-1]
    # [:-11] drops the fixed-length file suffix (presumably '-ngram1.txt' -- TODO confirm).
    # The 'journal-article-' prefix is removed as a prefix, not with str.strip():
    # strip() treats its argument as a set of characters and would also eat
    # matching characters from either end of the id itself.
    n_id = re.sub(r'^journal-article-', '', file_name[:-11])

    # Sum the counts per word, in case a word appears on more than one line.
    word_count_map = {}
    # utf-8 matches the encoding used for the CSV files elsewhere in this
    # notebook; newline='' is what the csv module expects for files it reads.
    with open(ngram, encoding='utf-8', newline='') as csv_ng:
        for row in csv.reader(csv_ng, delimiter='\t'):
            if not row:
                continue  # csv.reader yields [] for blank lines
            word, count = row[0], row[1]
            word_count_map[word] = word_count_map.get(word, 0) + int(count)

    #recreate bag of words from word counts to be able to use wordvectorizer
    text = []
    for word, count in word_count_map.items():
        transformed_word = stemmer.stem(word) #comment this out, and uncomment the following to proceed w/o stemming
        #transformed_word = word
        text.extend([transformed_word] * count)
    text = ' '.join(text) #convert list to string
    text = ''.join(ch for ch in text if not ch.isdigit()) #remove digits from string

    n_list.append((n_id, text))

# Passing the column names to the constructor also works when no files were
# found; assigning .columns afterwards fails on an empty frame.
df_n = pd.DataFrame(n_list, columns=['n_id', 'body'])

In [None]:
#sort by id and save to csv
# sort_values returns a new DataFrame instead of sorting in place, so the
# result has to be assigned back for the sort to take effect. The index is
# reset so the index column written to the CSV runs 0..n-1 in sorted order.
df_n = df_n.sort_values(by='n_id').reset_index(drop=True)
df_n.to_csv('output/df-n.csv', encoding='utf-8', index=True, header=True)

In [None]:
# Word vectors were built for every article file in the corpus (8,808 at the
# time of the original note). Articles missing from df-R-cleaned.csv (the
# metadata list) are ignored in the next step.
assert filesystem_article_count == len(df_n)

In [None]:
## STOP HERE and GO TO part 3 ##