In [20]:
#libraries for importing and parsing jstor dfr files
import pandas as pd
import glob 
import re
import numpy as np
import csv

In [21]:
#load the metadata from R
df_R = pd.read_csv('part1_journal_article_jst_get_article-1.csv', encoding='utf-8', na_filter=False)
df_R.head()

Unnamed: 0,file_name,journal_doi,journal_jcode,journal_pub_id,journal_title,article_doi,article_pub_id,article_jcode,article_type,article_title,volume,issue,language,pub_day,pub_month,pub_year,first_page,last_page,page_range
0,journal-article-10.2307_40039664,,libraryq,,"The Library Quarterly: Information, Community,...",,,40039664,book-review,,1,3,eng,1,7,1931,347,349,
1,journal-article-10.2307_4304427,,libraryq,,"The Library Quarterly: Information, Community,...",,,4304427,book-review,,25,2,eng,1,4,1955,193,194,193-194
2,journal-article-10.2307_4305466,,libraryq,,"The Library Quarterly: Information, Community,...",,,4305466,misc,Books Received,34,2,eng,1,4,1964,225,227,225-227
3,journal-article-10.2307_4304137,,libraryq,,"The Library Quarterly: Information, Community,...",,,4304137,research-article,"Timothy Cole and the ""Century""",22,3,eng,1,7,1952,232,239,232-239
4,journal-article-10.2307_4303738,,libraryq,,"The Library Quarterly: Information, Community,...",,,4303738,book-review,,19,1,eng,1,1,1949,59,61,59-61


In [22]:
#how many articles are in the metadata file?
#ngrams = glob.iglob("ngrams/*.txt") # Fix this so it's a list and not a generator!
starting_article_count = len(df_R)
assert starting_article_count == len(ngrams)

TypeError: object of type 'generator' has no len()

In [4]:
#remove "articles" of front and back matter from corpus
#drop all rows with article_title of "Front Matter", "The Cover Design", "The Cover", "Back Matter", "Volume Information"

def article_count_for_title(df_R, article_title):
    return len(df_R[df_R['article_title']==article_title])

# Remove 'The Cover', since there are no articles with that title.
for article_title in ['Front Matter','Back Matter','The Cover','Cover Design','Volume Information',]:
    if article_count_for_title(df_R, article_title) == 0:
        print(f'no articles with title "{article_title}"')
        continue
    df_R = df_R[~(df_R['article_title']==article_title)]
    assert article_count_for_title(df_R, article_title) == 0
# Add output for number of articles removed?

no articles with title "The Cover"


In [5]:
#how many articles remain after removing articles above? Probably unnecessary, given the tests above.
len(df_R) #from 8808 articles down to 7773
assert len(df_R) < starting_article_count
assert len(df_R) > 0 # Better test here? Is there some way to be more precise about our expectations?

In [6]:
#move journal_pub_id values for 'libraryq' into journal_jcode column - these were incorrectly mapped via the R import
#df_R['journal_pub_id'].unique()
def row_count_for_column_value(df_R, column, value):
    return len(df_R[df_R[column] == value])

journal_pub_id_count = row_count_for_column_value(df_R, 'journal_pub_id', 'libraryq') # 608
starting_journal_jcode_count = row_count_for_column_value(df_R, 'journal_jcode', 'libraryq') # 7477
df_R.loc[df_R['journal_pub_id']=='libraryq', 'journal_jcode'] = 'libraryq'
assert row_count_for_column_value(df_R, 'journal_jcode', 'libraryq') == starting_journal_jcode_count + journal_pub_id_count

In [7]:
#create parent journal id by replacing former title codes
df_R['jid_combined'] = df_R['journal_jcode']
starting_libraryq_count = row_count_for_column_value(df_R, 'jid_combined', 'libraryq') # 8085
starting_LQ_count = row_count_for_column_value(df_R, 'jid_combined', 'LQ') # 0 Is this what we expect? May be a holdover from a larger corpus, so can probably remove.
# print(starting_LQ_count)
df_R['jid_combined'] = df_R['jid_combined'].str.replace('LQ', 'libraryq', regex=False)
assert row_count_for_column_value(df_R, 'jid_combined', 'libraryq') == starting_libraryq_count + starting_LQ_count

In [8]:
#create id column to match against ngram ids
#len(df_R['file_name'].str.contains(r"^journal-article-", regex=True, na=False)) # 8099
def row_count_for_column_value_regex(df_R, column, regex):
    return len(df_R[df_R[column].str.contains(regex, regex=True, na=False)])

assert row_count_for_column_value_regex(df_R, 'file_name', r'^journal-article-') > 0
df_R['file_name'] = df_R['file_name'].str.replace(r'^journal-article-', '', regex=True)
assert row_count_for_column_value_regex(df_R, 'file_name', r'^journal-article-') == 0

In [9]:
#export to csv for notebook 2 combination with ngrams
df_R.to_csv('output/df-R-cleaned.csv', encoding='utf-8', index=False, header=True)

In [10]:
import nltk
from nltk.stem import SnowballStemmer
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/naughton/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
#weird/inefficient idea here: I'm reconstituting bag of words from word counts to be able to use count vectorizer
n_list = []

stemmer = SnowballStemmer("english", ignore_stopwords=True)

ngrams = glob.iglob("ngrams/*.txt")
#cycle through each file

for ngram in ngrams:
    word_count_map = {}
    #create n_id from file name
    n_id = ngram[:-11].strip('ngrams/journal-article-') 
    with open(ngram) as csv_ng:
        csvReader = csv.reader(csv_ng, delimiter='\t')
        for row in csvReader:
            word, count = row[0], row[1]
            if word not in word_count_map:
                word_count_map[word] = 0
            word_count_map[word] = word_count_map[word] + int(count)

    #recreate bag of words from word counts to be able to use wordvectorizer
    text = []
    for word, count in word_count_map.items():
        transformed_word = stemmer.stem(word) #comment this out, and uncomment the following to proceed w/o stemming
        #transformed_word = word
        word_list = [transformed_word] * count
        text.extend(word_list)
    text = ' '.join(text) #convert list to string
    text = ''.join([i for i in text if not i.isdigit()]) #remove digits from string
        
    n_tup = (n_id, text)
    n_list.append(n_tup) 
    #add list to dataframe at the end of each file
df_n = pd.DataFrame(n_list)
df_n.columns = ['n_id', 'body']

In [17]:
#sort by id and save to csv
df_n.sort_values(by='n_id')
df_n.to_csv('output/df-n.csv', encoding='utf-8', index=True, header=True)  

In [18]:
#note we created word vectors for all 8,808 articles in the corpus here. 
#we'll ignore those that aren't included in the df-R-cleaned.csv (metadata) list though in the next step
len(df_n)

8808

In [20]:
## STOP HERE and GO TO part 3 ##