In [1]:
#libraries for importing and parsing jstor dfr files
import pandas as pd
import glob 
import re
import numpy as np
import csv

In [2]:
#read the article metadata that was produced in R
#na_filter=False keeps empty cells as '' rather than turning them into NaN
metadata_csv = 'part1_journal_article_jst_get_article-1.csv'
df_R = pd.read_csv(metadata_csv, na_filter=False, encoding='utf-8')
df_R.head()

Unnamed: 0,file_name,journal_doi,journal_jcode,journal_pub_id,journal_title,article_doi,article_pub_id,article_jcode,article_type,article_title,volume,issue,language,pub_day,pub_month,pub_year,first_page,last_page,page_range
0,journal-article-10.2307_40039664,,libraryq,,"The Library Quarterly: Information, Community,...",,,40039664,book-review,,1,3,eng,1,7,1931,347,349,
1,journal-article-10.2307_4304427,,libraryq,,"The Library Quarterly: Information, Community,...",,,4304427,book-review,,25,2,eng,1,4,1955,193,194,193-194
2,journal-article-10.2307_4305466,,libraryq,,"The Library Quarterly: Information, Community,...",,,4305466,misc,Books Received,34,2,eng,1,4,1964,225,227,225-227
3,journal-article-10.2307_4304137,,libraryq,,"The Library Quarterly: Information, Community,...",,,4304137,research-article,"Timothy Cole and the ""Century""",22,3,eng,1,7,1952,232,239,232-239
4,journal-article-10.2307_4303738,,libraryq,,"The Library Quarterly: Information, Community,...",,,4303738,book-review,,19,1,eng,1,1,1949,59,61,59-61


In [3]:
#number of articles (rows) in the metadata file
df_R.shape[0]

8808

In [8]:
#remove "articles" of front and back matter from corpus
#drop all rows whose article_title is exactly "Front Matter", "Back Matter" or "Volume Information",
#or starts with "The Cover" (which also covers "The Cover Design") or "Cover Design"
#raw string: \s is a regex token, not a Python escape (a non-raw "\s" triggers an invalid-escape warning)
#(?:...) is non-capturing so str.contains does not warn about match groups
boilerplate_title = r"^(?:Front\sMatter$|Back\sMatter$|The\sCover|Cover\sDesign|Volume\sInformation$)"
df_R = df_R[~df_R['article_title'].str.contains(boilerplate_title, regex=True, na=False)]

In [9]:
#article count once the front/back matter rows are gone
df_R.shape[0] #from 8808 articles down to 7773

7773

In [10]:
#rows whose journal_pub_id is 'libraryq' get journal_jcode set to 'libraryq' - these were incorrectly mapped via the R import
#journal_pub_id itself is left unchanged (inspect it with df_R['journal_pub_id'].unique())
pub_id_is_libraryq = df_R['journal_pub_id'].eq('libraryq')
df_R.loc[pub_id_is_libraryq, 'journal_jcode'] = 'libraryq'

In [11]:
#parent journal id: copy of journal_jcode with the former title code "LQ" rewritten to 'libraryq'
df_R['jid_combined'] = df_R['journal_jcode'].str.replace(r"^LQ$", 'libraryq', regex=True)

In [12]:
#drop the leading "journal-article-" from file_name so it can be matched against the ngram ids
prefix_pattern = r"^journal-article-"
df_R['file_name'] = df_R['file_name'].str.replace(prefix_pattern, "", regex=True)

In [13]:
#write the cleaned metadata to csv; notebook 2 combines it with the ngrams
cleaned_csv = 'output/df-R-cleaned.csv'
df_R.to_csv(cleaned_csv, index=False, header=True, encoding='utf-8')

In [14]:
import nltk
from nltk.stem import SnowballStemmer
#nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [15]:
#reconstitute a bag of words per article from the word-count files so that CountVectorizer can be used on it
USE_STEMMING = True #set to False to proceed w/o stemming

stemmer=SnowballStemmer("english", ignore_stopwords=True)


def _ngram_article_id(path):
    """Return the article id encoded in an ngram file path.

    Keeps only the file name, then removes the "journal-article-" prefix and a
    "-ngram<digits>.txt" suffix. Prefix and suffix are removed as whole strings;
    str.strip() would instead strip any of the given *characters* from both ends.

    NOTE(review): assumes files are named "journal-article-<id>-ngram1.txt"
    (the earlier code sliced off an 11-character suffix) - confirm against the data.
    """
    file_name = re.split(r"[/\\]", path)[-1] #works for both "/" and "\" separators
    file_name = re.sub(r"^journal-article-", "", file_name)
    return re.sub(r"-ngram\d+\.txt$", "", file_name)


def _expand_counts(rows, normalize):
    """Turn (word, count, ...) rows into a list holding each word `count` times.

    rows: iterable of sequences whose first item is the word and second its count.
    normalize: callable applied once per word (e.g. a stemmer) before repeating it.
    Rows with fewer than two fields (such as blank lines) are skipped.
    Raises ValueError if a count cannot be converted to int.
    """
    tokens = []
    for row in rows:
        if len(row) < 2:
            continue
        token = normalize(row[0]) #normalise once, then repeat the result
        tokens.extend([token] * int(row[1])) #exactly `count` copies of the word
    return tokens


normalize = stemmer.stem if USE_STEMMING else str

n_list = []
ngrams = sorted(glob.glob("ngrams/*.txt")) #sorted so the row order is the same on every run
#cycle through each file
for ngram in ngrams:
    n_id = _ngram_article_id(ngram)
    #explicit encoding so the result does not depend on the platform default; newline='' as the csv module recommends
    with open(ngram, encoding='utf-8', newline='') as csv_ng:
        csvReader = csv.reader(csv_ng, delimiter='\t')
        text = ' '.join(_expand_counts(csvReader, normalize)) #convert list to string
    text = ''.join(ch for ch in text if not ch.isdigit()) #remove digits from string
    n_list.append((n_id, text))

#build the dataframe once, after all files are read; passing columns here also works when no files were found
df_n = pd.DataFrame(n_list, columns=['n_id', 'body'])

In [16]:
#sort by id and save to csv
#sort_values returns a new frame, so the result is assigned back for the sort to take effect
#reset_index keeps the written index column running 0..N-1 in the sorted order
df_n = df_n.sort_values(by='n_id').reset_index(drop=True)
df_n.to_csv('output/df-n.csv', encoding='utf-8', index=True, header=True)

In [19]:
#note: a bag of words was built for all 8,808 articles in the corpus here.
#those not listed in df-R-cleaned.csv (the metadata) are ignored in the next step
df_n.shape[0]

8808

In [20]:
## STOP HERE and GO TO part 3 ##