# Import and clean metadata and ngrams
This notebook pulls the metadata as prepared in notebook *1_import_r.ipynb*, removes unwanted items from the corpus, and then pulls in the ngrams for the articles, matching on metadata ids. The notebook also recreates a bag of words from the ngram counts to enable the use of scikit-learn's `CountVectorizer` in notebooks 3 and 4. 
### Import required libraries 

In [None]:
import pandas as pd
import glob 
import re
import numpy as np
import csv

### Load metadata from R notebook

In [None]:
# Read the metadata CSV produced by the R notebook. With na_filter=False pandas
# does not convert empty cells to NaN, so they stay as empty strings.
df_R = pd.read_csv(
    'part1_journal_article_jst_get_article-1.csv',
    na_filter=False,
    encoding='utf-8',
)
# Preview the first rows (shown as the cell output).
df_R.head()

### Initial article count
Make sure the number of ngram files on disk matches the number of metadata rows.

In [None]:
# Count the metadata rows and the ngram files on disk, then require the two
# totals to agree before any cleaning happens.
starting_article_count = len(df_R)
filesystem_article_count = len(glob.glob("ngrams/*.txt"))
assert filesystem_article_count == starting_article_count

### Remove non-article items from corpus
Drop all rows whose article_title is exactly "Front Matter", "Back Matter", or "Volume Information", as well as all rows whose article_title starts with "The Cover" (e.g. "The Cover Design") or "Cover Design".

In [None]:
def row_count_for_column_value(df_R, column, value):
    """Return the number of rows in ``df_R`` whose ``column`` equals ``value``."""
    matches = df_R[column] == value
    return len(df_R.loc[matches])

def row_count_for_column_value_regex(df_R, column, regex):
    """Return the number of rows in ``df_R`` whose ``column`` matches ``regex``.

    Matching uses ``Series.str.contains``, so the pattern may match anywhere
    in the value unless it is anchored. Missing values count as non-matches.
    """
    column_values = df_R[column]
    matches = column_values.str.contains(regex, na=False, regex=True)
    return len(df_R.loc[matches])

# Drop rows whose title is exactly one of the known non-article labels.
for article_title in ['Front Matter', 'Back Matter', 'Volume Information']:
    if row_count_for_column_value(df_R, 'article_title', article_title) == 0:
        print(f'no articles with title "{article_title}"')
        continue
    df_R = df_R[df_R['article_title'] != article_title]
    # Confirm that nothing with this exact title is left.
    assert row_count_for_column_value(df_R, 'article_title', article_title) == 0

# Drop rows whose title starts with a cover-related phrase. Raw strings keep
# the `\s` regex escape from being read as an (invalid) Python string escape.
for article_title_regex in [r'^The\sCover', r'^Cover\sDesign']:
    if row_count_for_column_value_regex(df_R, 'article_title', article_title_regex) == 0:
        # Report the pattern being tested here, not the loop variable left
        # over from the exact-title loop above.
        print(f'no articles with title matching "{article_title_regex}"')
        continue
    df_R = df_R[~df_R['article_title'].str.contains(article_title_regex, regex=True, na=False)]
    # Confirm that nothing matching this pattern is left.
    assert row_count_for_column_value_regex(df_R, 'article_title', article_title_regex) == 0

print("Current article count:", len(df_R))

### Fix journal_pub_id values 
Copy the value 'libraryq' into the journal_jcode column for every row where it appears in journal_pub_id - these values were incorrectly mapped by the R import. You can manually check the values with:
```df_R['journal_pub_id'].unique()```

In [None]:
# Record both counts before the change so the result can be verified afterwards.
starting_journal_jcode_count = row_count_for_column_value(df_R, 'journal_jcode', 'libraryq')
journal_pub_id_count = row_count_for_column_value(df_R, 'journal_pub_id', 'libraryq')

# Set journal_jcode to 'libraryq' on every row where journal_pub_id holds it.
df_R.loc[df_R['journal_pub_id'] == 'libraryq', 'journal_jcode'] = 'libraryq'

# The journal_jcode count must have grown by exactly the number of updated rows.
assert journal_pub_id_count + starting_journal_jcode_count == row_count_for_column_value(df_R, 'journal_jcode', 'libraryq')

### Create id column to match against ngram ids
You can check the number of matches via:

```df_R['file_name'].str.contains(r"^journal-article-", regex=True, na=False).sum() # 8099```

In [None]:
# Sanity check: at least one file_name starts with the prefix before stripping.
assert 0 < row_count_for_column_value_regex(df_R, 'file_name', r'^journal-article-')

# Remove the leading "journal-article-" so file_name can be matched against ngram ids.
stripped_file_names = df_R['file_name'].str.replace(r'^journal-article-', '', regex=True)
df_R['file_name'] = stripped_file_names

# No file_name may still start with the prefix.
assert 0 == row_count_for_column_value_regex(df_R, 'file_name', r'^journal-article-')

### Export to csv for notebook 3 combination with ngrams

In [None]:
# Write the cleaned metadata with a header row and without the DataFrame index.
df_R.to_csv(
    'output/df-R-cleaned.csv',
    header=True,
    index=False,
    encoding='utf-8',
)

### Import required libraries to reconstitute bag of words

In [None]:
# NLTK supplies the Snowball stemmer used to build the bag of words below.
import nltk
from nltk.stem import SnowballStemmer
# Fetch the NLTK stopwords corpus; presumably required because the stemmer is
# created with ignore_stopwords=True in the next code cell - confirm.
nltk.download('stopwords')
# The scikit-learn imports are left commented out in this notebook.
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.decomposition import LatentDirichletAllocation

### Create bag of words from ngrams
Recreate texts with all stemmed words (out of order) from ngram word counts to be able to use CountVectorizer for LDA in notebooks 3 and 4. Removes digits. 

In [None]:
import os  # cell-local import: used only to separate the file name from its directory

# Collects one (n_id, body) tuple per ngram file.
n_list = []

stemmer = SnowballStemmer("english", ignore_stopwords=True)

ngrams = glob.iglob("ngrams/*.txt")

# Prefix that is removed from each file name to obtain the article id.
id_prefix = 'journal-article-'

for ngram in ngrams:
    word_count_map = {}
    # Create n_id from the file name: drop the directory, then the trailing
    # 11 characters (presumably the "-ngram1.txt" suffix - confirm), then the
    # "journal-article-" prefix. str.strip() is not used because it removes a
    # *set of characters* from both ends and could eat into the id itself.
    n_id = os.path.basename(ngram)[:-11]
    if n_id.startswith(id_prefix):
        n_id = n_id[len(id_prefix):]

    # Explicit utf-8 keeps the read independent of the platform default
    # encoding; newline='' is what the csv module expects for file objects.
    with open(ngram, encoding='utf-8', newline='') as csv_ng:
        csvReader = csv.reader(csv_ng, delimiter='\t')
        for row in csvReader:
            if not row:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            word, count = row[0], row[1]
            word_count_map[word] = word_count_map.get(word, 0) + int(count)

    #recreate bag of words from word counts to be able to use countvectorizer
    text = []
    for word, count in word_count_map.items():
        transformed_word = stemmer.stem(word) #comment this out, and uncomment the following to proceed w/o stemming
        #transformed_word = word
        # Repeat each word as many times as it was counted.
        text.extend([transformed_word] * count)
    text = ' '.join(text) #convert list to string
    text = ''.join(ch for ch in text if not ch.isdigit()) #remove digits from string

    n_list.append((n_id, text))

# Build the dataframe once, after all files have been processed. Passing the
# column names here also works when no ngram files were found.
df_n = pd.DataFrame(n_list, columns=['n_id', 'body'])

### Sort rows by ngram id and save to csv
CSV will be imported in notebooks 3 and 4 and matched against metadata.

In [None]:
# sort_values returns a new DataFrame, so the result must be assigned back;
# otherwise the CSV is written in the original, unsorted order.
# reset_index keeps the exported index column sequential (0..n-1), as it was
# before sorting was applied.
df_n = df_n.sort_values(by='n_id').reset_index(drop=True)
df_n.to_csv('output/df-n.csv', encoding='utf-8', index=True, header=True)

#### Note
At this point we are still working with ngrams for all 8,808 articles in the corpus (i.e. before the metadata cleaning is applied). 
Articles that are not included in df-R-cleaned.csv (the cleaned metadata) are ignored in the next step.

In [None]:
# The bag-of-words table must have one row per ngram file counted at the start.
assert filesystem_article_count == len(df_n)

### Go to notebook 3 >>