In [None]:
import itertools
import os
import pickle
import re

import numpy as np
import pandas as pd
from nltk import pos_tag, pos_tag_sents, sent_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer

# Single shared Treebank-style word tokenizer; the "Get Tokens" step calls
# its tokenize() on every sentence.
tokenizer = TreebankWordTokenizer()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Read-in

In [None]:
def read_pickles(path):
    """Load every pickle file in a directory and concatenate their contents.

    Each entry directly inside ``path`` is opened and unpickled, and the items
    it contains are appended, in order, to one flat list.

    Parameters
    ----------
    path : str
        Directory containing the pickle files. Every entry in the directory
        is opened, so it should hold nothing but pickles, each storing a list
        (or other iterable) of documents.

    Returns
    -------
    list
        All items from all files; empty if the directory has no entries.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code, so only point this at files
    you produced yourself.
    """
    doc_list = []

    # Sort so the document order does not depend on the arbitrary order in
    # which os.listdir returns directory entries.
    for file_name in sorted(os.listdir(path)):
        # os.path.join picks the right separator for the host OS; a
        # hard-coded backslash only resolves on Windows.
        with open(os.path.join(path, file_name), 'rb') as f:
            doc_list.extend(pickle.load(f))

    return doc_list

In [None]:
# Directory holding the scraped pickle files. Set the SCRAPED_DATA_DIR
# environment variable to point at another location; the original
# machine-specific path is kept as the fallback so existing runs are unchanged.
doc_path = os.environ.get(
    'SCRAPED_DATA_DIR',
    r'C:\Users\Duncan\Desktop\School\text mining\FINAL PROJECT\scraped_data',
)

In [None]:
# Load every pickled document under doc_path into one flat list.
docs = read_pickles(doc_path)

In [None]:
# Keep only the documents that contain a '==== Body' section marker.
docs = [
    text
    for text in docs
    if '==== Body' in text
]

# Get Body

In [None]:
# Pattern for the body section: everything from '==== Body' through a later
# '===='. The '.*' is greedy, so the match runs to the LAST '====' it can
# reach, and '.' does not cross real newline characters.
p = re.compile(r'==== Body(.*)====')

In [None]:
# Collect the matched text for every document the body pattern hits;
# documents without a match are left out.
# NOTE(review): group(0) is the whole match, so each entry still carries the
# '==== Body' prefix and the trailing '====' -- confirm that is intended,
# since the pattern's capture group goes unused.
body_docs = [
    match.group(0)
    for match in map(p.search, docs)
    if match is not None
]

# Get Paragraphs

In [None]:
# Break each body apart on the two-character sequence backslash + 'n'
# (the docs appear to store line breaks escaped rather than as real
# newlines -- TODO confirm), then flatten and drop empty pieces.
paragraphs = [body.split('\\n') for body in body_docs]
flat_paragraphs = [
    para
    for pieces in paragraphs
    for para in pieces
    if len(para) > 0
]

In [None]:
# Keep only paragraphs whose final character is end-of-sentence punctuation;
# anything else is probably a header or a caption.
eos = ['.', '!', '"', '?']
flat_paragraphs = [para for para in flat_paragraphs if para[-1] in eos]

In [None]:
# Drop the period from "et al.", which the sentence tokenizer doesn't know
# how to handle.
# A literal str.replace is used on purpose: as a regex, the unescaped '.' in
# 'et al.' matches any character, so ordinary text such as "get all" was
# being rewritten to "get al".
flat_paragraphs = [para.replace('et al.', 'et al') for para in flat_paragraphs]

# Get Sentences

In [None]:
# Sentence-split every paragraph, then flatten into one list of sentences.
sentences = list(map(sent_tokenize, flat_paragraphs))
flat_sentences = [sentence for group in sentences for sentence in group]

# Get Tokens

In [None]:
# Tokenize each sentence, producing one token list per sentence.
tokens = list(map(tokenizer.tokenize, flat_sentences))

# Make DF

In [None]:
# One row per sentence: the sentence text alongside its token list.
d = dict(content=flat_sentences, tokens=tokens)
df = pd.DataFrame(d)

In [None]:
# Preview the first rows of the sentence table.
df.head()

In [None]:
# Record how many tokens each sentence has.
sentence_lengths = list(map(len, df['tokens']))
df['sentence_length'] = sentence_lengths

In [None]:
# Distribution of sentence lengths, with the x-axis clipped to 0-200 tokens.
# NOTE(review): distplot is deprecated in newer seaborn releases -- confirm
# the installed version still provides it.
sns.distplot(df['sentence_length'], bins=200)
plt.xlim(0, 200)
plt.show()

In [None]:
# Keep sentences with more than 2 and fewer than 50 tokens, then renumber
# the rows from 0 on an independent copy of the data.
filtered_df = (
    df[df['sentence_length'].gt(2) & df['sentence_length'].lt(50)]
    .copy()
    .reset_index(drop=True)
)

In [None]:
# Sentence-length distribution after the length filter (full x-range shown).
sns.distplot(filtered_df['sentence_length'])
plt.show()

In [None]:
# Number of sentences left after the length filter.
len(filtered_df)

# POS Tagging

In [None]:
# remove [] tags from tokenizer to keep citations and references as unified tokens
#
# Replaces this tokenizer instance's PARENS_BRACKETS rule with a pattern that
# covers ( ) { } < > but not [ ]; the substitution pads each match with spaces.
# NOTE(review): the token lists in df / filtered_df were produced in the
# "Get Tokens" step, before this override runs, so they are unaffected by it
# unless the sentences are re-tokenized afterwards -- confirm intended order.

tokenizer.PARENS_BRACKETS = (re.compile(r'[\(\)\{\}\<\>]', re.UNICODE), ' \\g<0> ')

In [None]:
# Tag every token list with part-of-speech labels.
# pos_tag_sents builds the tagger once for the whole batch, whereas calling
# pos_tag per sentence sets it up again on every call; the resulting tags
# are the same.
pos_tags_raw = pos_tag_sents(filtered_df['tokens'].tolist())
filtered_df['raw_pos_tags'] = pos_tags_raw

In [None]:
# Preview the filtered table, which now includes the raw_pos_tags column.
filtered_df.head()