In [1]:
# Import packages
import pandas as pd
import numpy as np
import xml.etree.cElementTree as et
import spacy
import re
from nltk import tokenize
# Show full cell contents and every row when a DataFrame is displayed.
# `None` means "no limit"; the legacy value -1 for display.max_colwidth is
# deprecated and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

  pd.set_option('display.max_colwidth', -1)


## 1. Extract Metadata: title, publish date, etc.

In [182]:
def extract_meta(path):
    """Parse one article XML file and collect its metadata into a DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the XML file; handed straight to ``et.parse``.

    Returns
    -------
    pandas.DataFrame
        One column per key of ``meta`` (in that order). Each column lists the
        values found for that field in document order. The columns are
        collected independently, so they may have different numbers of
        values; shorter columns are padded with NaN. Values that share a row
        are therefore not necessarily related to each other.

    Raises
    ------
    IndexError
        If a ``TitleAtt``, ``ISO``, ``ClassTerm``, ``GenSubjTerm`` or
        contributor element lacks the child position that is read from it.

    Notes
    -----
    'DocSection', 'ColumnHeader', 'DocEdition', 'Title Keywords',
    'Classification Terms' and 'Other Terms' are declared but never filled
    by this function, so they are always empty / NaN.
    """
    root = et.parse(path).getroot()

    meta = {'Keyword': [],
            'GOID': [],
            'Title': [],
            'Contributors': [],
            'Contributor FirstName': [],
            'Contributor LastName': [],
            'Numeric Date': [],
            'Geographic Tag': [],
            'Language': [],
            'Source': [],
            'StartPage': [],
            'DocSection': [],
            'ColumnHeader': [],
            'DocEdition': [],
            'Title Keywords': [],
            'Classification Terms': [],
            'Company Terms': [],
            'Subject Terms': [],
            'Other Terms': [],
            }

    # Fields whose value is simply the text of every matching element.
    own_text_fields = (
        ('GOID', 'GOID'),
        ('Geographic', 'Geographic Tag'),
        ('LastName', 'Contributor LastName'),
        ('FirstName', 'Contributor FirstName'),
        ('NumericDate', 'Numeric Date'),
        ('SourceType', 'Source'),
        ('StartPage', 'StartPage'),
    )
    for tag, column in own_text_fields:
        for item in root.iter(tag):
            meta[column].append(item.text)

    # Fields whose value is the text of a child at a fixed position.
    for item in root.iter('TitleAtt'):
        meta['Title'].append(item[0].text)

    for item in root.iter('ISO'):
        # Element.text is None for an empty element; treat that as ''.
        meta['Language'].append((item[1].text or '').strip())

    for item in root.iter('GenSubjTerm'):
        meta['Subject Terms'].append(item[0].text)

    # NOTE(review): ClassTerm values are stored under 'Keyword' while
    # 'Classification Terms' stays empty -- confirm this mapping is intended.
    for item in root.iter('ClassTerm'):
        meta['Keyword'].append(item[1].text)

    # Contributor info: first grandchild's text of every contributor entry.
    for item in root.iter('Contributors'):
        for contributor in item:
            meta['Contributors'].append(contributor[0][0].text)

    # Company terms: [tag, attrib, own text, child tag, child attrib, ...].
    for title in root.iter('CompanyTerm'):
        terms = [title.tag, title.attrib, (title.text or '').strip()]
        for item in title:
            terms.append(item.tag)
            terms.append(item.attrib)
        meta['Company Terms'].append(terms)

    # The lists generally differ in length, which a plain dict-of-lists
    # DataFrame rejects. Wrapping each list in a Series lets pandas align on
    # the positional index and pad the shorter columns with NaN.
    df_meta = pd.DataFrame(
        {column: pd.Series(values, dtype=object) for column, values in meta.items()}
    )
    return df_meta


## 2. Textfield Parsing & Analysis

### 2.1 Text metadata

In [49]:
# Parse the xml by paragraph and sentence
def extract_paragraphs_new(path):
    """Split an article's ``<Text>`` body into numbered paragraphs and sentences.

    The text of the ``<Text>`` element is cut at every literal ``<p>`` marker.
    Each chunk following a marker is one paragraph; anything before the first
    marker is discarded. Newlines and ``</p>``, ``<i>``, ``</i>``, ``<b>``,
    ``</b>`` markup are removed, then the paragraph is split into sentences
    with ``tokenize.sent_tokenize``.

    Parameters
    ----------
    path : str or path-like
        Location of the XML file; handed straight to ``et.parse``.

    Returns
    -------
    pandas.DataFrame
        One row per sentence with columns 'Paragraph' (1-based paragraph
        number), 'Sentence' (1-based position inside the paragraph),
        'Overall Sentence' (1-based running count over the document) and
        'Text'. The frame has no rows when the document has no ``<Text>``
        content or no ``<p>`` markers.

    Notes
    -----
    The final paragraph (the chunk after the last ``<p>``) is included.
    Earlier revisions iterated over ``range(len(del_list) - 1)`` and dropped it.
    """
    root = et.parse(path).getroot()

    # When several <Text> elements exist the last one wins. A missing element
    # or an element without text is treated as an empty body instead of
    # raising NameError / TypeError.
    full = None
    for text_node in root.iter('Text'):
        full = text_node.text
    full = full or ''

    # split() yields the chunk before the first '<p>' at index 0; skip it.
    paragraphs = full.split('<p>')[1:]

    temp = {'Paragraph': [], 'Sentence': [], 'Overall Sentence': [], 'Text': []}
    sentence_count = 1
    for para_no, raw in enumerate(paragraphs, start=1):
        cleaned = raw.replace("\n", "")
        for markup in ("</p>", "<i>", "</i>", "<b>", "</b>"):
            cleaned = cleaned.replace(markup, "")
        for sent_no, sentence in enumerate(tokenize.sent_tokenize(cleaned), start=1):
            temp['Paragraph'].append(para_no)
            temp['Sentence'].append(sent_no)
            temp['Overall Sentence'].append(sentence_count)
            temp['Text'].append(sentence)
            sentence_count += 1

    df = pd.DataFrame(data=temp)
    return df

### 2.2 Sentiment Analysis (spaCy)

In [52]:
# for sentiment analysis
# NOTE(review): SpacyTextBlob is not referenced by name below; presumably this
# import is what registers the 'spacytextblob' pipeline factory -- confirm.
from spacytextblob.spacytextblob import SpacyTextBlob
# Shared spaCy pipeline; get_tags / get_polarity / get_subjectivity and the
# nlp.pipe(...) calls further down all read this module-level `nlp`.
nlp = spacy.load('en_core_web_sm')
# Add the TextBlob component by name so processed docs expose `doc._.blob`
# (read by get_polarity and get_subjectivity).
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x219b33afd00>

In [53]:
def get_tags(sentence):
    """Tag every token of ``sentence`` with the shared ``nlp`` pipeline.

    Returns a list with one ``(text, pos_, dep_)`` tuple per token, in token
    order.
    """
    return [(tok.text, tok.pos_, tok.dep_) for tok in nlp(sentence)]

In [54]:
def get_polarity(sentence):
    """Process ``sentence`` with ``nlp`` and return its ``_.blob.polarity`` value."""
    return nlp(sentence)._.blob.polarity

In [55]:
def get_subjectivity(sentence):
    """Process ``sentence`` with ``nlp`` and return its ``_.blob.subjectivity`` value."""
    return nlp(sentence)._.blob.subjectivity

In [56]:
def extract_tokens_plus_meta(doc:spacy.tokens.doc.Doc):
    """Collect per-token attributes from a single spaCy doc.

    Returns a list with one tuple per token, in token order:
    ``(text, i, lemma_, ent_type_, tag_, dep_, pos_, is_stop, is_alpha,
    is_digit, is_punct)``.
    """
    rows = []
    for tok in doc:
        rows.append((
            tok.text, tok.i, tok.lemma_, tok.ent_type_, tok.tag_,
            tok.dep_, tok.pos_, tok.is_stop, tok.is_alpha,
            tok.is_digit, tok.is_punct,
        ))
    return rows

def tidy_tokens(docs):
    """Extract tokens and metadata from a list of spaCy docs.

    Parameters
    ----------
    docs : iterable of spaCy docs
        Each doc is passed to ``extract_tokens_plus_meta``.

    Returns
    -------
    pandas.DataFrame
        One row per token with the columns listed in ``cols``. ``doc_id`` is
        the position of the token's doc inside ``docs``. Rows keep their
        per-doc index (the index restarts at 0 for every doc). When ``docs``
        is empty, or no doc contains any token, a frame with the same columns
        and no rows is returned.
    """
    cols = [
        "doc_id", "token", "token_order", "lemma",
        "ent_type", "tag", "dep", "pos", "is_stop",
        "is_alpha", "is_digit", "is_punct"
    ]
    token_cols = cols[1:]

    frames = []
    for ix, doc in enumerate(docs):
        rows = extract_tokens_plus_meta(doc)
        if not rows:
            # A doc without tokens adds no rows. Building a frame from it
            # would have zero columns and break the column assignment, so
            # skip it; doc_id of the remaining docs still matches `docs`.
            continue
        frame = pd.DataFrame(rows, columns=token_cols)
        frames.append(frame.assign(doc_id=ix).loc[:, cols])

    if not frames:
        # pd.concat raises on an empty list; return an empty, typed-by-name frame.
        return pd.DataFrame(columns=cols)
    return pd.concat(frames)

In [75]:
# Sentence table for article 2123327034, then one score column per sentence.
# NOTE(review): get_polarity and get_subjectivity each call nlp(sentence), and
# nlp.pipe in the next cell processes the same text again -- three passes.
extracts_2123327034 = extract_paragraphs_new('RE_Sample_XMLs/2123327034.xml')
#extracts_2123327034['Tag'] = extracts_2123327034['Text'].apply(get_tags)
extracts_2123327034['Polarity'] = extracts_2123327034['Text'].apply(get_polarity)
extracts_2123327034['Subjectivity'] = extracts_2123327034['Text'].apply(get_subjectivity)
#extracts_2123327034.to_csv('extracts_2123327034.csv', index=False)

In [76]:
# One spaCy doc per sentence, flattened to a token-level table (doc_id is the
# sentence's position in the 'Text' column).
docs_2123327034 = list(nlp.pipe(extracts_2123327034['Text']))
nlp_info_2123327034 = tidy_tokens(docs_2123327034)
#nlp_info_2123327034.to_csv('nlp_info_2123327034.csv', index=False)

In [77]:
# Newspaper
# Sentence table for article 1545387815 with polarity/subjectivity per sentence.
extracts_1545387815 = extract_paragraphs_new('RE_Sample_XMLs/1545387815.xml')
#extracts_1545387815['Tag'] = extracts_1545387815['Text'].apply(get_tags)
extracts_1545387815['Polarity'] = extracts_1545387815['Text'].apply(get_polarity)
extracts_1545387815['Subjectivity'] = extracts_1545387815['Text'].apply(get_subjectivity)
#extracts_1545387815.to_csv('extracts_1545387815.csv', index=False)

In [78]:
# Keep only sentences containing 'Yellen'; combined with the other articles later.
yellen_1545387815 = extracts_1545387815[extracts_1545387815['Text'].str.contains('Yellen')]

In [79]:
# Display sentences containing 'Wall Street' (result is not stored).
extracts_1545387815[extracts_1545387815['Text'].str.contains('Wall Street')]

Unnamed: 0,Paragraph,Sentence,Overall Sentence,Text,Polarity,Subjectivity


In [80]:
# Token-level table for every sentence, written to Sample_Output/.
docs_1545387815 = list(nlp.pipe(extracts_1545387815['Text']))
nlp_info_1545387815 = tidy_tokens(docs_1545387815)
nlp_info_1545387815.to_csv('Sample_Output/nlp_info_1545387815.csv', index=False)

In [81]:
# Newspaper
# Sentence table for article 1545553533 with polarity/subjectivity per sentence.
extracts_1545553533 = extract_paragraphs_new('RE_Sample_XMLs/1545553533.xml')
#extracts_1545553533['Tag'] = extracts_1545553533['Text'].apply(get_tags)
extracts_1545553533['Polarity'] = extracts_1545553533['Text'].apply(get_polarity)
extracts_1545553533['Subjectivity'] = extracts_1545553533['Text'].apply(get_subjectivity)

In [82]:
# Keep only sentences containing 'Yellen'; combined with the other articles later.
yellen_1545553533 = extracts_1545553533[extracts_1545553533['Text'].str.contains('Yellen')]

In [83]:
# Display sentences containing 'Wall Street' (result is not stored).
extracts_1545553533[extracts_1545553533['Text'].str.contains('Wall Street')]

Unnamed: 0,Paragraph,Sentence,Overall Sentence,Text,Polarity,Subjectivity


In [84]:
# Token-level table for every sentence, written to Sample_Output/.
docs_1545553533 = list(nlp.pipe(extracts_1545553533['Text']))
nlp_info_1545553533 = tidy_tokens(docs_1545553533)
nlp_info_1545553533.to_csv('Sample_Output/nlp_info_1545553533.csv', index=False)

In [85]:
# Newspaper
# Sentence table for article 1545554849 with polarity/subjectivity per sentence.
extracts_1545554849 = extract_paragraphs_new('RE_Sample_XMLs/1545554849.xml')
#extracts_1545554849['Tag'] = extracts_1545554849['Text'].apply(get_tags)
extracts_1545554849['Polarity'] = extracts_1545554849['Text'].apply(get_polarity)
extracts_1545554849['Subjectivity'] = extracts_1545554849['Text'].apply(get_subjectivity)

In [86]:
# Keep only sentences containing 'Yellen'; combined with the other articles later.
yellen_1545554849 = extracts_1545554849[extracts_1545554849['Text'].str.contains('Yellen')]

In [87]:
# Display sentences containing 'Wall Street' (result is not stored).
extracts_1545554849[extracts_1545554849['Text'].str.contains('Wall Street')]

Unnamed: 0,Paragraph,Sentence,Overall Sentence,Text,Polarity,Subjectivity


In [88]:
# Token-level table for every sentence, written to Sample_Output/.
docs_1545554849 = list(nlp.pipe(extracts_1545554849['Text']))
nlp_info_1545554849 = tidy_tokens(docs_1545554849)
nlp_info_1545554849.to_csv('Sample_Output/nlp_info_1545554849.csv', index=False)

In [90]:
# News
# Sentence table for article 1550716133 with polarity/subjectivity per sentence.
extracts_1550716133 = extract_paragraphs_new('RE_Sample_XMLs/1550716133.xml')
#extracts_1550716133['Tag'] = extracts_1550716133['Text'].apply(get_tags)
extracts_1550716133['Polarity'] = extracts_1550716133['Text'].apply(get_polarity)
extracts_1550716133['Subjectivity'] = extracts_1550716133['Text'].apply(get_subjectivity)

In [91]:
# Keep only sentences containing 'Yellen'; combined with the other articles later.
yellen_1550716133 = extracts_1550716133[extracts_1550716133['Text'].str.contains('Yellen')]

In [92]:
# Display sentences containing 'Wall Street' (result is not stored).
extracts_1550716133[extracts_1550716133['Text'].str.contains('Wall Street')]

Unnamed: 0,Paragraph,Sentence,Overall Sentence,Text,Polarity,Subjectivity


In [93]:
# Token-level table for every sentence, written to Sample_Output/.
docs_1550716133 = list(nlp.pipe(extracts_1550716133['Text']))
nlp_info_1550716133 = tidy_tokens(docs_1550716133)
nlp_info_1550716133.to_csv('Sample_Output/nlp_info_1550716133.csv', index=False)

In [94]:
# Web
# Sentence table for article 2046284852 with polarity/subjectivity per sentence.
extracts_2046284852 = extract_paragraphs_new('RE_Sample_XMLs/2046284852.xml')
#extracts_2046284852['Tag'] = extracts_2046284852['Text'].apply(get_tags)
extracts_2046284852['Polarity'] = extracts_2046284852['Text'].apply(get_polarity)
extracts_2046284852['Subjectivity'] = extracts_2046284852['Text'].apply(get_subjectivity)

In [95]:
# Keep only sentences containing 'Yellen'; combined with the other articles later.
yellen_2046284852 = extracts_2046284852[extracts_2046284852['Text'].str.contains('Yellen')]

In [96]:
# keyword "Wall Street"
# Display-only filter; the matching rows are shown but not stored.
extracts_2046284852[extracts_2046284852['Text'].str.contains('Wall Street')]

Unnamed: 0,Paragraph,Sentence,Overall Sentence,Text,Polarity,Subjectivity
0,1,1,1,"Barbara Byrne, one of the most accomplished and powerful women in the banking industry, has seen a lot in her more than 30 years on Wall Street.",0.375,0.625
12,8,1,13,"Though promotion of women on Wall Street has grown by leaps and bounds over the past several decades, Byrne recalls that it wasn’t always that way.",-0.125,0.125
18,11,2,19,"In a recent interview, she discussed a range of issues including the changing gender landscape in the world of banking, the challenges and lessons of juggling motherhood and work, and advice she has for young women looking to make it on Wall Street.",0.05,0.325
19,12,1,20,Have Wall Street and the world of finance become more inclusive work environments for women over the past few decades?,0.016667,0.283333
25,15,1,26,Do you think it’s easier now to be a mother of four and work on Wall Street than it was 30 years ago?,0.0,0.0


In [97]:
# One spaCy doc per sentence of article 2046284852, flattened to a token-level
# table and written to Sample_Output/.
docs_2046284852 = list(nlp.pipe(extracts_2046284852['Text']))
nlp_info_2046284852 = tidy_tokens(docs_2046284852)
nlp_info_2046284852.to_csv('Sample_Output/nlp_info_2046284852.csv', index=False)

In [98]:
# Concatenate all the result tables
# Built in a single chained statement so the cell is idempotent: `keys` adds
# the article id as the outer index level, reset_index() turns both levels
# into columns ('level_0', 'level_1'), the id column is renamed to 'GOID' and
# the per-article row index is dropped. Previously the rename and drop lived
# in separate, later-executed cells, and re-running the drop raised KeyError.
yellen = (
    pd.concat([yellen_1545387815, yellen_1545553533, yellen_1545554849, yellen_1550716133, yellen_2046284852],
              keys = [1545387815, 1545553533, 1545554849, 1550716133, 2046284852])
    .reset_index()
    .rename(columns={'level_0': 'GOID'})
    .drop(columns=['level_1'])
)

In [99]:
# Write the combined 'Yellen' sentence table to disk without the row index.
# NOTE(review): assumes the Sample_Output/ directory already exists -- confirm.
yellen.to_csv('Sample_Output/yellen.csv', index=False)