In [1]:
# Packages
import numpy as np
import pandas as pd
import os
import xml.etree.ElementTree as ET

## 1. Data Preprocessing

### 1.1 FileSetup

In [2]:
os.chdir('C:/Users/ConquerV/Documents/2022Summer/NLP_MacroEconIndiactors/TextMining')
path = os.getcwd()
print(path)
xmlfile = 'journal_sample.xml'

C:\Users\ConquerV\Documents\2022Summer\NLP_MacroEconIndiactors\TextMining


### Web-based xml (Kept for future use)

In [86]:
# def loadRSS():
    # url of rss feed
#     url = '.xml'
  
#     # creating HTTP response object from given url
#     resp = requests.get(url)
  
#     # saving the xml file
#     with open('.xml', 'wb') as f:
#         f.write(resp.content)

### 1.2 XML Parsing

In [3]:
def parseXML(xmlfile):
    
    # create element tree object
    tree = ET.parse(xmlfile)
    root = tree.getroot()
    
    # find target element tree
    obj_root = tree.find('Obj')
    
    # create data arrays
    author = []
    title = []
    numeric_date = []
    alpha_date = []
    language = []
    source = []
    keywords = []
    
    # iteratre over the trees to extract metadata
    for item in obj_root:
        if item.tag == 'TitleAtt':
            title.append(item[0].text)

        if item.tag == 'Language':
            language.append(item[0].text.strip())

        if item.tag == 'Author':
            title.append(item[0].text)

        if item.tag == 'NumericDate':
            numeric_date.append(item.text)

        if item.tag == 'AlphaDate':
            alpha_date.append(item.text)

        if item.tag == 'Contributors':
            for contributor in item:
                author.append(contributor[0][0].text)

        if item.tag == 'Terms':
            for term in item:
                keywords.append(term[0].text)
        

    return author, title, numeric_date, alpha_date, language, source, keywords

In [4]:
tree = ET.parse(xmlfile)
root = tree.getroot()

# Clean-up TextField
text_field = tree.find('Obj').find('Abstract').find('Medium').find('AbsText')
abs_text = text_field.text.strip('</p></body></html>').split('<p>')[1].split('.')

paragraphs = []
sentences = []
n_sentence = 1
texts = []

# Extract anstract info
for line in abs_text:
    # currently a placeholder as abstract only contain one paragraph
    paragraphs.append(1)
    sentences.append(n_sentence)
    texts.append(line.strip(''))
    n_sentence += 1

data = {'Paragraph': paragraphs,
        'Sentence': sentences,
        'Texts': texts
}

text_df = pd.DataFrame(data)
text_df.to_csv('Journal_sample.csv')


In [5]:
# Extract Meta data from xml file
author, title, numeric_date, alpha_date, language, source, keywords = parseXML(xmlfile)

# Clean-up Text Data

# Store extracted info

## 2. Sentiment Analysis

### 2.1 spaCY Named Entity Recognition (NER)

### 2.1.1 POS Tags

In [6]:
import spacy
from spacy import displacy

In [7]:
# Load spaCY NER
nlp = spacy.load('en_core_web_sm')
text_df['NER_Tag'] = text_df['Texts'].apply(nlp)

texts = []
pos = []
tags = []
deps = []
ents = []
p_explains = []
t_explains = []

# Extracting NER
for sentence in text_df['NER_Tag']:
    for token in sentence:
        texts.append(token.text)
        pos.append(token.pos_)
        tags.append(token.tag_)
        deps.append(token.dep_)
        ents.append(token.ent_type_)
        p_explains.append(spacy.explain(token.pos_))
        t_explains.append(spacy.explain(token.tag_))

# Construct Dataframe
nlp_data = {'Text': texts,
            'POS': pos,
            'Dep': deps,
            'ENT_Type': ents,
            'POS Explains': p_explains,
            'Tag Explains': t_explains

}

nlp_df = pd.DataFrame(nlp_data)
nlp_df.to_csv('sample_ner.csv')

# NER Tags Visualization
displacy.render(text_df['NER_Tag'][0], style='dep', jupyter=True)
displacy.render(text_df['NER_Tag'], style='ent', jupyter=True)





### 2.1.2 Sentiment Analysis

In [8]:
# add sentiment analysis to the pipeline
from spacytextblob.spacytextblob import SpacyTextBlob
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x1ed388c41f0>

In [9]:
sentences = []
subjectivity = []
polarity = []
sentiments = []

sent_data = {'Sentence': sentences,
            'Subjectivity Score': subjectivity,
            'Polarity Score': polarity,
            'Sentiment Assessment': sentiments
}

for sentence in text_df['NER_Tag']:
    sentences.append(sentence)
    token = nlp(sentence)
    subjectivity.append(token._.blob.subjectivity)
    polarity.append(token._.blob.polarity)
    sentiments.append(token._.blob.sentiment_assessments.assessments)

sentiments_df = pd.DataFrame(sent_data)
sentiments_df.to_csv('sample_sentiment.csv')

## 3 Time Series Sample

In [10]:
sentiments_df.head

<bound method NDFrame.head of                                              Sentence  Subjectivity Score  \
0   (The, Chinese, citrus, fly, ,, <, i, >, Bactro...            0.633333   
1   ( , This, univoltine, insect, enters, obligato...            0.500000   
2   ( , In, this, study, ,, the, course, of, diapa...            0.000000   
3   ( , In, addition, ,, the, variation, of, trans...            0.250000   
4   ( , A, total, of, 4,808, genes, were, signific...            0.531250   
5   ( , Gene, expression, profiles, were, validate...            0.000000   
6   ( , In, addition, ,, 48, metabolites, were, id...            0.000000   
7   ( , Nine, of, which, significantly, contribute...            0.937500   
8   ( , Moreover, ,, the, samples, collected, with...            0.418750   
9   ( , These, findings, greatly, improve, our, un...            0.750000   
10  ( , minax, <, /i, >, diapause, and, lay, the, ...            0.500000   
11                                            