# Processing AFR data into a dataframe

In [2]:
import pandas as pd
from lxml import etree
import io
import json

# Load the AFR export as raw bytes so the XML parser applies the encoding
# declared in the document prolog rather than the platform's default text
# encoding. The context manager closes the file handle deterministically.
with open('datasets/AFR_20150101-20150131.xml', 'rb') as xml_file:
    xml_data = xml_file.read()

parser = etree.XMLParser(ns_clean=True)
xml = etree.parse(io.BytesIO(xml_data), parser)


def _first_text(values):
    """Return the first XPath text result, stripped, or None if none matched."""
    return values[0].strip() if values else None


# One record per <document>; each record also carries the guid/modified
# attributes of the <dcdossier> that contains it.
data = []

for dossier in xml.xpath('//dcdossier'):
    guid = dossier.get('guid')
    modified = dossier.get('modified')

    for doc in dossier.xpath('.//document'):
        classifications = doc.xpath('.//CLASSIFICATION/text()')
        # TEXT may contain nested markup, so gather every descendant text
        # node and join them with spaces.
        text = " ".join(doc.xpath('.//TEXT//text()'))

        data.append({
            'guid': guid,
            'modified': modified,
            'section': _first_text(doc.xpath('.//SECTION/text()')),
            'publication_date': _first_text(doc.xpath('.//PUBLICATIONDATE/text()')),
            'page_no': _first_text(doc.xpath('.//PAGENO/text()')),
            'byline': _first_text(doc.xpath('.//BYLINE/text()')),
            # Keep every classification as a list; None when there are none.
            'classifications': classifications if classifications else None,
            'headline': _first_text(doc.xpath('.//HEADLINE/text()')),
            'intro': _first_text(doc.xpath('.//INTRO/text()')),
            'text': text.strip() if text else None,
        })

df = pd.DataFrame(data)


In [None]:

# Parse the dossier-level 'modified' attribute strings into pandas datetimes.
df["modified"] = pd.to_datetime(df["modified"])

In [3]:
# Parse publication dates; values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
df["publication_date"] = pd.to_datetime(
    df["publication_date"],
    errors="coerce",
)

In [None]:
# Preview the first five rows of the parsed articles.
print(df.head(n=5))

In [5]:

def _format_timestamp(value, date_format=None):
    """Render a timestamp as text, or return None when it is missing.

    Missing means ``None``, ``NaN`` or ``NaT``. When ``date_format`` is given
    the value is rendered with ``strftime``; otherwise ``isoformat()`` is used.
    """
    if pd.isna(value):
        return None
    if date_format is not None:
        return value.strftime(date_format)
    return value.isoformat()


def convert_to_adage_json(df):
    """Serialize the articles in ``df`` to an ADAGE-style JSON string.

    Args:
        df: DataFrame with the columns ``guid``, ``modified``, ``byline``,
            ``headline``, ``section``, ``publication_date``, ``page_no`` and
            ``classifications`` (``text`` is optional). ``modified`` and
            ``publication_date`` must already be datetime values; missing
            ones (``NaT``/``None``) are emitted as JSON ``null``.

    Returns:
        A JSON document (4-space indented) with one ``"article"`` event per
        row. The top-level timestamp is the latest ``modified`` value, or the
        current time when no valid ``modified`` value exists.
    """
    events = []
    for _, row in df.iterrows():
        events.append({
            "time_object": {
                "timestamp": _format_timestamp(row["modified"]),
                "duration": 0,
                "duration_unit": "second",
                "timezone": "GMT+11"
            },
            "event_type": "article",
            "attribute": {
                "guid": row["guid"],
                "byline": row["byline"],
                "headline": row["headline"],
                "section": row["section"],
                # NaT has no strftime, so missing dates must map to None here.
                "publication_date": _format_timestamp(
                    row["publication_date"], "%Y-%m-%d"
                ),
                "page_no": row["page_no"],
                "classifications": row["classifications"],
                "text": row.get("text")
            }
        })

    # Stamp the dataset with its most recent modification; fall back to "now"
    # when the frame is empty or holds no valid 'modified' value.
    latest_modified = df["modified"].max() if "modified" in df.columns else pd.NaT
    dataset_timestamp = (
        _format_timestamp(latest_modified) or pd.Timestamp.now().isoformat()
    )

    adage_data_model = {
        "data_source": "Australian Financial Review",
        "dataset_type": "News_Articles",
        "dataset_id": "AFR_2015",
        "time_object": {
            "timestamp": dataset_timestamp,
            "timezone": "GMT+11"
        },
        "events": events
    }

    return json.dumps(adage_data_model, indent=4)

In [None]:
# Show the serialized ADAGE document in the cell output.
print(convert_to_adage_json(df=df))

In [7]:

# Serialize before opening the destination so that a failure during
# conversion cannot leave a truncated or empty AFR_2015.json behind.
adage_json = convert_to_adage_json(df)

with open('AFR_2015.json', 'w', encoding='utf-8') as f:
    f.write(adage_json)