In [1]:

import pandas as pd
from lxml import etree
import io
import json
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import hamming_loss
from sklearn.metrics import precision_score, recall_score, f1_score, jaccard_score
import re
import os

In [2]:

def convert_to_adage_json(df, dataset_id):
    adage_data_model = {
        "data_source": "Australian Financial Review",
        "dataset_type": "News_Articles",
        "dataset_id": dataset_id,
        "time_object": {
            "timestamp": pd.Timestamp.now().isoformat(),
            "timezone": "GMT+11"
        },
        "events": []
    }
    
    for index, row in df.iterrows():
        event = {
            "time_object": {
                "timestamp": row["modified"].isoformat(),
                "duration": 0,
                "duration_unit": "second",
                "timezone": "GMT+11"
            },
            "event_type": "article",
            "attribute": {
                "guid": row["guid"],
                "byline": row["byline"],
                "headline": row["headline"],
                "section": row["section"],
                "publication_date": row["publication_date"].strftime("%Y-%m-%d"),
                "page_no": row["page_no"],
                "classifications": row["classifications"],
                "text": row.get("text")
            }
        }
        
        adage_data_model["events"].append(event)
    
    adage_data_model["time_object"]["timestamp"] = df["modified"].max().isoformat()
    
    return json.dumps(adage_data_model, indent=4)

def process_xml_file(file_path):
    xml_data = open(file_path).read()
    parser = etree.XMLParser(ns_clean=True)
    xml = etree.parse(io.StringIO(xml_data), parser)
    data = []

    for dossier in xml.xpath('//dcdossier'):
        guid = dossier.get('guid')
        modified = dossier.get('modified')
        
        for doc in dossier.xpath('.//document'):
            newspaper_code = doc.xpath('.//NEWSPAPERCODE/text()')
            section = doc.xpath('.//SECTION/text()')
            story_name = doc.xpath('.//STORYNAME/text()')
            publication_date = doc.xpath('.//PUBLICATIONDATE/text()')
            newspaper = doc.xpath('.//NEWSPAPER/text()')
            page_no = doc.xpath('.//PAGENO/text()')
            byline = doc.xpath('.//BYLINE/text()')
            classifications = doc.xpath('.//CLASSIFICATION/text()')
            headline = doc.xpath('.//HEADLINE/text()')
            intro = doc.xpath('.//INTRO/text()')
            text = " ".join(doc.xpath('.//TEXT//text()'))
            
            data.append({
                'guid': guid,
                'modified': pd.to_datetime(modified, errors='coerce', utc=True),
                'section': section[0].strip() if section else None,
                'publication_date': pd.to_datetime(publication_date[0]) if publication_date else None,
                'page_no': page_no[0].strip() if page_no else None,
                'byline': byline[0].strip() if byline else None,
                'classifications': classifications if classifications else None,
                'headline': headline[0].strip() if headline else None,
                'intro': intro[0].strip() if intro else None,
                'text': text.strip() if text else None,
            })

    return pd.DataFrame(data)


def process_all_files(directory):
    all_dataframes = []
    j = 0
    for file_name in os.listdir(directory):
        if file_name.endswith('.xml'):
            file_path = os.path.join(directory, file_name)
            df = process_xml_file(file_path)
            all_dataframes.append(df)
            print(f"Processed data from {file_name}")
            j += 1
        if j == 10: 
            break

    combined_df = pd.concat(all_dataframes, ignore_index=True)
    return combined_df

In [3]:

df = process_all_files('datasets/')
# df = pd.DataFrame(data)
df['modified'] = pd.to_datetime(df['modified'])
df['publication_date'] = pd.to_datetime(df['publication_date'], errors='coerce')

Processed data from AFR_20150101-20150131.xml
Processed data from AFR_20150201-20150201.xml
Processed data from AFR_20150201-20150228.xml
Processed data from AFR_20150301-20150331.xml
Processed data from AFR_20150401-20150430.xml
Processed data from AFR_20150501-20150531.xml
Processed data from AFR_20150601-20150630.xml
Processed data from AFR_20150701-20150731.xml
Processed data from AFR_20150801-20150831.xml
Processed data from AFR_20150901-20150930.xml


In [None]:
df

In [5]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk_stopwords = stopwords.words('english')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ricardo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ricardo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ricardo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def clean_and_prepare_text(input_text, pattern=r'[^\w\s]', to_lowercase=True, filter_stopwords=True, stemming=True):
    cleaned_text = re.sub(pattern, '', input_text)
    if to_lowercase:
        cleaned_text = cleaned_text.lower()
    tokenized_text = word_tokenize(cleaned_text)
    if filter_stopwords:
        tokenized_text = [word for word in tokenized_text if word not in nltk_stopwords]
    if stemming:
        processor = PorterStemmer()
    else:
        processor = WordNetLemmatizer()
    processed_text = [processor.stem(word) if stemming else processor.lemmatize(word) for word in tokenized_text]
    return ' '.join(processed_text)

In [7]:
preprocess_settings = (r'[^\w\s]', False, True, False)
pattern, lower, stopword_removal, stem = preprocess_settings
preprocess_fn = lambda x: clean_and_prepare_text(x, pattern, lower, stopword_removal, stem)
df['pre_processed_text'] = df['text'].apply(preprocess_fn)

In [8]:
df['pre_processed_text'][0]

'A highprofile inquest death teenager Alec Meikle allegedly bullied Downer EDI ended coroner deciding wasnt enough evidence blame bullying suicide The Coroners Court Sydney told teenager Mr Meikle taunted workmate supervisor three month company trainbuilding operation Bathurst Mr Meikle quit moved New Zealand live aunt uncle He committed suicide 40 day arrival The death triggered debate whether workplace bullying responsible suicide NSW Deputy State Coroner Paul McMahon said could make finding recommendation death lack evidence caused NSW Mr Meikles family said deeply disappointed ruling We concerned would take 13 month come conclusion finding We hope medium attention surrounding matter evidence heard publicly lead greater awareness possible devastating consequence harassment family said Mr McMahon said uncontroversial Mr Meikles job significant contributing factor led development depressive condition factor It would speculation find event Downer precipitated action It could well somet

In [10]:
df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))


In [None]:
df

In [12]:
from textblob import TextBlob

def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity, blob.sentiment.subjectivity

df[['sentiment_polarity', 'sentiment_subjectivity']] = df['text'].apply(lambda x: pd.Series(get_sentiment(x)))


In [None]:
df

In [19]:
from gensim import corpora, models

dictionary = corpora.Dictionary([text.split() for text in df['pre_processed_text']])
corpus = [dictionary.doc2bow(text.split()) for text in df['pre_processed_text']]

lda_model = models.LdaMulticore(corpus, num_topics=5, id2word=dictionary, passes=10)

def get_dominant_topic(text):
    bow = dictionary.doc2bow(text.split())
    topics = lda_model.get_document_topics(bow)
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    topic_keywords = ', '.join([word for word, _ in lda_model.show_topic(dominant_topic)])
    return topic_keywords





In [20]:
df['dominant_topic'] = df['pre_processed_text'].apply(get_dominant_topic)


In [None]:
df

In [24]:
def avg_sentence_length(text):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) == 0:
        return 0  # Return 0 if there are no sentences
    return sum(len(nltk.word_tokenize(sentence)) for sentence in sentences) / len(sentences)


In [25]:

df['avg_sentence_length'] = df['text'].apply(avg_sentence_length)


In [None]:
df

In [27]:
from rake_nltk import Rake

def extract_keywords(text):
    rake = Rake()
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()[:5]

df['keywords'] = df['pre_processed_text'].apply(extract_keywords)


In [None]:
df