In [80]:

import pandas as pd
from lxml import etree
import io
import json
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import hamming_loss
from sklearn.metrics import precision_score, recall_score, f1_score, jaccard_score
import re
import os

vectorizer = CountVectorizer()
mlb = MultiLabelBinarizer()
multi_label_nb = MultiOutputClassifier(MultinomialNB(), n_jobs=-1)

def convert_to_adage_json(df, dataset_id):
    adage_data_model = {
        "data_source": "Australian Financial Review",
        "dataset_type": "News_Articles",
        "dataset_id": dataset_id,
        "time_object": {
            "timestamp": pd.Timestamp.now().isoformat(),
            "timezone": "GMT+11"
        },
        "events": []
    }
    
    for index, row in df.iterrows():
        event = {
            "time_object": {
                "timestamp": row["modified"].isoformat(),
                "duration": 0,
                "duration_unit": "second",
                "timezone": "GMT+11"
            },
            "event_type": "article",
            "attribute": {
                "guid": row["guid"],
                "byline": row["byline"],
                "headline": row["headline"],
                "section": row["section"],
                "publication_date": row["publication_date"].strftime("%Y-%m-%d"),
                "page_no": row["page_no"],
                "classifications": row["classifications"],
                "text": row.get("text")
            }
        }
        
        adage_data_model["events"].append(event)
    
    adage_data_model["time_object"]["timestamp"] = df["modified"].max().isoformat()
    
    return json.dumps(adage_data_model, indent=4)

def process_xml_file(file_path):
    xml_data = open(file_path).read()
    parser = etree.XMLParser(ns_clean=True)
    xml = etree.parse(io.StringIO(xml_data), parser)
    data = []

    for dossier in xml.xpath('//dcdossier'):
        guid = dossier.get('guid')
        modified = dossier.get('modified')
        
        for doc in dossier.xpath('.//document'):
            newspaper_code = doc.xpath('.//NEWSPAPERCODE/text()')
            section = doc.xpath('.//SECTION/text()')
            story_name = doc.xpath('.//STORYNAME/text()')
            publication_date = doc.xpath('.//PUBLICATIONDATE/text()')
            newspaper = doc.xpath('.//NEWSPAPER/text()')
            page_no = doc.xpath('.//PAGENO/text()')
            byline = doc.xpath('.//BYLINE/text()')
            classifications = doc.xpath('.//CLASSIFICATION/text()')
            headline = doc.xpath('.//HEADLINE/text()')
            intro = doc.xpath('.//INTRO/text()')
            text = " ".join(doc.xpath('.//TEXT//text()'))
            
            data.append({
                'guid': guid,
                'modified': pd.to_datetime(modified, errors='coerce', utc=True),
                'section': section[0].strip() if section else None,
                'publication_date': pd.to_datetime(publication_date[0]) if publication_date else None,
                'page_no': page_no[0].strip() if page_no else None,
                'byline': byline[0].strip() if byline else None,
                'classifications': classifications if classifications else None,
                'headline': headline[0].strip() if headline else None,
                'intro': intro[0].strip() if intro else None,
                'text': text.strip() if text else None,
            })

    return pd.DataFrame(data)


def process_all_files(directory):
    all_dataframes = []
    j = 0
    for file_name in os.listdir(directory):
        if file_name.endswith('.xml'):
            file_path = os.path.join(directory, file_name)
            df = process_xml_file(file_path)
            all_dataframes.append(df)
            print(f"Processed data from {file_name}")
            j += 1
        # if j == 10: 
        #     break
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    return combined_df


df = process_all_files('datasets/')
# df = pd.DataFrame(data)
# df['modified'] = pd.to_datetime(df['modified'])
# df['publication_date'] = pd.to_datetime(df['publication_date'], errors='coerce')

Processed data from AFR_20150901-20150930.xml
Processed data from AFR_20201001-20201031.xml
Processed data from AFR_20180801-20180831.xml
Processed data from AFR_20190901-20190930.xml
Processed data from AFR_20211101-20211130.xml
Processed data from AFR_20210201-20210228.xml
Processed data from AFR_20211201-20211231.xml
Processed data from AFR_20210901-20210930.xml
Processed data from AFR_20191101-20191130.xml
Processed data from AFR_20160501-20160531.xml
Processed data from AFR_20160301-20160331.xml
Processed data from AFR_20190201-20190228.xml
Processed data from AFR_20191201-20191231.xml
Processed data from AFR_20160601-20160630.xml
Processed data from AFR_20150201-20150201.xml
Processed data from AFR_20170701-20170731.xml
Processed data from AFR_20151101-20151130.xml
Processed data from AFR_20181001-20181031.xml
Processed data from AFR_20170401-20170430.xml
Processed data from AFR_20200801-20200831.xml
Processed data from AFR_20170101-20170131.xml
Processed data from AFR_20151201-2

In [84]:
# Output a json with only guid and author
new_df = df[['guid', 'byline']].copy()

# rename byline to author
new_df.rename(columns={'byline': 'author'}, inplace=True)

# Lowercase author field
new_df['author'] = new_df['author'].apply(lambda x: x.lower() if x is not None else None)

# First only take part of string before " - "
new_df['author'] = new_df['author'].apply(lambda x: x.split('-')[0] if x is not None else None)
new_df['author'] = new_df['author'].apply(lambda x: x.split(';')[0] if x is not None else None)
new_df['author'] = new_df['author'].apply(lambda x: x.split(',')[0] if x is not None else None)
new_df['author'] = new_df['author'].apply(lambda x: x.split(' & ')[0] if x is not None else None)
new_df['author'] = new_df['author'].apply(lambda x: x.split(' and ')[0] if x is not None else None)
new_df['author'] = new_df['author'].apply(lambda x: x.split(' with ')[0] if x is not None else None)
new_df['author'] = new_df['author'].apply(lambda x: x.split(' | ')[0] if x is not None else None)
new_df['author'] = new_df['author'].apply(lambda x: x.split('.')[0] if x is not None else None)
new_df['author'] = new_df['author'].apply(lambda x: x.split(' is ')[0] if x is not None else None)
new_df['author'] = new_df['author'].apply(lambda x: x.split(' of ')[0] if x is not None else None)
new_df['author'] = new_df['author'].apply(lambda x: x.split(' lord ')[0] if x is not None else None)
new_df['author'] = new_df['author'].apply(lambda x: x.split(' by ')[0] if x is not None else None)
# new_df['author'] = new_df['author'].apply(lambda x: re.findall(r'(?:(?<=^)|(?<=[^A-Za-z.,]))[A-Za-z.,]+(?: [A-Za-z.,]+)*(?:(?=[^A-Za-z.,])|(?=$))', x)[0] if x is not None else None)

# Remove common phrases from author field and fix casing
phrases = [
    "Afr Correspondent",
    "Chief Political Correspondent",
    "Political Correspondent",
    "Political",
    "Edited By",
    "Editor",
    "Story",
    "Words",
    "Political Editor",
    "Edited By",
    "Workplace Correspondent",
    "National Affairs Correspondent",
    "Economics Correspondent",
    "Education",
    "Executive",
    "Director",
    "Federal",
    "Treasurer",
    "Investment Strategy Ubs Asset Management",
    "Workplace",
    "Investor",
    "Asia Pacific",
    "Additional Reporting",
    "Special",
    "China",
    "Correspondent",
    "Property",
    "East",
    "Asia",
    "Forum",
    "Pensions",
    "Pension",
    "Limits"
]

def fix_author(author):
    if author is not None:
        for phrase in phrases:
            # author = author.lower().replace(phrase.lower(), '').strip().title()
            # The above line was causing issues with the author names with the same phrase in the middle
            author = re.sub(r'\b' + re.escape(phrase) + r'\b', '', author, flags=re.IGNORECASE).strip().title()
            
        return author
    return None

new_df['author'] = new_df['author'].apply(fix_author)

# Set index
new_df.set_index('guid', inplace=True)

guids = None

# Remove all guids not in guids.json
with open('guids.json') as f:
    guids = json.load(f)
    
new_df = new_df[new_df.index.isin(guids['guids'])]

new_df.to_json('output.json', orient='index', indent=2)

In [4]:
df['classifications'][0]

['Health/Death/Suicides',
 'Labour/Harassment',
 'Company/Downer Edi',
 'Suicide',
 'Cause of death',
 'Psychological illnesses']

In [82]:
from collections import Counter

# def split_labels(label_list):
#     if label_list is None:
#         return []
#     split_labels = []
#     for label in label_list:
#         parts = label.split('/', maxsplit=1)
#         split_labels.append(parts[0])
#         if len(parts) > 1:
#             split_labels.append(parts[1])
#     return split_labels

# all_labels = []
# for labels in df['classifications']:
#     all_labels.extend(split_labels(labels))

# label_counts = Counter(all_labels)
# top_labels = label_counts.most_common(100)

# print("Top 100 labels:")
# for label, count in top_labels:
#     print(f"{label}: {count}")
# from sklearn.cluster import KMeans
# from gensim.models import Word2Vec

# def train_word2vec(labels, min_count=1, vector_size=100):
#     model = Word2Vec(labels, min_count=min_count, vector_size=vector_size)
#     return model

# def get_label_vectors(model, labels):
#     label_vectors = []
#     for label_parts in labels:
#         vector = sum([model.wv[part] for part in label_parts if part in model.wv])
#         label_vectors.append(vector)
#     return label_vectors

# def cluster_labels(label_vectors, n_clusters=10):
#     kmeans = KMeans(n_clusters=n_clusters)
#     kmeans.fit(label_vectors)
#     return kmeans.labels_

# all_labels = df['classifications'].dropna().tolist()

# word2vec_model = train_word2vec(all_labels)

# label_vectors = get_label_vectors(word2vec_model, all_labels)

# n_clusters = 10
# cluster_labels = cluster_labels(label_vectors, n_clusters)

# label_clusters = {}
# for label, cluster in zip(all_labels, cluster_labels):
#     label_clusters['/'.join(label)] = cluster

# for label, cluster in label_clusters.items():
#     print(f"{label}: Cluster {cluster}")

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter

nltk.download('punkt')
nltk.download('stopwords')

def preprocess(text):
    
    last_word = text.split('/')[-1]
    tokens = word_tokenize(last_word)
    tokens = [token.lower() for token in tokens]
    
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

def select_representative_label(cluster_labels):
    if not cluster_labels:
        return None
    
    label_counts = Counter(cluster_labels)
    representative_label = max(label_counts, key=label_counts.get)
    
    return representative_label

all_labels = [label for labels in df['classifications'] if labels is not None for label in labels]

label_counts = Counter(all_labels)

top_n = 100
top_labels = [label for label, _ in label_counts.most_common(top_n)]

preprocessed_labels = [preprocess(label) for label in top_labels]

vectorizer = TfidfVectorizer()

label_vectors = vectorizer.fit_transform(preprocessed_labels)

n_clusters = 20
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(label_vectors)

cluster_labels = kmeans.labels_

representative_labels = []
for cluster in range(n_clusters):
    cluster_indices = np.where(cluster_labels == cluster)[0]
    cluster_labels_list = [top_labels[i] for i in cluster_indices]
    
    if cluster_labels_list:
        representative_label = select_representative_label(cluster_labels_list)
        representative_labels.append(representative_label)
    else:
        representative_labels.append(None)

print("Dissimilar labels among the most occurring labels:")
for label in representative_labels:
    if label is not None:
        print(label)
    else:
        print("Empty Cluster")


[nltk_data] Downloading package punkt to /Users/jeremy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeremy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Python(45968) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Dissimilar labels among the most occurring labels:
Landscape
Professional achievement
Economy/Monetary Policy
Manager
Stocks
Investment bank
Internet
Energy carrier
Economic policy
Film industry
Politics/Foreign Relations
Investment
Property/Residential Property/Units
Holding
Profit
Election campaign
Travel advice
Credit
Career planning
Architecture


In [4]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk_stopwords = stopwords.words('english')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ricardo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ricardo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ricardo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:

def clean_and_prepare_text(input_text, pattern=r'[^\w\s]', to_lowercase=True, filter_stopwords=True, stemming=True):
    cleaned_text = re.sub(pattern, '', input_text)
    if to_lowercase:
        cleaned_text = cleaned_text.lower()
    tokenized_text = word_tokenize(cleaned_text)
    if filter_stopwords:
        tokenized_text = [word for word in tokenized_text if word not in nltk_stopwords]
    if stemming:
        processor = PorterStemmer()
    else:
        processor = WordNetLemmatizer()
    processed_text = [processor.stem(word) if stemming else processor.lemmatize(word) for word in tokenized_text]
    return ' '.join(processed_text)



In [6]:
preprocess_settings = (r'[^\w\s]', False, True, False)
pattern, lower, stopword_removal, stem = preprocess_settings
preprocess_fn = lambda x: clean_and_prepare_text(x, pattern, lower, stopword_removal, stem)
df['pre_processed_text'] = df['text'].apply(preprocess_fn)

In [None]:
df

In [8]:
# empty_classifications_df = df[df['classifications'].apply(lambda x: len(x) == 0 if x is not None else True)]
# len(empty_classifications_df)
df['classifications'] = df['classifications'].apply(lambda x: x if isinstance(x, list) else [])
# df = df[df['classifications'].apply(lambda x: len(x) == 0)]
# df

In [9]:
processed_labels = mlb.fit_transform(df['classifications'])


import numpy as np
import sys
np.set_printoptions(threshold=sys.maxsize)
# print(len(processed_labels))

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df['pre_processed_text'], processed_labels, test_size=0.1, random_state=42)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

multi_label_nb.fit(X_train_vec, y_train)
y_pred = multi_label_nb.predict(X_test_vec)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Hamming Loss: {hamming_loss(y_test, y_pred):.4f}")

precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
jaccard = jaccard_score(y_test, y_pred, average='macro')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Jaccard Score: {jaccard:.4f}")

In [None]:

# def update_dataframe_with_multi_label(df, preprocess_function, label_column='classifications'):
#     updated_df = df.copy()
#     updated_df['processed_text'] = updated_df['text'].apply(preprocess_function)
    
#     mlb = MultiLabelBinarizer()
#     updated_df['processed_labels'] = list(mlb.fit_transform(updated_df[label_column]))
    
#     return updated_df, mlb.classes_


def evaluate_multi_label_classification(preprocess_settings, df, label_column='classifications'):
    global vectorizer, mlb, multi_label_nb
    pattern, lower, stopword_removal, stem = preprocess_settings
    print(f"Configuration: Pattern={pattern}, Lowercase={lower}, Stemming={stem}")

    preprocess_fn = lambda x: clean_and_prepare_text(x, pattern, lower, stopword_removal, stem)
    df['processed_text'] = df['text'].apply(preprocess_fn)

    
    df['classifications'] = df['classifications'].apply(lambda x: x if isinstance(x, list) else [])
    processed_labels = mlb.fit_transform(df['classifications'])

    X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], processed_labels, test_size=0.1, random_state=42)


    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    multi_label_nb.fit(X_train_vec, y_train)


    y_pred = multi_label_nb.predict(X_test_vec)

    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Hamming Loss: {hamming_loss(y_test, y_pred):.4f}")

    y_pred_inversed = mlb.inverse_transform(y_pred)
    y_test_inversed = mlb.inverse_transform(y_test)

    print("\nSample of Actual vs. Predicted Classifications:")
    for i in range(10):
        print(f"Actual: {y_test_inversed[i]}, Predicted: {y_pred_inversed[i]}")







In [None]:

df['classifications'] = df['classifications'].apply(lambda x: x if isinstance(x, list) else [])

preprocess_settings = (r'[^\w\s]', False, True, False)
evaluate_multi_label_classification(preprocess_settings, df, 'classifications')



Configuration: Pattern=[^\w\s], Lowercase=False, Stemming=False
Accuracy: 0.0228
Hamming Loss: 0.0033

Sample of Actual vs. Predicted Classifications:
Actual: ('ECONOMY/COMMODITIES', 'Energy carrier', 'Export', 'Industry/Oil', 'Raw materials'), Predicted: ('Economic forecast', 'Raw materials')
Actual: ('Laws', 'Presidential election', 'Right to vote'), Predicted: ('Election campaign', 'Presidential election')
Actual: ('COMPANY/AMCOR LTD', 'Corporate structure', 'Fusion', 'Labour/Occupations/Management', 'Manager'), Predicted: ('Manager', 'Stock exchange', 'Stocks')
Actual: ('Entrepreneurs', 'Fast climber', 'Labour/Occupations/Management', 'Manager'), Predicted: ()
Actual: ('Economic policy', 'Economy/Finance', 'International economic relations', 'World economy'), Predicted: ()
Actual: ('Heavy industry', 'Industry/Mining/Iron Ore', 'Raw materials', 'Stocks'), Predicted: ('Raw materials', 'Stock exchange', 'Stocks')
Actual: ('Capital market', 'Company/AUSTRALIA & NEW ZEALAND BANKING GROU

In [None]:
df

In [None]:
import numpy as np


def recommend_labels_with_custom_threshold(input_text, threshold=0.001):
    preprocessed_text = clean_and_prepare_text(input_text)
    input_vec = vectorizer.transform([preprocessed_text])
    predicted_probs = multi_label_nb.predict_proba(input_vec)
    predictions = np.zeros((1, len(mlb.classes_)), dtype=int)
    
    for idx, probs in enumerate(predicted_probs):
        if probs.shape[1] == 1:
            predictions[0, idx] = probs[0, 0] >= threshold
        else:
            predictions[0, idx] = probs[0, 1] >= threshold
    
    recommended_labels = mlb.inverse_transform(predictions)
    
    return recommended_labels


user_input = """In today's rapidly evolving economic landscape, understanding the dynamics of the stock market has become more crucial than ever for investors and analysts alike. Amidst fluctuating market conditions, a recent trend has emerged, highlighting a significant shift towards technology and renewable energy sectors. This pivot is largely driven by global demands for sustainability and innovation, reshaping investment strategies across the board.
The surge in tech and green energy investments is not merely a reaction to consumer preferences or environmental concerns; it's a strategic move by savvy investors aiming to capitalize on the future of global markets. Companies leading in renewable energy solutions and technological advancements are now at the forefront of stock market gains, outpacing traditional industries that once dominated the financial landscape.
Moreover, this shift is also influencing policy decisions and corporate strategies, with a growing emphasis on ESG (Environmental, Social, and Governance) criteria. Investors are increasingly factoring in companies' ESG scores when making investment decisions, recognizing that sustainable practices are key to long-term profitability and risk management.
As we navigate through these changing tides, the stock market's landscape continues to evolve, presenting new opportunities and challenges for investors. Staying informed and adaptable is paramount, as the sectors that are leading today may pave the way for the economic paradigms of tomorrow. In essence, the current trends underscore a broader movement towards a more sustainable and technologically driven global economy, heralding a new era in financial investment.
"""
recommended_labels = recommend_labels_with_custom_threshold(user_input)
print(f"Recommended labels for your input: {recommended_labels}")


Recommended labels for your input: [('Achievement', 'Action group', 'Administrative law', 'Aristocrats', 'Business/Transaction Cards Credit Cards', 'COMPANY/AMCOR LTD', 'COMPANY/BOEING CO', 'COMPANY/CITIBANK LTD', 'COMPANY/MERITON PROPERTIES', 'Cattle breeding', 'Children', 'Civil law', 'Civil war', 'Company/Airbus Industries', 'Company/Amazon.com', 'Company/Ausnet', 'Company/Bradken Limited', 'Company/Charter Hall Holdings', 'Company/Coopers Brewery', 'Company/Corporate Travel Managemement Ltd', 'Company/John Fairfax Holdings/Fairfax Media Ltd', 'Company/Manulife Financial Corp', 'Company/Monadelphous Group Ltd', 'Company/NINE NETWORK AUSTRALIA LTD', 'Company/Orica', 'Company/PanAust Ltd', 'Company/Perpetual Investments', 'Company/Ramsay Health Care Ltd', 'Company/Royal Bank of Scotland', 'Company/Royal Dutch Shell/Shell', 'Company/Tesco Plc', 'Company/Transurban Group', 'Company/UBS Group', 'Company/Us Federal Reserve', 'Company/Wam Capital', 'Company/YouTube Inc', 'Credit card', 'Cr