In [1]:
#The data, once extracted, is contained in 22 files with the .sgm extension (SGML, a legacy markup format).
#To proceed, we need to convert them to XML.  I will use the Unix command osx from the OpenSP package.
#OpenSP is not installed by default, but it can be installed via the Homebrew package manager.
#Homebrew is built for the macOS operating system.
#For other environments, a different way of obtaining OpenSP will need to be found.

#The installation of Homebrew was done with the following command at terminal:
#ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" < /dev/null 2> /dev/null

#With Homebrew installed, I installed the OpenSP package with the following command at terminal:
#brew install open-sp

import requests
import tempfile
from os import path
import tarfile
import subprocess
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [2]:
# Download the Reuters-21578 archive, convert each SGML file to XML with
# OpenSP's `osx`, collect one record per <REUTERS> article, and attach a
# bag-of-words count vector for every article body.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz'
resp = requests.get(url)
# Fail loudly on an HTTP error instead of writing an error page to disk and
# trying to untar it.
resp.raise_for_status()
data_dict = {'split':[], 'title':[], 'body':[], 'topics':[], 'topics_count':[], 'freq_vec':[]}

with tempfile.TemporaryDirectory() as tmp_dir:
    targz_path = path.join(tmp_dir, 'reuters21578.tar.gz')
    with open(targz_path, 'wb') as targz_file:
        targz_file.write(resp.content)
    with tarfile.open(targz_path, 'r:gz') as targz_file:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use it on archives from a trusted source.
        targz_file.extractall(path=tmp_dir)
    for num in range(22):
        # 'reut2-0' + two-digit index -> reut2-000.sgm ... reut2-021.sgm
        sgm_path = path.join(tmp_dir, 'reut2-0' + str(num).zfill(2) + '.sgm')
        # The file object is only handed to the child process as its stdin
        # (the child reads the underlying file descriptor), so it is opened
        # in binary mode; Python never decodes its contents.
        with open(sgm_path, 'rb') as sgm_file:
            xml_text = subprocess.run(['osx', '--directory=' + tmp_dir], stdin=sgm_file,
                stdout=subprocess.PIPE, encoding='utf-8').stdout
        soup = BeautifulSoup(xml_text, 'xml')
        for article in soup.find_all('REUTERS'):
            data_dict['split'].append(article['CGISPLIT'])

            # get_text() always yields a plain str, whereas .string is None
            # for an empty tag or a tag with several children; a None in the
            # corpus would break CountVectorizer further down.
            title_tag = article.TITLE
            data_dict['title'].append(title_tag.get_text() if title_tag is not None else '')
            body_tag = article.BODY
            data_dict['body'].append(body_tag.get_text() if body_tag is not None else '')

            # A missing <TOPICS> element and a <TOPICS> element without <D>
            # children both produce an empty list. The None check must come
            # first and be evaluated on its own: the previous `&` expression
            # evaluated article.TOPICS.D even when article.TOPICS was None.
            topics_tag = article.TOPICS
            if topics_tag is not None:
                topics_list = [topic.get_text() for topic in topics_tag.find_all('D')]
            else:
                topics_list = []
            data_dict['topics'].append(topics_list)
            data_dict['topics_count'].append(len(topics_list))

vectorizer = CountVectorizer(analyzer = 'word', tokenizer = None, preprocessor = None,
    stop_words = None, max_features = 5000)
# The vocabulary is fitted on every article body collected above, regardless
# of its 'split' value.
train_data_features = vectorizer.fit_transform(data_dict['body'])
# fit_transform() has already counted every document, so take each article's
# row from that matrix instead of calling transform() once per article.
# Each entry is a dense array of shape (1, n_features).
data_dict['freq_vec'].extend(
    train_data_features[row].toarray() for row in range(train_data_features.shape[0]))

data = pd.DataFrame.from_dict(data_dict)

In [3]:
# Print a summary of the DataFrame: index range, column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21578 entries, 0 to 21577
Data columns (total 6 columns):
body            21578 non-null object
freq_vec        21578 non-null object
split           21578 non-null object
title           21578 non-null object
topics          21578 non-null object
topics_count    21578 non-null int64
dtypes: int64(1), object(5)
memory usage: 1011.5+ KB


In [4]:
# Preview the first five rows of the assembled DataFrame.
data.head()

Unnamed: 0,body,freq_vec,split,title,topics,topics_count
0,Showers continued throughout the week in the B...,"[[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...",TRAINING-SET,BAHIA COCOA REVIEW,[cocoa],1
1,Standard Oil Co and BP North America Inc said ...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",TRAINING-SET,STANDARD OIL lt;SRD TO FORM FINANCIAL UNIT,[],0
2,Texas Commerce Bancshares Inc's Texas Commerce...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",TRAINING-SET,TEXAS COMMERCE BANCSHARES lt;TCB FILES PLAN,[],0
3,BankAmerica Corp is not under pressure to act ...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",TRAINING-SET,TALKING POINT/BANKAMERICA lt;BAC EQUITY OFFER,[],0
4,The U.S. Agriculture Department reported the f...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,...",TRAINING-SET,NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE,"[grain, wheat, corn, barley, oat, sorghum]",6


In [5]:
# Number of articles that carry at least one topic label. The vectorized
# Series.sum() replaces the builtin sum(), which walks the Series element by
# element in Python; int() keeps the displayed result a plain Python integer.
int((data.topics_count > 0).sum())

11367