# Assignment 2: Document Classification Pipeline

## Reuters data set

In [2]:
!wget -N http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz

--2021-05-03 08:41:10--  http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz
Resolving kdd.ics.uci.edu (kdd.ics.uci.edu)... 128.195.1.86
Connecting to kdd.ics.uci.edu (kdd.ics.uci.edu)|128.195.1.86|:80... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘reuters21578.tar.gz’ not modified on server. Omitting download.



In [6]:
!mkdir reuters_data

mkdir: cannot create directory ‘reuters_data’: File exists


In [7]:
!tar -xzf reuters21578.tar.gz -C reuters_data

In [8]:
!ls reuters_data

all-exchanges-strings.lc.txt	    README.txt	   reut2-007.sgm  reut2-015.sgm
all-orgs-strings.lc.txt		    reut2-000.sgm  reut2-008.sgm  reut2-016.sgm
all-people-strings.lc.txt	    reut2-001.sgm  reut2-009.sgm  reut2-017.sgm
all-places-strings.lc.txt	    reut2-002.sgm  reut2-010.sgm  reut2-018.sgm
all-topics-strings.lc.txt	    reut2-003.sgm  reut2-011.sgm  reut2-019.sgm
cat-descriptions_120396.txt	    reut2-004.sgm  reut2-012.sgm  reut2-020.sgm
feldman-cia-worldfactbook-data.txt  reut2-005.sgm  reut2-013.sgm  reut2-021.sgm
lewis.dtd			    reut2-006.sgm  reut2-014.sgm  test.txt


### Parsing

In [371]:
import sklearn
import numpy as np
import os
import re

In [372]:
# Make list of relevant files
files = []
for file in os.listdir('reuters_data/'):
    if file.startswith('reut2'):
        files.append(file)

# Split files into articles
texts = []
for file in files:    
    with open('reuters_data/'+file, 'r', errors='ignore') as infile:
        raw_text = ''
        for line in infile:
            raw_text += line
            if line == '</REUTERS>\n':
                raw_text = raw_text.replace('\n', ' ')
                texts.append(raw_text)
                raw_text = ''


In [373]:
# Create lists of categories
topics = []
places = []
people = []
orgs = []
exchanges = []
companies = []
body = []

for i in range(len(texts)):
    topic = re.search('<TOPICS>([\w\W]*)<\/TOPICS>', texts[i])
    topics.append(topic.group(1).replace('<D>', '').replace('</D>', ' ').split())
    
    place = re.search('<PLACES>([\w\W]*)<\/PLACES>', texts[i])
    places.append(place.group(1).replace('<D>', '').replace('</D>', ' ').split())
    
    peopl = re.search('<PEOPLE>([\w\W]*)<\/PEOPLE>', texts[i])
    people.append(peopl.group(1).replace('<D>', '').replace('</D>', ' ').split())
    
    org = re.search('<ORGS>([\w\W]*)<\/ORGS>', texts[i])
    orgs.append(org.group(1).replace('<D>', '').replace('</D>', ' ').split())
    
    exchange = re.search('<EXCHANGES>([\w\W]*)<\/EXCHANGES>', texts[i])
    exchanges.append(exchange.group(1).replace('<D>', '').replace('</D>', ' ').split())
    
    company = re.search('<COMPANIES>([\w\W]*)<\/COMPANIES>', texts[i])
    companies.append(company.group(1).replace('<D>', '').replace('</D>', ' ').split())
    
    bod = re.search('<BODY>([\w\W]*)<\/BODY>', texts[i])
    title = re.search('<TITLE>([\w\W]*)<\/TITLE>', texts[i])
    try:
        body.append(title.group(1).lower() + ' ' + bod.group(1))
    except:
        body.append('')
                      

In [374]:
# Assign labels for each article to y
y_raw = []
for i in range(len(topics)):
    categories = [topics, places, people, orgs, exchanges, companies]
    y_topics = []
    for category in categories:
        try:
            y_topics.append(category[i][0])
        except:
            continue
    y_raw.append(y_topics)

# Assign raw texts to X
X_raw = body

In [375]:
# Calculate number of times each topic referenced
counts = {}
for elem in y_raw:
    for topic in elem:
        if topic not in counts:
            counts[topic] = 1
        else:
            counts[topic] += 1
            
# Create list of topics referenced fewer than 10 times             
topics = []
for topic in counts:
    if counts[topic] < 10:
        topics.append(topic)

# Remove infrequently mentioned topics from y
for topic in topics:
    for elem in y_raw:
        if topic in elem:
            elem.remove(topic)

# Remove 'articles' that have no body and no labels            
for i, label in enumerate(y_raw):
    if label == [] and X_raw[i] == '':
        del y_raw[i]
        del X_raw[i]
            
assert len(X_raw) == len(y_raw)

### Vectorizing and binarizing

In [376]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(X_raw)

In [377]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y_raw)

### Train/test split

In [378]:
# Train/test split
from sklearn.model_selection import train_test_split
X, X_test, y, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
assert X_test.shape[0] == y_test.shape[0]

## Probabilistic model

In [379]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier

prob_clf = MultiOutputClassifier(DecisionTreeClassifier(min_samples_split=20, max_depth=50)).fit(X, y)

In [380]:
from sklearn import metrics
predictions = prob_clf.predict(X_test)
print('Precision:', metrics.precision_score(y_test, predictions, average='micro'))
print('Recall:', metrics.recall_score(y_test, predictions, average='micro'))
print('F-score:', metrics.f1_score(y_true=y_test, y_pred=predictions, average='micro'))

Precision: 0.7827949901120633
Recall: 0.7527733755942948
F-score: 0.7674907093229923


## Non-probabilistic model

In [381]:
from sklearn.linear_model import PassiveAggressiveClassifier

non_prob_clf = MultiOutputClassifier(PassiveAggressiveClassifier(loss='squared_hinge')).fit(X, y)

In [382]:
from sklearn import metrics
predictions = non_prob_clf.predict(X_test)
print('Precision:', metrics.precision_score(y_test, predictions, average='micro'))
print('Recall:', metrics.recall_score(y_test, predictions, average='micro'))
print('F-score:', metrics.f1_score(y_true=y_test, y_pred=predictions, average='micro'))

Precision: 0.8903588256614715
Recall: 0.7786053882725832
F-score: 0.8307406154886708


## 20 Newsgroups

In [1]:
from sklearn.datasets import fetch_20newsgroups_vectorized

# Training set
newsgroups_train = fetch_20newsgroups_vectorized(subset='train')
X = newsgroups_train['data']
y = newsgroups_train['target']

# Test set
newsgroups_test = fetch_20newsgroups_vectorized(subset='test')
X_test = newsgroups_test['data']
y_test = newsgroups_test['target']

In [3]:
print(X[:10], y[:10])

  (0, 5022)	0.017109647770728872
  (0, 5886)	0.017109647770728872
  (0, 6214)	0.017109647770728872
  (0, 6216)	0.017109647770728872
  (0, 6281)	0.017109647770728872
  (0, 6286)	0.017109647770728872
  (0, 6324)	0.017109647770728872
  (0, 6331)	0.017109647770728872
  (0, 6403)	0.017109647770728872
  (0, 11391)	0.017109647770728872
  (0, 13930)	0.017109647770728872
  (0, 15094)	0.017109647770728872
  (0, 15251)	0.017109647770728872
  (0, 15530)	0.017109647770728872
  (0, 16731)	0.017109647770728872
  (0, 20228)	0.017109647770728872
  (0, 26214)	0.017109647770728872
  (0, 26806)	0.017109647770728872
  (0, 27436)	0.017109647770728872
  (0, 27618)	0.017109647770728872
  (0, 27645)	0.017109647770728872
  (0, 27901)	0.017109647770728872
  (0, 28012)	0.05132894331218662
  (0, 28146)	0.41063154649749295
  (0, 28421)	0.034219295541457743
  :	:
  (9, 96162)	0.0842151921066519
  (9, 97133)	0.0842151921066519
  (9, 100721)	0.0842151921066519
  (9, 100796)	0.0842151921066519
  (9, 102607)	0.084215192

### Probabilistic model

In [387]:
prob_clf.estimator.fit(X, y)
print('Accuracy:', prob_clf.estimator.score(X_test, y_test))

Accuracy: 0.50570897503983


### Non-probabilistic model

In [388]:
non_prob_clf.estimator.fit(X, y)
print('Accuracy:', non_prob_clf.estimator.score(X_test, y_test))

Accuracy: 0.8141263940520446
