In [22]:
import nltk
import json
import pandas as pd
from collections import defaultdict

In [2]:
# Download the NLTK 'reuters' corpus package so the corpus reader used below can load it.
nltk.download('reuters')

[nltk_data] Downloading package reuters to /Users/gautham/nltk_data...


True

In [3]:
from nltk.corpus import reuters

In [10]:
# Collect every category name the corpus defines and report how many there are.
labels = reuters.categories()
category_total = len(labels)
print('Number of categories - ', category_total)

Number of categories -  90


> There are **90** different categories in the dataset

## Prepare a DataFrame

In [11]:
# File ids of the corpus documents; each id is later used to fetch the raw text and categories.
doc_ids = reuters.fileids()

In [14]:
# One record per document: its raw text plus the list of categories assigned to it.
df_list = [
    {'text': reuters.raw(doc_id), 'labels': reuters.categories(doc_id)}
    for doc_id in doc_ids
]
df = pd.DataFrame(df_list)

In [15]:
# Display the assembled frame (columns: text, labels; one row per document).
df

Unnamed: 0,text,labels
0,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...,[trade]
1,CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...,[grain]
2,JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...,"[crude, nat-gas]"
3,THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...,"[corn, grain, rice, rubber, sugar, tin, trade]"
4,INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...,"[palm-oil, veg-oil]"
...,...,...
10783,U.K. MONEY MARKET SHORTAGE FORECAST REVISED DO...,"[interest, money-fx]"
10784,KNIGHT-RIDDER INC &lt;KRN> SETS QUARTERLY\n Q...,[earn]
10785,TECHNITROL INC &lt;TNL> SETS QUARTERLY\n Qtly...,[earn]
10786,NATIONWIDE CELLULAR SERVICE INC &lt;NCEL> 4TH ...,[earn]


In [60]:
# Export the text/labels frame to CSV without the index column.
# NOTE(review): the same path 'multi_label_df.csv' is written again by
# `multi_df.to_csv(...)` further down, so on a full run this export is
# overwritten — confirm whether this cell is still needed.
df.to_csv('multi_label_df.csv', index=False)

## Analysis

In [16]:
# Number of categories attached to each document.
df['num_labels'] = df['labels'].map(len)

In [20]:
# Distribution of label counts: how many documents carry 1, 2, 3, ... labels.
num_label_frequencies = df['num_labels'].value_counts()
value_counts_df = num_label_frequencies.reset_index()
value_counts_df

Unnamed: 0,num_labels,count
0,1,9160
1,2,1173
2,3,255
3,4,91
4,5,52
5,6,27
6,7,9
7,8,7
8,9,5
9,10,3


In [21]:
# Share of documents that carry exactly one label.
is_single_label = value_counts_df['num_labels'] == 1
value_counts_df.loc[is_single_label, 'count'] / value_counts_df['count'].sum()

0    0.849092
Name: count, dtype: float64

> We see that around **85 percent** of the documents have a single label

Assumptions — the task can be framed in either of two ways:
1. It can be treated as a multi-class, single-label classification problem for simplicity
2. It can be treated as a multi-label classification problem

## Data Preparation

For the records that have multiple labels, convert them to a single label using one of the approximation techniques below
1. The simplest way is to take the first label as the primary label for the record
2. We can pick the label that occurs most often in the dataset. This makes the learning task simpler for the model
3. We can pick the least frequently occurring label to add support to minority classes. This can make the learning task more difficult

> By converting the multi-label data to single-label, we may lose valuable information, and even a well-generalized model may behave unpredictably on records that originally had multiple labels. But since this exercise is meant to demonstrate technical capabilities, we continue with the approximation

In [25]:
# Count how many documents each label is attached to across the whole frame.
# The column is iterated directly rather than indexed row by row with .iloc,
# and the loop variable is named `doc_labels` so the notebook-level `labels`
# (the corpus category list built earlier) is not overwritten by the last row.
label_counts = defaultdict(int)
for doc_labels in df['labels']:
    for label in doc_labels:
        label_counts[label] += 1

In [32]:
# Labels ordered from most to least frequent (the sort is stable, so labels with
# equal counts keep their first-seen order), plus the same ordering as a
# label -> count mapping.
sorted_label_keys = sorted(label_counts, key=label_counts.get, reverse=True)
sorted_label_counts = {name: label_counts[name] for name in sorted_label_keys}

In [34]:
# 1-based frequency rank per label: rank 1 is the most frequent label.
label_rank_lookup = {
    name: position for position, name in enumerate(sorted_label_keys, start=1)
}

In [38]:
def get_most_occuring_label(label_list, lookup=None):
    """Return the label from ``label_list`` with the lowest (best) rank.

    Args:
        label_list: Iterable of label names attached to one document.
        lookup: Mapping of label -> rank, where a lower rank means a more
            frequent label. When omitted, the notebook-level
            ``label_rank_lookup`` is used. It is resolved at call time, so
            re-running the cell that builds the lookup is picked up without
            redefining this function.

    Returns:
        The label with the smallest rank, or ``None`` when ``label_list`` is
        empty. If several labels share the smallest rank, the first one wins.

    Raises:
        KeyError: If a label in ``label_list`` is missing from ``lookup``.
    """
    if lookup is None:
        lookup = label_rank_lookup
    # min() imposes no rank ceiling; the earlier hard-coded starting rank of 100
    # returned None whenever every candidate label was ranked 100 or worse.
    return min(label_list, key=lambda label: lookup[label], default=None)

In [45]:
# Work on an independent copy so the single-label columns do not touch `df`
# (DataFrame.copy() is a deep copy by default).
single_label_df = df.copy()

In [47]:
# Collapse each document's label list to its single most frequent label.
single_label_df['top_label'] = single_label_df['labels'].apply(get_most_occuring_label)

In [48]:
# Display the frame with the derived `top_label` column next to the original label lists.
single_label_df

Unnamed: 0,text,labels,num_labels,top_label
0,ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...,[trade],1,trade
1,CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...,[grain],1,grain
2,JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...,"[crude, nat-gas]",2,crude
3,THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...,"[corn, grain, rice, rubber, sugar, tin, trade]",7,grain
4,INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...,"[palm-oil, veg-oil]",2,veg-oil
...,...,...,...,...
10783,U.K. MONEY MARKET SHORTAGE FORECAST REVISED DO...,"[interest, money-fx]",2,money-fx
10784,KNIGHT-RIDDER INC &lt;KRN> SETS QUARTERLY\n Q...,[earn],1,earn
10785,TECHNITROL INC &lt;TNL> SETS QUARTERLY\n Qtly...,[earn],1,earn
10786,NATIONWIDE CELLULAR SERVICE INC &lt;NCEL> 4TH ...,[earn],1,earn


In [55]:
# Documents per top label; keep only the labels backed by more than 50 documents.
label_count_df = single_label_df['top_label'].value_counts().reset_index()
has_enough_docs = label_count_df['count'] > 50
filter_labels = label_count_df.loc[has_enough_docs, 'top_label']

In [57]:
# Keep only the rows whose top label survived the frequency filter.
keep_row = single_label_df['top_label'].isin(filter_labels)
filtered_single_label_df = single_label_df.loc[keep_row]

In [59]:
# Export the filtered single-label frame to CSV without the index column.
filtered_single_label_df.to_csv('single_label_df.csv', index=False)

## Multi Label Data Preparation

In [73]:
# Independent copy of `df` for the multi-label preparation
# (DataFrame.copy() is a deep copy by default).
multi_df = df.copy()

In [74]:
# Per-label document counter; unseen labels start at 0.
multi_label_count = defaultdict(int)

In [75]:
# Tally how many documents each label is attached to.
# The column is iterated directly rather than indexed row by row with .iloc,
# and the loop variable is named `doc_labels` so the notebook-level `labels`
# (the corpus category list built earlier) is not overwritten by the last row.
for doc_labels in multi_df['labels']:
    for label in doc_labels:
        multi_label_count[label] += 1

In [76]:
# Labels attached to more than 50 documents, mapped to their document counts.
filtered_multi_label_dict = {
    name: doc_count
    for name, doc_count in multi_label_count.items()
    if doc_count > 50
}

In [78]:
# For every document, keep only the labels that passed the frequency filter.
# The column is built in one assignment. The earlier per-row
# `multi_df['filtered_labels'].iloc[i] = ...` was chained assignment: pandas
# warns about it on every row, and under Copy-on-Write it never writes back to
# `multi_df`, which would leave the column full of None.
multi_df['filtered_labels'] = multi_df['labels'].apply(
    lambda doc_labels: [item for item in doc_labels if item in filtered_multi_label_dict]
)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [80]:
# Export the multi-label frame (including `filtered_labels`) to CSV without the index column.
# NOTE(review): this writes to the same path as the earlier `df.to_csv('multi_label_df.csv', ...)`
# cell and therefore replaces that file — confirm this is intended.
multi_df.to_csv('multi_label_df.csv', index=False)

In [81]:
# Show which labels survived the "more than 50 documents" filter.
filtered_multi_label_dict.keys()

dict_keys(['trade', 'grain', 'crude', 'nat-gas', 'corn', 'rice', 'sugar', 'veg-oil', 'ship', 'coffee', 'wheat', 'gold', 'acq', 'interest', 'money-fx', 'copper', 'ipi', 'carcass', 'livestock', 'oilseed', 'soybean', 'earn', 'bop', 'gas', 'jobs', 'cpi', 'gnp', 'dlr', 'yen', 'cocoa', 'cotton', 'money-supply', 'iron-steel', 'alum', 'reserves', 'barley'])