In [2]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [3]:
# Load the raw categories table from a path relative to the working directory.
# The trailing commented-out filter would keep only rows whose
# short_description is longer than 10 characters.
df = pd.read_csv('data/tagging/categories.csv')#.loc[lambda x: x.short_description.str.len() > 10]

FileNotFoundError: File b'data/tagging/categories.csv' does not exist

The task is to predict not one but several categories for each observation (multi-label classification).
One solution is to train a separate binary classifier for each unique category (a one-vs-rest scheme).
This is fairly simple to do with scikit-learn, but we need to write our own classifier class.

Task:

1. Write a custom classifier to solve this.
2. Evaluate its results (what measure could be good for comparing sets?)


Hints:

You can keep your classifiers in a dictionary that maps `class name -> Classifier`.
In both `fit` and `predict` you need to iterate over all unique classes.

In [80]:
import ast

class OneVsRestClassifier(BaseEstimator, ClassifierMixin):
    """Multi-label classifier that fits one binary estimator per category.

    Every target entry describes the categories of one sample. An entry may
    be a collection of category names or the string repr of such a list
    (e.g. "['News', 'Finance']"); strings are parsed with
    ``ast.literal_eval``.

    Parameters
    ----------
    base_estimator : estimator
        Binary classifier prototype. It is cloned once per category, so the
        object passed in is never fitted itself.

    Attributes
    ----------
    classes : ndarray
        Sorted unique category names found in the targets given to ``fit``.
    estimators : dict
        Maps each category name to the clone fitted for that category.
    """

    def __init__(self, base_estimator):
        self.base_estimator = base_estimator
        self.estimators = {}

    def fit(self, X, y=None, sample_weight=None):
        """Fit one clone of ``base_estimator`` per category found in ``y``.

        Parameters
        ----------
        X : feature matrix handed unchanged to every binary estimator.
        y : iterable with one label collection (or its string repr) per row.
        sample_weight : accepted for signature compatibility; not used.

        Returns
        -------
        self
        """
        # Parse the targets once; the helpers below pass parsed entries through
        # untouched, so the string parsing is not repeated for every category.
        label_sets = self._parse_labels(y)
        self.classes = self._extract_classes(label_sets)
        # Start from an empty mapping so a refit never keeps estimators for
        # categories that are absent from the new targets.
        self.estimators = {}
        print("Fitting")
        for cl in self.classes:
            est_ = clone(self.base_estimator)
            est_.fit(X, self._isin(label_sets, cl))
            self.estimators[cl] = est_
        return self

    def predict(self, X):
        """Return, for every row of ``X``, the list of predicted categories.

        A category is included for a row when that category's binary
        estimator predicts 1 for the row. Rows with no positive prediction
        get an empty list.
        """
        print("Predicting")
        outputs = [[] for _ in range(X.shape[0])]
        for cl in self.classes:
            true_indices = np.where(self.estimators[cl].predict(X) == 1)[0]
            for i in true_indices:
                outputs[i].append(cl)
        return outputs

    @staticmethod
    def _parse_labels(y):
        """Return the targets as a list, parsing entries that are strings."""
        return [ast.literal_eval(item) if isinstance(item, str) else item
                for item in y]

    def _extract_classes(self, y_l):
        """Return the sorted unique category names that occur in ``y_l``."""
        # Use the argument itself; reading a global ``y`` here would take the
        # classes from whatever data happens to live in the notebook namespace.
        y_l = self._parse_labels(y_l)
        classes = np.unique([item for sub in y_l for item in sub])
        return classes

    def _isin(self, y, cl):
        """Return a 0/1 integer array marking the entries that contain ``cl``."""
        y_l = self._parse_labels(y)
        # The builtin ``int`` replaces the ``np.int`` alias removed from NumPy.
        return np.array([cl in item for item in y_l], dtype=int)
        

#    def _isin(self, ys, cl):
#        return np.array([cl in y for y in ys], dtype=np.int)
    
    
def f1(true, pred):
    """Return the F1 score between a true and a predicted label collection.

    Both arguments are treated as sets, so duplicated labels count once.

    Parameters
    ----------
    true : iterable of hashable labels that are actually correct.
    pred : iterable of hashable labels that were predicted.

    Returns
    -------
    0 when either collection is empty or they share no label, otherwise the
    harmonic mean of precision and recall as a float.
    """
    true_set, pred_set = set(true), set(pred)
    # Precision is undefined without predictions and recall is undefined
    # without true labels; score both situations as 0 instead of dividing by 0.
    if not true_set or not pred_set:
        return 0
    tp = len(true_set & pred_set)
    if tp == 0:
        return 0
    precision = tp / len(pred_set)
    recall = tp / len(true_set)
    return 2 * precision * recall / (precision + recall)
    
    
def model_definition_words() -> Pipeline:
    """Build the word-level multi-label model.

    The pipeline turns text into binary word-presence features (words seen in
    fewer than 5 documents are dropped) and feeds them to a one-vs-rest
    wrapper around a random forest.
    """
    vectorizer = CountVectorizer(min_df=5, binary=True, analyzer='word')
    forest = RandomForestClassifier(
        n_estimators=100,
        min_samples_leaf=10,
        min_samples_split=20,
        n_jobs=-2,
    )
    return make_pipeline(vectorizer, OneVsRestClassifier(base_estimator=forest))


def validate_model_multiple_outputs(n_samples=10000, random_state=1):
    """Train the word model on a subset of the data and print its mean F1.

    Parameters
    ----------
    n_samples : number of leading rows of the data set to use.
    random_state : seed forwarded to ``train_test_split``.

    The score printed is the mean, over test observations, of the set-based
    F1 between the true and the predicted labels.
    """
    print('Loading data')
    X, y = load_data()
    X_tr, X_te, y_tr, y_te = train_test_split(
        X[:n_samples], y[:n_samples], random_state=random_state)
    est = model_definition_words()
    est.fit(X_tr, y_tr)
    preds = est.predict(X_te)
    # Targets may arrive as string reprs of lists. Scoring the raw strings
    # would compare characters against category names, so parse them first.
    true_labels = [ast.literal_eval(t) if isinstance(t, str) else t for t in y_te]
    # An observation without true labels scores 0 here, because recall is
    # undefined for it; this also keeps f1 from dividing by zero.
    mean_f1 = np.array([f1(true, pred) if len(true) else 0
                        for true, pred in zip(true_labels, preds)]).mean()
    print("Multiple Labels F1", mean_f1)
    
#validate_model_multiple_outputs()


In [85]:
# Hold out a test split from the first 10,000 rows, then fit the word model
# on the training part and predict labels for the held-out part.
X_tr, X_te, y_tr, y_te = train_test_split(X[:10000], y[:10000], random_state=1)
est = model_definition_words().fit(X_tr, y_tr)
preds = est.predict(X_te)


Fitting
Predicting


In [84]:
def isin(y_l, cl):
    """Return a 0/1 integer array marking which entries of ``y_l`` contain ``cl``.

    Parameters
    ----------
    y_l : iterable of label collections, one per sample.
    cl : the label to look for.
    """
    # The builtin ``int`` replaces the ``np.int`` alias removed from NumPy.
    return np.array([cl in item for item in y_l], dtype=int)


# Baseline for a single category: binary word features into a random forest.
est = make_pipeline(CountVectorizer(min_df=5, binary=True, analyzer='word'),
                    RandomForestClassifier())

# Parse the string-encoded label lists, fit on the 'Consulting' indicator and
# show the predictions for the training rows.
y_l = list(map(ast.literal_eval, y_tr))
est.fit(X_tr, isin(y_l, 'Consulting')).predict(X_tr)

array([1, 0, 0, ..., 0, 0, 0])

In [79]:
# Fit one binary random forest per category found in the training labels.
# NOTE(review): other cells pass X_tr through CountVectorizer before the
# forest; confirm X_tr is already numeric when this cell runs.
y_l = [ast.literal_eval(x) for x in y_tr]
classes = np.unique([item for sub in y_l for item in sub])
for cl in classes:
    est_ = RandomForestClassifier()
    # Build the target from the parsed training labels so it has one entry per
    # row of X_tr; the raw global `y` has a different length and holds strings.
    est_.fit(X_tr, isin(y_l, cl))


array(['3D Printing', '3D Technology', 'Accounting', 'Ad Exchange',
       'Ad Network', 'Ad Targeting', 'Advanced Materials',
       'Adventure Travel', 'Advertising', 'Advertising Exchanges',
       'Advertising Platforms', 'Advice', 'Aerospace',
       'Affiliate Marketing', 'AgTech', 'Agriculture',
       'Air Transportation', 'Alternative Medicine', 'Alumni',
       'Analytics', 'Android', 'Angel Investment', 'Animal Feed',
       'Animation', 'App Discovery', 'App Marketing',
       'Application Performance Management', 'Application Platforms',
       'Apps', 'Architecture', 'Art', 'Artificial Intelligence',
       'Asset Management', 'Auctions', 'Audio', 'Augmented Reality',
       'Auto Insurance', 'Automotive', 'Autonomous Vehicles', 'B2B',
       'B2C', 'Baby', 'Banking', 'Battery', 'Beauty', 'Big Data',
       'Big Data Analytics', 'Bike', 'Billing', 'Biometrics', 'Biopharma',
       'Biotechnology', 'Bitcoin', 'Blogging Platforms', 'Boating',
       'Brand Marketing', 'Brew

In [70]:
# Spot check: is 'Consulting' contained in the first entry of `m`?
# NOTE(review): `m` is not defined in the visible cells; presumably it holds
# the parsed label lists. Confirm.
'Consulting' in m[0]

True

In [66]:
# Per-entry membership flags for 'Consulting' across `m`: the same
# comprehension shape the binary target for one category is built from.
['Consulting' in item for item in m]

[True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 F

In [32]:
# Load the data, hold out a test split from the first 10,000 rows, then fit
# the word model on the training part and predict the held-out part.
X, y = load_data()
X_tr, X_te, y_tr, y_te = train_test_split(X[:10000], y[:10000], random_state=1)
est = model_definition_words().fit(X_tr, y_tr)
preds = est.predict(X_te)


                                     website  \
0                           http://iFans.com   
1                    http://www.braingig.com   
2                 https://www.twinelabs.com/   
3                   http://www.SumaGreen.com   
4  http://worldstartupreport.strikingly.com/   

                                         categories  \
0                                          ['News']   
1                         ['Non Profit', 'Finance']   
2                                                []   
3                                 ['Biotechnology']   
4  ['Market Research', 'CleanTech', 'Clean Energy']   

                                   short_description  
0  iFans is a community-based forum and portal th...  
1              Connecting grant funders and seekers.  
2  Twine is a powerful platform for internal mobi...  
3  SumaGreen is an agro firm committed to enablin...  
4  World Startup Report is a social mission to do...  
Fitting
Predicting


In [41]:

# Membership test for 'a' in every entry of y_te.
# The `set(y_te[0])` cell in this notebook returns single characters, so the
# entries are strings and this is a substring test on their text, not a test
# for a category named 'a'.
np.array(['a' in x for x in y_te])

array([ True,  True, False, ...,  True,  True,  True])

In [34]:
# 0/1 indicator per sample: 1 where `cl` is contained in that sample's labels.
# The builtin `int` replaces the `np.int` alias removed from NumPy.
np.array([cl in y for y in ys], dtype=int)

array([0.38095238, 0.625     , 0.66666667, ..., 0.13888889, 0.08695652,
       0.43636364])

In [35]:
def f1(true, pred):
    """Return the F1 score between a true and a predicted label collection.

    Both arguments are treated as sets, so duplicated labels count once.

    Parameters
    ----------
    true : iterable of hashable labels that are actually correct.
    pred : iterable of hashable labels that were predicted.

    Returns
    -------
    0 when either collection is empty or they share no label, otherwise the
    harmonic mean of precision and recall as a float.
    """
    true_set, pred_set = set(true), set(pred)
    # Precision is undefined without predictions and recall is undefined
    # without true labels; score both situations as 0 instead of dividing by 0.
    if not true_set or not pred_set:
        return 0
    tp = len(true_set & pred_set)
    if tp == 0:
        return 0
    precision = tp / len(pred_set)
    recall = tp / len(true_set)
    return 2 * precision * recall / (precision + recall)


array(["['SaaS', 'Computer', 'Software']", "['Finance']", '[]', ...,
       "['Fitness', 'Health Care', 'Lifestyle', 'Gamification', 'Software']",
       "['Electrical Distribution', 'Electronics', 'Test and Measurement']",
       "['E-Commerce', 'Electronics', 'Industrial']"], dtype=object)

In [40]:
# Inspect the first raw test target.
# NOTE(review): the cell read `pred[]`, which is a syntax error. `y_te[0]` is
# assumed because the recorded output is a string holding a list repr; confirm.
y_te[0]

"['SaaS', 'Computer', 'Software']"

In [39]:
# set() on a raw target yields individual characters, because the target is
# the string repr of a list rather than a list. The labels therefore need
# parsing before any set-based scoring.
set(y_te[0])

{' ',
 "'",
 ',',
 'C',
 'S',
 '[',
 ']',
 'a',
 'e',
 'f',
 'm',
 'o',
 'p',
 'r',
 't',
 'u',
 'w'}

In [31]:
#df.groupby('categories').size()

<a>Show answers</a>

<div class='spoiler'>
class OneVsRestClassifier(BaseEstimator, ClassifierMixin):
    """Multi-label classifier that fits one binary estimator per category.

    Every target entry is expected to be an iterable of category names for
    one sample.

    Parameters
    ----------
    base_estimator : estimator
        Binary classifier prototype. It is cloned once per category.

    Attributes
    ----------
    classes : list
        Unique category names, in order of first appearance in the targets.
    estimators : dict
        Maps each category name to the clone fitted for that category.

    Note: ``tqdm`` must be in scope for the progress bars, e.g. via
    ``from tqdm import tqdm``.
    """

    def __init__(self, base_estimator):
        self.base_estimator = base_estimator
        self.estimators = {}

    def fit(self, X, y=None, sample_weight=None):
        """Fit one clone of ``base_estimator`` per category found in ``y``.

        ``sample_weight`` is accepted for signature compatibility and is not
        used. Returns ``self``.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # class order is the same on every run (plain set order is not) and
        # no itertools import is needed.
        self.classes = list(dict.fromkeys(
            label for labels in y for label in labels))
        # Start from an empty mapping so a refit never keeps estimators for
        # categories that are absent from the new targets.
        self.estimators = {}
        print("Fitting")
        for cl in tqdm(self.classes):
            est_ = clone(self.base_estimator)
            est_.fit(X, self._isin(y, cl))
            self.estimators[cl] = est_
        return self

    def predict(self, X):
        """Return, for every row of ``X``, the list of predicted categories.

        A category is included for a row when that category's binary
        estimator predicts 1 for the row.
        """
        print("Predicting")
        outputs = [[] for _ in range(X.shape[0])]
        for cl in tqdm(self.classes):
            true_indices = np.where(self.estimators[cl].predict(X) == 1)[0]
            for i in true_indices:
                outputs[i].append(cl)
        return outputs

    def _isin(self, ys, cl):
        """Return a 0/1 integer array marking the entries that contain ``cl``."""
        # The builtin ``int`` replaces the ``np.int`` alias removed from NumPy.
        return np.array([cl in y for y in ys], dtype=int)
    
    
def f1(true, pred):
    """Return the F1 score between a true and a predicted label collection.

    Both arguments are treated as sets, so duplicated labels count once.

    Parameters
    ----------
    true : iterable of hashable labels that are actually correct.
    pred : iterable of hashable labels that were predicted.

    Returns
    -------
    0 when either collection is empty or they share no label, otherwise the
    harmonic mean of precision and recall as a float.
    """
    true_set, pred_set = set(true), set(pred)
    # Precision is undefined without predictions and recall is undefined
    # without true labels; score both situations as 0 instead of dividing by 0.
    if not true_set or not pred_set:
        return 0
    tp = len(true_set & pred_set)
    if tp == 0:
        return 0
    precision = tp / len(pred_set)
    recall = tp / len(true_set)
    return 2 * precision * recall / (precision + recall)
    
    
def model_definition_words() -> Pipeline:
    """Build the word-level multi-label model.

    The pipeline turns text into binary word-presence features (words seen in
    fewer than 5 documents are dropped) and feeds them to a one-vs-rest
    wrapper around a random forest.
    """
    vectorizer = CountVectorizer(min_df=5, binary=True, analyzer='word')
    forest = RandomForestClassifier(
        n_estimators=100,
        min_samples_leaf=10,
        min_samples_split=20,
        n_jobs=-2,
    )
    return make_pipeline(vectorizer, OneVsRestClassifier(base_estimator=forest))


def validate_model_multiple_outputs():
    """Train the word model on the first 10,000 rows and print its mean F1.

    The score is the mean, over test observations, of ``f1`` applied to the
    true and the predicted labels.
    """
    print('Loading data')
    X, y = load_data()
    X_tr, X_te, y_tr, y_te = train_test_split(X[:10000], y[:10000], random_state=1)
    model = model_definition_words()
    model.fit(X_tr, y_tr)
    predicted = model.predict(X_te)
    per_sample = np.array([f1(true, pred) for true, pred in zip(y_te, predicted)])
    print("Multiple Labels F1", per_sample.mean())
    
# Run the full load / train / evaluate cycle and print the mean per-sample F1.
validate_model_multiple_outputs()
</div>