Sometimes your input data can be nested with more difficult structure than a simple table or a matrix.

In such cases it is sometime useful to shift mental orientation to analyze and extract information froms rows rather then non-defined columns.

In [1]:
from utils import css_from_file
css_from_file('style/style.css')

In [2]:
!pip install nltk



In [3]:
import json
import numpy as np
import pprint
from nltk import download, word_tokenize

download('punkt')

[nltk_data] Downloading package punkt to /home/diego/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
with open("data/companies/companies.json") as dataf:
    data = [json.loads(line) for line in dataf]

An example of deeply nested data with various data types:

Exercise:

1. Name variable types
2. What do you do with lists, geo location?
3. What do you do with counts?

In [34]:
list(filter(lambda x: x['industries'] == ['Information Technology and Services'], data))

[{'description': '2add develops and deploys custom made software. Main tool used is Visual Studio / VB. Crystal reports and solutions in Excel en MS Access also belong to the portfolio. 2add also advises in IT projects and can tako on management in software projects. As an additional service the supply, installation and maintenance of IT infra structures for small companies are offered as well as the creation of WEB sites. 2add is convinced that for most problems the best current software solution is a SaaS or Intranet applications. It releases from all major hardware issues: back up, installation and maintenance, updates and scaling. Further it reduces costs and improves manageability and in most cases offers short learning curves',
  'domain': '2add.nl',
  'extension': {'address': 'Nijmegen Area, Netherlands',
   'geo_location': {'country': 'The Netherlands',
    'formatted_address': 'Nijmegen, Netherlands',
    'location': [52.2379891, 5.53460738161551],
    'raw': 'Nijmegen Area, N

In [6]:
pprint.pprint(data[7])

{'address': {'city': 'Seattle',
             'country': 'United States',
             'postalCode': '98134',
             'raw': '624 South Lander St\n'
                    'Suite 28\n'
                    'Seattle,\n'
                    'WA\n'
                    '98134\n'
                    'United States',
             'region': 'WA',
             'street': '624 South Lander St'},
 'description': 'At 36th avenue design|build we are committed to total client '
                'satisfaction. We believe that strong and lasting '
                'relationships built on integrity and trust, earned through '
                'the remodel, is as important as the renovation of your home. '
                'We consider every project an opportunity to participate with '
                'our clients in a unique and artful, design and construction '
                'process. To each of our clients, our commitment remains '
                'consistent: concise communication, integrity, and prid

In [7]:
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
import copy
class SkillFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
        
    def fit(self, x, y = None):
        return self

    def transform(self, x):
        return_list = []
        for company in x:
            dictionary = {}
            
            extension = company.get('extension', [])
            if extension:
                entry_skills = extension.get('skills', [])
                for entry_skill in entry_skills:
                    dictionary[entry_skill['skill']] = entry_skill['count']
            return_list.append(dictionary)
        return return_list



class SelectorFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column
        
    def fit(self, x, y = None):
        return self

    def transform(self, x):
        return_list = []
        for company in x:
            dictionary = {}
            
            values = company.get(self.column, [])
            for value in values:
                dictionary[value] = 1
            return_list.append(dictionary)
        return return_list
    
tt = SkillFeatures() 
print(tt.transform(data[:100]))

[{'Customer Service': 1, 'Leadership': 1, 'Training': 1, 'Business Planning': 1, 'Sales': 1, 'Team Building': 1, 'Management': 1, 'Account Management': 1, 'Marketing': 1, 'Business Development': 1, 'New Business Development': 1, 'Negotiation': 1, 'Sales Management': 1}, {'Marketing': 3, 'Photography': 3, 'Social Media': 3, 'Adobe Creative Suite': 2, 'Social Media Marketing': 2, 'Marketing Communications': 2, 'Microsoft Office': 2, 'Public Relations': 2, 'Event Management': 2, 'Marketing Strategy': 2, 'Fine Art': 2, 'InDesign': 2, 'Customer Service': 1, 'Oil Painting': 1, 'Strategy': 1, 'Freelance Graphics': 1, 'Mac': 1, 'Coaching': 1, 'News Writing': 1, 'Advertising': 1, 'iMovie': 1, 'sports': 1, 'Creative Writing': 1, 'Management': 1, 'Internal Communications': 1, 'Painting': 1, 'Drawing': 1, 'Contemporary Art': 1, 'Graphic Design': 1, 'Digital Marketing': 1, 'Integrated Marketing': 1, 'Copy Editing': 1, 'Magazines': 1, 'Sales': 1, 'Photoshop': 1, 'Art': 1, 'Sports Writing': 1, 'Digit

In [8]:
class ExtractFeature():
    def __init__(self, column):
        self.column = column
        
    def fit(self, x, y = None):
        return self

    def transform(self, x):
        return_list = []
        for company in x:
            value = company[self.column]       
            return_list.append(value)
        return return_list

In [9]:
class ExtractFirstFeature():
    def __init__(self, column):
        self.column = column
        
    def fit(self, x, y = None):
        return self

    def transform(self, x):
        return_list = []
        for company in x:
            value_l = company[self.column]       
            return_list.append(value_l[0] if len(value_l) > 0 else '')
        return return_list

In [10]:
industry_selector = ExtractFirstFeature('industries')

In [9]:
industry_selector.transform(data[:100])

['Real Estate',
 'Marketing and Advertising',
 'Real Estate',
 'Internet',
 'Information Technology and Services',
 'Computer Software',
 'Information Technology and Services',
 'Construction',
 'Package/Freight Delivery',
 'Marketing and Advertising',
 'Computer Games',
 'Internet',
 'Construction',
 'Insurance',
 'International Trade and Development',
 'Information Technology and Services',
 'Law Practice',
 'Marketing/Reklama/Public Relations',
 'Investment Management',
 'Automotive',
 'Chemicals',
 'Health, Wellness and Fitness',
 'Facilities Services',
 'Research',
 'Computer Software',
 'Construction',
 '',
 'Staffing and Recruiting',
 'Printing',
 'Accounting',
 'Design',
 'Computer Software',
 'Health, Wellness and Fitness',
 'Venture Capital & Private Equity',
 'Information Technology and Services',
 'Accounting',
 'Marketing and Advertising',
 'Civic & Social Organization',
 'Think Tanks',
 'Nonprofit Organization Management',
 'Hospitality',
 'Higher Education',
 'Banking',


In [63]:


from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
pipeline = make_union (

    make_pipeline(SelectorFeatures('technologies') , DictVectorizer()),
    #make_pipeline(SelectorFeatures('specialties'), DictVectorizer()),
    make_pipeline(SkillFeatures(),   DictVectorizer()),
    make_pipeline(ExtractFeature('description'), CountVectorizer(min_df=2, max_df=0.8, ngram_range=(1,2)), preprocessor=clean_text2, stop_words=stop_words, 
                                   tokenizer=tok_with_stemmer)
)


TypeError: Unknown keyword arguments: "preprocessor"

In [114]:
Xtr.shape

(10000, 191419)

In [11]:
# write sparsity class here
class SparsityFilter(BaseEstimator, TransformerMixin):
    def __init__(self, min_nnz=None):
        self.min_nnz = min_nnz

    def fit(self, X, y=None):
        self.sparsity = X.getnnz(0)
        return self

    def transform(self, X):
        return X[:, self.sparsity >= self.min_nnz]

In [35]:

import snowballstemmer
import re
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stemmer = snowballstemmer.stemmer('english')

from nltk.tokenize import word_tokenize

def clean_text2(str):
    str = str.lower()
    str = re.sub(r'\b[0-9]+[A-Za-z]+\b', '', str)
    str = re.sub(r'\b[0-9]+\b', '', str)
    str = re.sub(r'\b[A-Za-z]+[0-9]+\b', '', str)
    return str

def stemmed_words(doc):
    return [stemmer.stemWords(w) for w in doc]

def tok_with_stemmer(doc):
    tokenized =  word_tokenize(doc)
    #return tokenized
    return stemmer.stemWords(tokenized)
    

from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
pipeline = make_pipeline(
    make_union (

   #    make_pipeline(SelectorFeatures('technologies') , DictVectorizer()),
   # make_pipeline(SelectorFeatures('specialties'), DictVectorizer()),
    #make_pipeline(SkillFeatures(),   DictVectorizer()),
    make_pipeline(ExtractFeature('description'), CountVectorizer(min_df=1, max_df=0.8, ngram_range=(1,2), preprocessor=clean_text2, stop_words=stop_words, 
                                   tokenizer=tok_with_stemmer))

    ), 
    SparsityFilter(5),
    SGDClassifier()
)
 




In [36]:
X, y = data, industry_selector.transform(data)

In [37]:
Xtr = pipeline.fit_transform(X)

ValueError: bad input shape ()

In [120]:
Xtr.shape

(10000, 191419)

In [104]:
pipeline.fit(X,y)

Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('selectorfeatures', SelectorFeatures(column='technologies')), ('dictvectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True))])),...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))])

In [None]:
from sklearn.cross_validation import cross_val_predict
preds = cross_val_predict(pipeline, 
                          X, 
                          y, 
                          cv=5, n_jobs=-1, verbose=True)
from sklearn.metrics import classification_report
print(classification_report(y, preds))

In [27]:
print(classification_report(y, preds))

                                        precision    recall  f1-score   support

                                             0.56      0.45      0.50       186
                            Accounting       0.66      0.40      0.49       106
                     Airlines/Aviation       0.32      0.24      0.28        37
        Alternative Dispute Resolution       0.00      0.00      0.00         8
                  Alternative Medicine       0.00      0.00      0.00         7
                             Animation       0.12      0.18      0.14        11
                     Apparel & Fashion       0.26      0.19      0.22        95
               Architecture & Planning       0.76      0.40      0.52       132
                          Architektura       0.00      0.00      0.00         2
                       Arts and Crafts       0.33      0.09      0.14        34
                            Automotive       0.71      0.51      0.60       147
                  Aviation & Aerospace 

  'precision', 'predicted', average, warn_for)


In [None]:
from sklearn.linear_model import SGDClassifier

In [28]:
unique, counts = np.unique(y, return_counts=True)
count_dict = {}
for u, c in zip(unique, counts):
    count_dict[u] = c
count_dict
excl_cat = [x[0] for x in count_dict.items() if x[1] < 10 or len(x[0]) == 0 ]
excl_cat

['',
 'Alternative Dispute Resolution',
 'Alternative Medicine',
 'Architektura',
 'Budownictwo/Geodezja',
 'Capital Markets',
 'Dairy',
 'Doradztwo/Konsulting',
 'Executive Office',
 'Finanse/Ekonomia',
 'Fishery',
 'Gambling & Casinos',
 'Grafika/Kreacja artystyczna/Fotografia',
 'Hotelarstwo/Turystyka/Katering',
 'Informatyka/Programowanie',
 'Inne',
 'Instalacja/Utrzymanie/Serwis',
 'Internet/E-Commerce',
 'Judiciary',
 'Kontrola jakości/BHP',
 'Libraries',
 'Logistyka/Transport/Dystrybucja',
 'Marketing/Reklama/Public Relations',
 'Media/Sztuka/Rozrywka',
 'Medycyna',
 'Military',
 'Museums and Institutions',
 'Nanotechnology',
 'Organizacje pozarządowe/Wolontariat',
 'Package/Freight Delivery',
 'Political Organization',
 'Prawo',
 'Produkcja',
 'Public Safety',
 'Ranching',
 'Rolnictwo/Ochrona środowiska',
 'Semiconductors',
 'Sprzedaż',
 'Supermarkets',
 'Szkolenia/Edukacja',
 'Think Tanks',
 'Tobacco',
 'Warehousing',
 'Wireless',
 'Zakupy']

In [29]:
rows_to_keeps = [i for i, e in enumerate(y) if e not in excl_cat ]
yt = [e for i, e in enumerate(y) if i  in rows_to_keeps]
Xt = [e for i, e in enumerate(X) if i   in rows_to_keeps]
yt2 = industry_selector.transform(Xt)

In [30]:
len(Xt), len(yt)

(9634, 9634)

In [108]:
yt

['Real Estate',
 'Marketing and Advertising',
 'Real Estate',
 'Internet',
 'Information Technology and Services',
 'Computer Software',
 'Information Technology and Services',
 'Construction',
 'Marketing and Advertising',
 'Computer Games',
 'Internet',
 'Construction',
 'Insurance',
 'International Trade and Development',
 'Information Technology and Services',
 'Law Practice',
 'Investment Management',
 'Automotive',
 'Chemicals',
 'Health, Wellness and Fitness',
 'Facilities Services',
 'Research',
 'Computer Software',
 'Construction',
 '',
 'Staffing and Recruiting',
 'Printing',
 'Accounting',
 'Design',
 'Computer Software',
 'Health, Wellness and Fitness',
 'Venture Capital & Private Equity',
 'Information Technology and Services',
 'Accounting',
 'Marketing and Advertising',
 'Civic & Social Organization',
 'Nonprofit Organization Management',
 'Hospitality',
 'Higher Education',
 'Banking',
 'Apparel & Fashion',
 'Consumer Services',
 'Media Production',
 'Utilities',
 'Hum

In [56]:
X2 = X
def yvv(yv):
    return yv if yv not in excl_cat else 'Other'
Y2 = [yvv(yv) for yv in y ]

In [100]:
yt2 == yt

True

In [57]:
len(X2), len(Y2)

(10000, 10000)

In [58]:
Y2

['Real Estate',
 'Marketing and Advertising',
 'Real Estate',
 'Internet',
 'Information Technology and Services',
 'Computer Software',
 'Information Technology and Services',
 'Construction',
 'Other',
 'Marketing and Advertising',
 'Computer Games',
 'Internet',
 'Construction',
 'Insurance',
 'International Trade and Development',
 'Information Technology and Services',
 'Law Practice',
 'Other',
 'Investment Management',
 'Automotive',
 'Chemicals',
 'Health, Wellness and Fitness',
 'Facilities Services',
 'Research',
 'Computer Software',
 'Construction',
 '',
 'Staffing and Recruiting',
 'Printing',
 'Accounting',
 'Design',
 'Computer Software',
 'Health, Wellness and Fitness',
 'Venture Capital & Private Equity',
 'Information Technology and Services',
 'Accounting',
 'Marketing and Advertising',
 'Civic & Social Organization',
 'Other',
 'Nonprofit Organization Management',
 'Hospitality',
 'Higher Education',
 'Banking',
 'Apparel & Fashion',
 'Consumer Services',
 'Media Pr

In [31]:
from sklearn.cross_validation import cross_val_predict
preds = cross_val_predict(pipeline, 
                          Xt, 
                          yt, 
                          cv=5, n_jobs=-1, verbose=True)
from sklearn.metrics import classification_report
print(classification_report(yt, preds))

[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   40.1s remaining:  1.0min


                                      precision    recall  f1-score   support

                          Accounting       0.46      0.49      0.47       106
                   Airlines/Aviation       0.32      0.38      0.35        37
                           Animation       0.05      0.27      0.09        11
                   Apparel & Fashion       0.30      0.27      0.29        95
             Architecture & Planning       0.69      0.48      0.57       132
                     Arts and Crafts       0.07      0.06      0.06        34
                          Automotive       0.65      0.55      0.60       147
                Aviation & Aerospace       0.23      0.10      0.14        31
                             Banking       0.20      0.48      0.29        42
                       Biotechnology       0.30      0.17      0.22        47
                     Broadcast Media       0.13      0.09      0.11        33
                  Building Materials       0.20      0.10      

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   42.1s finished


With such data you can be sure that you'll need a sparse matrix.

Remember `DictVectorizer` class? It accepts a dictionary and returns a sparse matrix.

So the only thing we need is a function such that 

```f(Json) => Dict```

First we need a function `deep_select` to retrieve nested values

In [45]:
matrix.todense().shape

(100, 1232)

In [8]:
### write your deep_select class here

jsondata = {'a': 
    {'b': 
         {'c': 1}
    }
}

class DeepSelect():
    def __init__(self, path, default):
        self.path = path
        self.default = default
    
    def _helper(self, row, path):
        if len(path) == 1:
            return row.get(path[0], self.default)
        elif path[0] in row:
            return self._helper(row[path[0]], path[1:])
        else:
            return self.default

    def __call__(self, row):
        return self._helper(row, self.path)

# tests
assert DeepSelect(["a","b","c"], None)(jsondata) == 1
assert DeepSelect(["a","b"], None)(jsondata) == {'c': 1}
assert DeepSelect(["x"], None)(jsondata) == None

Click here to see the DeepSelect solution
<div class="spoiler">

class DeepSelect():
    def __init__(self, path, default):
        self.path = path
        self.default = default
    
    def _helper(self, row, path):
        if len(path) == 1:
            return row.get(path[0], self.default)
        elif path[0] in row:
            return self._helper(row[path[0]], path[1:])
        else:
            return self.default

    def __call__(self, row):
        return self._helper(row, self.path)

</a>

In [None]:
def create_skills_features(row):
    features = {}
    for tech in DeepSelect(['extension','skills'],[])(row):
        features[tech['skill']] = tech["count"]
    return features

create_skills_features(data[4])

So far so good what about text?

In [None]:
def create_description_features(row):
    features = {}
    for word in word_tokenize(row['description']):
        features["description=" + word.lower()] = 1
    return features

create_description_features(data[0])

Let's create a more generic way to transform text

In [None]:
class TransformText():
    def __init__(self, field, tokenizer=word_tokenize):
        self.field = field
        self.tokenizer = tokenizer
    
    def __call__(self, row):
        features = {}
        for word in self.tokenizer(DeepSelect(self.field,"")(row)):
            word = word.lower()
            features[word] = 1
        return features
    
text_transformer = TransformText(['extension','address'], tokenizer=lambda x: [x])
text_transformer(data[6])

Exercise:
-------------
    
1. Write function or classes that transform other features? You'll need a function to retrieve nested values. 
2. There are some fields which you can treat as a categorical feature or a text features. What is best and why?
3. Write a function / class that will accept a list of transforming functions and creates a concatenation of the features
4. Wrap previous function in a scikit-learn transformer class so we can use it in a pipeline

In [None]:
def combine_features(fs):

    def helper(row):
        all_features = {}
        for name, f in fs:
            for k,v in f(row).items():
                all_features[name + "_" + k] = v
        return all_features

    return helper

company_data = {
    'description': 'Fortune 500 hundred company',
    'skills': [{'skill': 'waste disposal', 'count': 1},
               {'skill': 'data science', 'count': 2}]
}


features_generator = combine_features([('description', TransformText(['description'])),
                                       ('skills', create_skills_features)]) 

features_generator(company_data)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer

class JsonTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, fs):
        self.fs = fs
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        out = []
        for x in X:
            out.append(combine_features(self.fs)(x))
        return out
    
pipeline = make_pipeline(
    JsonTransformer([('description', TransformText(['description'])), 
                     ('skills', create_skills_features)]),
    DictVectorizer()
)

pipeline.fit_transform(data)

In [None]:
X = pipeline.fit_transform(data)
X.shape

Number of features is really high. We need to reduce this. We can remove too sparse values.
To check the sparsity of the data we can use a method

X.getnnz (number of non-zero values)

In [None]:
X.getnnz(0)

Exercise
===============

1. Write a transformation class called SparsityFilter that accepts a minimum frequency. Watch out for fit function - this class has some state that you must save

```
class SparsityFilter(BaseEstimator, TransformerMixin):
    def __init__(self, min_nnz=None):
        self.min_nnz = min_nnz

    def fit(self, X, y=None):
        ???
        return self

    def transform(self, X):
        return ???
```

In [None]:
# write sparsity class here
class SparsityFilter(BaseEstimator, TransformerMixin):
    def __init__(self, min_nnz=None):
        self.min_nnz = min_nnz

    def fit(self, X, y=None):
        self.sparsity = X.getnnz(0)
        return self

    def transform(self, X):
        return X[:, self.sparsity >= self.min_nnz]

Double click to see the solution 

<div class="spoiler">

class SparsityFilter(BaseEstimator, TransformerMixin):
    def __init__(self, min_nnz=None):
        self.min_nnz = min_nnz

    def fit(self, X, y=None):
        self.sparsity = X.getnnz(0)
        return self

    def transform(self, X):
        return X[:, self.sparsity >= self.min_nnz]
</div>

In [None]:
pipeline = make_pipeline(
    JsonTransformer([('description', TransformText(['description'])), 
                     ('skills', create_skills_features)]),
    DictVectorizer(),
    SparsityFilter(min_nnz=25)
)

X = pipeline.fit_transform(data)
X.shape

You should see a shape that is much smaller than the original

Exercise
================

1. Build a model - try to predict the industry.
2. Evaluate its results using cross validation - what would be the best measure for this problem?

In [None]:
# write solution here

Click to see the solution

<div class="spoiler">

from xgboost import XGBClassifier
from sklearn.cross_validation import cross_val_predict

pipeline = make_pipeline(
    JsonTransformer([('description', TransformText('description')), 
                     ('technologies', create_technologies_features)]),
    DictVectorizer(),
    SparsityFilter(min_nnz=25),
    XGBClassifier()
)

X = data
y = [row['industries'][0] if len(row['industries']) else "" for row in data]

predictions = cross_val_predict(pipeline, X, y)

print("Accuracy = {}".format((predictions == np.array(y)).mean()))

</div>

In [None]:
from sklearn.cross_validation import cross_val_predict
from sklearn.naive_bayes import MultinomialNB

def makelist(x):
    return [x]
    
pipeline = make_pipeline(
    JsonTransformer([('description', TransformText(['description'])),
                     ('country', TransformText(['extension','geo_location','country'], makelist)),
                     ('skills', create_skills_features)]),
    DictVectorizer(),
    SparsityFilter(min_nnz=25),
    MultinomialNB()
)

X = data
y = [row['industries'][0] if len(row['industries']) else "" for row in data]

predictions = cross_val_predict(pipeline, X, y)

print("Accuracy = {}".format((predictions == np.array(y)).mean()))

In [None]:
len(set(y))