In [1]:
from utils import css_from_file
css_from_file('style/style.css')

In [2]:
import re
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import (
    CountVectorizer, TfidfVectorizer, HashingVectorizer
)

import warnings
warnings.filterwarnings("ignore")

Read csv file into a dataframe

In [102]:
df = pd.read_csv("data/ebaytitles.csv")
df = df.sample(frac=0.1) # delete this line if you are brave and have many GBs of RAM
df.head()

Unnamed: 0,title,category_name
876808,TITANIUM PURPLE 6PC SCREEN BOLT KIT FITS YAMAH...,Vehicle Parts & Accessories
850868,9ct Yellow Gold 4mm Mediumweight D-Shape Weddi...,Jewellery & Watches
926759,2.4G Wireless / Cordless Optical Mouse with US...,Computers/Tablets & Networking
840213,London Westminster River Thames View Large Wal...,Art
506121,EVOTECH ALLOY REAR WHEEL AXLE NUT BLUE FITS HO...,Vehicle Parts & Accessories


Print out unique values of a column

In [103]:
df.category_name.unique()

array(['Vehicle Parts & Accessories', 'Jewellery & Watches',
       'Computers/Tablets & Networking', 'Art',
       'Clothes, Shoes & Accessories', 'Mobile Phones & Communication',
       'Toys & Games', 'Musical Instruments & Gear', 'Garden & Patio',
       'Collectibles', 'Sound & Vision', 'Crafts', 'Sporting Goods',
       'Baby', 'Home, Furniture & DIY', 'Books, Comics & Magazines',
       'Health & Beauty', 'Sports Memorabilia',
       'Business, Office & Industrial', 'Music', 'Stamps', 'Pet Supplies',
       'Video Games & Consoles', 'DVDs, Films & TV', 'Coins & Paper Money',
       'Cameras & Photography', 'Consumer Electronics', 'Antiques',
       'Pottery, Porcelain & Glass', 'Holidays & Travel',
       'Wholesale & Job Lots', 'Dolls & Bears', 'Everything Else',
       'Cell Phones & Accessories', 'Travel', 'Entertainment Memorabilia'], dtype=object)

Split the data into train and test observations - there is a column

In [104]:
X = df.title.values
y = df.category_name.values

X_tr, X_te, y_tr, y_te = train_test_split(X, 
                                          y,
                                          test_size=0.1,
                                          random_state=0)

In [105]:
df

Unnamed: 0,title,category_name
876808,TITANIUM PURPLE 6PC SCREEN BOLT KIT FITS YAMAH...,Vehicle Parts & Accessories
850868,9ct Yellow Gold 4mm Mediumweight D-Shape Weddi...,Jewellery & Watches
926759,2.4G Wireless / Cordless Optical Mouse with US...,Computers/Tablets & Networking
840213,London Westminster River Thames View Large Wal...,Art
506121,EVOTECH ALLOY REAR WHEEL AXLE NUT BLUE FITS HO...,Vehicle Parts & Accessories
418283,Mens Denim Shirt By Soul Star Long Sleeved 'So...,"Clothes, Shoes & Accessories"
977743,Nokia 6220/6230 Black Leather Case,Mobile Phones & Communication
68541,Regatta Mens Bodywarmer Waistcoat Polycotton M...,"Clothes, Shoes & Accessories"
267261,Packhorse Bridge Jigsaw Puzzle - 2000 Pieces -...,Toys & Games
364163,BRAND NEW WINDOW SWITCH LIFTER WINDER SEAT LEO...,Vehicle Parts & Accessories


Exercise 
------------------

1. Count how many titles are in each category (```pandas.DataFrame.groupby```). Print out most common at the top

In [106]:
##########################
# put your solution here #
##########################
df.groupby('category_name').count().sort_values('title', ascending=False)

Unnamed: 0_level_0,title
category_name,Unnamed: 1_level_1
Vehicle Parts & Accessories,23088
"Clothes, Shoes & Accessories",16740
"Home, Furniture & DIY",12862
Computers/Tablets & Networking,6739
Jewellery & Watches,6327
Sporting Goods,4745
Mobile Phones & Communication,3990
Health & Beauty,3327
Crafts,3325
Toys & Games,2882


<a>Double click to show the solution</a>
<div class='spoiler'>

frequencies = df.groupby("category_name")["title"].count()
frequencies.sort_values(inplace=True,ascending=False)
print(frequencies)

# or faster

df.category_name.value_counts()

</div>

Bag of words
--------------------

Different types of vectorizers:

<ul>
<li>```sklearn.feature_extraction.text.CountVectorizer``` - Counts the number of times a word appears in the text</li>
<li>```sklearn.feature_extraction.text.TfidfVectorizer``` - Weighs the words according to the importance of the word in the context of whole collection. Is the word ```the``` important if it appears in all documents?</li>
<li>```sklearn.feature_extraction.text.HashingVectorizer``` - Useful when you don't know the vocabulary upfront. Feature number is calculated as ```hash(token) % vocabulary_size```.</li>
</ul>

Exercise
-------------------
1. Use ```CountVectorizer``` / ```TfidfVectorizer``` to fit the collection of documents
2. How many unique tokens are there in text? Print some examples (ie first few hundred).
3. What methods you can use to reduce this number? 
   - Check out and experiment with the arguments: ```ngram_range```, ```min_df```. How the vocabulary size changes with each change?
   - What would you replace / delete from the text?
4. Write a custom function `clean_text` that accepts a text as input and transforms it (remove/hash numbers, delete short/long words etc.)
5. (Extra points) When would you use ```HashingVectorizer```?

In [33]:

##########################
# put your solution here #
##########################
cv = CountVectorizer()
cv.fit(X_tr, y_tr)
#How many tokens 
print("How many unique tokens in a naive approach ???")
#print(sum([x[1] for x in cv.vocabulary_.items()]))
print(len(cv.vocabulary_))






How many unique tokens in a naive approach ???
71782


In [35]:
cv_min_5_max_07 = CountVectorizer(min_df=5, max_df=0.7)
cv_min_5_max_07.fit(X_tr, y_tr)
#How many tokens 
print("How many unique tokens in cv_min_5_max_07 ???")
print(len( cv_min_5_max_07.vocabulary_))


How many unique tokens in cv_min_5_max_07 ???
13882


In [37]:
cv_min_4_max_07 = CountVectorizer(min_df=4, max_df=0.7)
cv_min_4_max_07.fit(X_tr, y_tr)
#How many tokens 
print("How many unique tokens in cv_min_4_max_07 ???")
print(len( cv_min_4_max_07.vocabulary_))


How many unique tokens in cv_min_4_max_07 ???
16292


In [38]:
cv_min_3_max_07 = CountVectorizer(min_df=3, max_df=0.7)
cv_min_3_max_07.fit(X_tr, y_tr)
#How many tokens 
print("How many unique tokens in cv_min_3_max_07 ???")
print(len( cv_min_3_max_07.vocabulary_))


How many unique tokens in cv_min_3_max_07 ???
20211


In [39]:
cv_min_2_max_07 = CountVectorizer(min_df=2, max_df=0.7)
cv_min_2_max_07.fit(X_tr, y_tr)
#How many tokens 
print("How many unique tokens in cv_min_2_max_07 ???")
print(len( cv_min_2_max_07.vocabulary_))

How many unique tokens in cv_min_2_max_07 ???
29259


In [51]:
cv_min_5_max_07_ngr = CountVectorizer(min_df=5, max_df=0.7, ngram_range=(1,2))
cv_min_5_max_07_ngr.fit(X_tr, y_tr)
#How many tokens 
print("How many unique tokens in cv_min_5_max_07_ngr ???")
print(len( cv_min_5_max_07_ngr.vocabulary_))


How many unique tokens in cv_min_5_max_07_ngr ???
36471


In [69]:
import re
def clean_text(str):
    str = str.lower()
    str = re.sub(r'\b[0-9]+[A-Za-z]+\b', '#', str)
    str = re.sub(r'\b[0-9]+\b', '#', str)
    str = re.sub(r'\b[A-Za-z]+[0-9]+\b', '#', str)
    return str
    
    
print(clean_text('test 1'))    
print(clean_text('200 test '))    
print(clean_text('200gb test'))    


    
    

test #
# test 
# test


In [59]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
cv_min_5_max_07_ngr_ctxt = CountVectorizer(min_df=5, max_df=0.7, ngram_range=(1,2), preprocessor=clean_text, stop_words=stop_words)
cv_min_5_max_07_ngr_ctxt.fit(X_tr, y_tr)
#How many tokens 
print("How many unique tokens in cv_min_5_max_07_ngr ???")
print(len( cv_min_5_max_07_ngr_ctxt.vocabulary_))
[x for x in cv_min_5_max_07_ngr_ctxt.vocabulary_.items() if len(x[0].split()) > 1]

How many unique tokens in cv_min_5_max_07_ngr ???
30109


[('brake pipe', 3178),
 ('blue carbon', 2551),
 ('carbon fiber', 4192),
 ('fiber vinyl', 9185),
 ('vinyl film', 28416),
 ('film wrap', 9275),
 ('wrap sheet', 29681),
 ('sheet sticker', 23520),
 ('sticker tint', 25450),
 ('new cd', 17727),
 ('black diamond', 2233),
 ('edible cup', 8251),
 ('cup cake', 6455),
 ('cake toppers', 3861),
 ('toppers rice', 27097),
 ('rice paper', 21989),
 ('adsl modem', 231),
 ('lente designs', 14964),
 ('designs apple', 7074),
 ('apple ipad', 741),
 ('cover case', 6071),
 ('designer fabric', 7054),
 ('new authentic', 17697),
 ('authentic tag', 1107),
 ('tag heuer', 26286),
 ('heuer carrera', 12213),
 ('leather strap', 14787),
 ('jigsaw puzzle', 13537),
 ('hi fi', 12232),
 ('blue led', 2590),
 ('mens lambretta', 16330),
 ('mod retro', 16907),
 ('union jack', 27891),
 ('jack flag', 13356),
 ('hand made', 11761),
 ('snapback mens', 24598),
 ('mens cap', 16273),
 ('cap blue', 4021),
 ('blue headwear', 2579),
 ('white one', 29160),
 ('one size', 18486),
 ('carrie

In [None]:

from nltk.corpus import stopwords
stop_words = stopwords.words('english')
cv_min_5_max_07_ngr_ctxt = CountVectorizer(min_df=5, max_df=0.7, ngram_range=(1,2), preprocessor=clean_text, stop_words=stop_words)
cv_min_5_max_07_ngr_ctxt.fit(X_tr, y_tr)
#How many tokens 
print("How many unique tokens in cv_min_5_max_07_ngr ???")
print(len( cv_min_5_max_07_ngr_ctxt.vocabulary_))
[x for x in cv_min_5_max_07_ngr_ctxt.vocabulary_.items() if len(x[0].split()) > 1]

<a>Double click to show the solution</a>
<div class='spoiler'>
from sklearn.feature_extraction.text import CountVectorizer
import re

def clean_text(t):
    t = t.lower()
    t = re.sub("[^A-Za-z0-9]"," ",t)
    t = re.sub("[0-9]+","#",t)
    return t

vectorizers = [
     ("vanilla",
          CountVectorizer())
    ,("preprocessing",
          CountVectorizer(preprocessor=clean_text))
    ,("preprocessing + min_df=10",
          CountVectorizer(preprocessor=clean_text,
                          min_df=10))
]

for vect_name, vect in vectorizers:
    print(vect_name)
    vect.fit(X_tr)
    
    print(list(vect.get_feature_names())[:10])
    print(len(vect.get_feature_names()))
</div>

Stemming
------------------

Linguistic normalization in which variant forms are reduced to a common form

    connection
    connections
    connective     --->   connect
    connected
    connecting
    
Usage:

    import snowballstemmer

    stemmer = snowballstemmer.stemmer('english')
    print(stemmer.stemWords("We are the world".split()))

In [61]:
import snowballstemmer

stemmer = snowballstemmer.stemmer('english')
print(stemmer.stemWords("We are the world".split()))

['We', 'are', 'the', 'world']


Putting it into a pipeline
----------------------

Now that we know how to transform text data, let's put it into a pipeline.

1. Create a pipeline with `CountVectorizer`, `StandardScaler` and `SGDClassifier` as your final algorithm
    a) use alternative format for pipeline definition when you name the steps - refer to the documentation how to do this
2. Using ```sklearn.metrics.classification_report``` create a report about your classifier

In [89]:
##########################
# put your solution here #
##########################

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV

import snowballstemmer

stemmer = snowballstemmer.stemmer('english')

from nltk.tokenize import word_tokenize

def clean_text2(str):
    str = str.lower()
    str = re.sub(r'\b[0-9]+[A-Za-z]+\b', '', str)
    str = re.sub(r'\b[0-9]+\b', '', str)
    str = re.sub(r'\b[A-Za-z]+[0-9]+\b', '', str)
    return str

def stemmed_words(doc):
    return [stemmer.stemWords(w) for w in doc]

def tok_with_stemmer(doc):
    tokenized =  word_tokenize(doc)
    #return tokenized
    return stemmer.stemWords(tokenized)
    

#clf = make_pipeline(CountVectorizer(), SGDClassifier())


In [None]:
clf = Pipeline([
    ('vectorizer', CountVectorizer(min_df=5, max_df=0.7, ngram_range=(1,2), preprocessor=clean_text2, stop_words=stop_words, 
                                   tokenizer=tok_with_stemmer)),
    ('scaler', StandardScaler(copy=True, with_mean=False, with_std=False)),
    ('sgd', SGDClassifier())
])

preds = cross_val_predict(clf, 
                          X_tr, 
                          y_tr, 
                          cv=8, n_jobs=-1, verbose=True)

print(classification_report(y_tr, preds))

clf.fit(X_tr, y_tr)

In [90]:
clf2 = Pipeline([
    ('vectorizer', CountVectorizer(min_df=5, max_df=0.7, ngram_range=(1,2), preprocessor=clean_text2, stop_words=stop_words, 
                                   tokenizer=tok_with_stemmer)),
    ('scaler', StandardScaler(copy=True, with_mean=False, with_std=False)),
    ('sgd', SGDClassifier())
])

preds = cross_val_predict(clf2, 
                          X_tr, 
                          y_tr, 
                          cv=8, n_jobs=-1, verbose=True)

print(classification_report(y_tr, preds))

clf2.fit(X_tr, y_tr)


[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:  2.3min remaining:  6.9min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  2.3min finished


                                precision    recall  f1-score   support

                      Antiques       0.74      0.36      0.48       337
                           Art       0.72      0.73      0.72       981
                          Baby       0.88      0.70      0.78      1287
     Books, Comics & Magazines       0.88      0.78      0.83       793
 Business, Office & Industrial       0.80      0.64      0.71      5133
         Cameras & Photography       0.92      0.88      0.90      1626
     Cell Phones & Accessories       0.94      0.22      0.36       131
  Clothes, Shoes & Accessories       0.93      0.97      0.95     30295
           Coins & Paper Money       0.93      0.90      0.91       373
                  Collectibles       0.84      0.73      0.78      4460
Computers/Tablets & Networking       0.93      0.95      0.94     12479
          Consumer Electronics       0.79      0.45      0.58       220
                        Crafts       0.89      0.87      0.88  

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=5,
        ngram_range=(1, 2),
        preprocessor=<function clean_text2 ...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))])

In [111]:
clf3 = Pipeline([
    ('vectorizer', TfidfVectorizer(min_df=5, max_df=0.7, ngram_range=(1,2), preprocessor=clean_text2, stop_words=stop_words, 
                                   tokenizer=tok_with_stemmer)),
    ('scaler', StandardScaler(copy=True, with_mean=False, with_std=False)),
    ('sgd', SGDClassifier(alpha=0.001))
])

preds = cross_val_predict(clf, 
                          X_tr, 
                          y_tr, 
                          cv=8, n_jobs=-1, verbose=True)

print(classification_report(y_tr, preds))

clf3.fit(X_tr, y_tr)

[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed: 11.8min remaining: 35.5min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed: 12.0min finished


                                precision    recall  f1-score   support

                      Antiques       0.67      0.08      0.15       195
                           Art       0.66      0.68      0.67       542
                          Baby       0.82      0.61      0.70       649
     Books, Comics & Magazines       0.81      0.65      0.72       402
 Business, Office & Industrial       0.79      0.46      0.58      2586
         Cameras & Photography       0.87      0.77      0.81       778
     Cell Phones & Accessories       0.00      0.00      0.00        53
  Clothes, Shoes & Accessories       0.87      0.97      0.92     15037
           Coins & Paper Money       0.86      0.78      0.82       171
                  Collectibles       0.82      0.58      0.68      2229
Computers/Tablets & Networking       0.89      0.93      0.91      6080
          Consumer Electronics       0.88      0.16      0.27       130
                        Crafts       0.84      0.77      0.80  

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=5,
        ngram_range=(1, 3), norm='l2',
        preprocessor=<function c...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))])

<a>Double click to show the solution</a>
<div class='spoiler'>

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV

clf = Pipeline([('vect', CountVectorizer(min_df=10, preprocessor=clean_text)),
                ('scaling', StandardScaler(with_mean=False)),
                ('clf', SGDClassifier())])

preds = cross_val_predict(clf, 
                          X_tr, 
                          y_tr, 
                          cv=8, n_jobs=-1, verbose=True)

print(classification_report(y_tr, preds)


</div>


Grid search
--------------------------

Scikit-learn has `GridSearchCV` and `RandomizedSearchCV`. Both have the same functionality and can be used to find good parameters for the models. What is great about both these classes that they are both transformers - they return an estimator so you can chain them and put in your pipeline.

**GridSearchCV** - you specify the exact values of the parameters you want to test
**RandomizedSearchCV** - you specify ranges of parameters

Exercise
----------------------

1. Use `GridSearchCV` or `RandomizedSearchCV` to find the best parameters for the models. Check at least 2 parameters.

2. Inspect the attribute `cv_results_` after fitting. It gives a nice representation of the learning.

In [115]:
##########################
# put your solution here #
##########################
pipeline = Pipeline([
    ('vect', CountVectorizer(preprocessor=clean_text2, stop_words=stop_words, 
                                   tokenizer=tok_with_stemmer, min_df=5, ngram_range=(1,2), )),
    ('scaler', StandardScaler(copy=True, with_mean=False, with_std=False)),
    ('sgd', SGDClassifier(alpha= 0.0001))
])

clf = GridSearchCV(pipeline, { 
                       # 'sgd__alpha':[0.0001, 0.00001]
                        #'vect__min_df':[1,5,10,25],
                         'vect__max_df':[0.65, 0.7, 0.75]
                        #'vect__ngram_range':[(1,1),(1,2),(1,3)]
})

clf.fit(X_tr, y_tr)

print(clf.best_params_)
print(clf.best_score_)



{'vect__max_df': 0.7}
0.8834888888888889


<a>Double click to show the solution</a>
<div class='spoiler'>

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

print("Grid search")
print()

params = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
          'vect__analyzer': ["word","char"],
          'vect__binary': [True, False]}

grid_clf = GridSearchCV(clf, params, n_jobs=1, verbose=True)
grid_clf.fit(X_tr, y_tr)

best_params = sorted(grid_clf.grid_scores_, key=lambda x: -x[1])


for params, score, _ in best_params:
    print(score, params) 
    
print("Randomized search")
print()
    
params = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
          'vect__analyzer': ["word","char"],
          'model__lr__dimensions': [100, 200]}

grid_clf = RandomizedSearchCV(clf, params, n_jobs=1, verbose=True, n_iter=8)
grid_clf.fit(np.array(X_tr[:10000]), y_tr[:10000])

best_params = sorted(grid_clf.grid_scores_, key=lambda x: -x[1])

for params, score, _ in best_params:
    print(score, params)

</div>


Useful materials

1. http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
2. http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html