In [1]:
import pandas as pd

In [2]:
# !unzip ds4-which-whisk.zip

In [5]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

In [6]:
train = train.dropna()

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Initialize RFC and Vectorizer
rfc = RandomForestClassifier()
vect = TfidfVectorizer(stop_words='english')

In [9]:
# Define pipeline
pipe = Pipeline([('vect', vect), ('rfc', rfc)])

In [10]:
# Fit pipeline
pipe.fit(train['description'], train['category'])



Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

# Making a Submission

In [28]:
# Test Pipeline
y_pred = pipe.predict(test['description'])

In [29]:
submission = pd.DataFrame({'id': test['id'], 'category':y_pred})
submission['category'] = submission['category'].astype('int64')

In [30]:
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,1
3,1024,1
4,1902,1


In [31]:
submission.to_csv('submission2.csv', index=False)

In [32]:
df = pd.read_csv('submission2.csv')
df.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,1
3,1024,1
4,1902,1


# Tune Pipeline with Grid Search

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (100, 500, 1000),
    'rfc__n_estimators': (20, 100, 400)
}

In [20]:
gs = GridSearchCV(pipe, parameters, cv=5, n_jobs=4)

In [21]:
gs.fit(train['description'], train['category'])

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'vect__max_df': (0.5, 0.75, 1.0), 'vect__min_df': (0.02, 0.05), 'vect__max_features': (100, 500, 1000), 'rfc__n_estimators': (20, 100, 400)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
gs.best_score_

0.8990306946688207

In [23]:
gs.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [33]:
gs.get_params()

{'cv': 5,
 'error_score': 'raise-deprecating',
 'estimator__memory': None,
 'estimator__steps': [('vect',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words='english', strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('rfc',
   RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
               max_depth=None, max_features='auto', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
               oob_score=False, random_state=None, verbose=

## Retrain

In [40]:
# Initialize RFC and Vectorizer
rfc = RandomForestClassifier(n_estimators=10)
vect = TfidfVectorizer(stop_words='english', min_df=0.02, max_df=1.0, max_features=1000)

In [26]:
# Define pipeline
pipe = Pipeline([('vect', vect), ('rfc', rfc)])

In [27]:
# Fit pipeline
pipe.fit(train['description'], train['category'])



Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

# Word Embeddings with Spacy

In [34]:
import spacy

In [36]:
nlp = spacy.load("en_core_web_md")

In [37]:
embeddings = [nlp(doc).vector for doc in train['description']]

In [41]:
rfc.fit(embeddings, train['category'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [42]:
rfc.score(embeddings, train['category'])

0.9870759289176091

In [43]:
test_embeddings = [nlp(doc).vector for doc in test['description']]

In [44]:
# Test Pipeline
y_pred = rfc.predict(test_embeddings)

In [45]:
submission = pd.DataFrame({'id': test['id'], 'category':y_pred})
submission['category'] = submission['category'].astype('int64')

In [46]:
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,1
2,1390,4
3,1024,1
4,1902,1


In [47]:
submission.to_csv('submission3.csv', index=False)

In [48]:
df = pd.read_csv('submission3.csv')
df.head()

Unnamed: 0,id,category
0,955,2
1,3532,1
2,1390,4
3,1024,1
4,1902,1


# Custom Tokenization

In [93]:
doc = train['description'][10]

In [94]:
doc = nlp(doc)

In [95]:
doc

The complete package: uncut, unfiltered, full-flavored, richly textured (almost chewy), and very complex. Notes of toffee-coated nuts, vanilla fudge, polished leather, cedar-tinged tobacco, barrel char, cocoa powder, and a hint of fig, wrapped up with a firm oak grip on the finish. Worth every penny of the premium price being charged for this commemorative release. Editor's Choice.

In [99]:
for sentence in doc.sents:
    print(sentence)

The complete package: uncut, unfiltered, full-flavored, richly textured (almost chewy), and very complex.
Notes of toffee-coated nuts, vanilla fudge, polished leather, cedar-tinged tobacco, barrel char, cocoa powder, and a hint of fig, wrapped up with a firm oak grip on the finish.
Worth every penny of the premium price being charged for this commemorative release.
Editor's Choice.


In [66]:
def tokenize(doc):
    
    d = nlp(doc)
    
    tokens = []
    
    for chunk in d.noun_chunks:
        tokens.append(chunk.lemma_)
        
    return tokens

In [67]:
from sklearn.feature_extraction.text import CountVectorizer

In [77]:
vect = CountVectorizer(analyzer=tokenize, min_df=.1, max_df=.9)

In [78]:
vect.fit(train['description'])

CountVectorizer(analyzer=<function tokenize at 0x1a23b0f8c8>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=0.9,
        max_features=None, min_df=0.1, ngram_range=(1, 1),
        preprocessor=None, stop_words=None, strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [79]:
vect.get_feature_names()

['-PRON-',
 'a hint',
 'caramel',
 'cinnamon',
 'honey',
 'the finish',
 'the nose',
 'the palate',
 'vanilla',
 'water']

In [89]:
def tokenize(doc):
    
    d = nlp(doc)
    tokens = []
    
    
    # You can extract anything you want from Spacy to use as a feature
    # Noun Chunks aren't the end all be all. 
    # Try Extracting just tokens, or just adjectives, or just pronouns
    
    for chunk in d.noun_chunks:
        tokens.append(chunk.text)
        
    return tokens

In [90]:
vect = CountVectorizer(analyzer=tokenize, min_df=.1, max_df=.9)

In [91]:
vect.fit(train['description'])

CountVectorizer(analyzer=<function tokenize at 0x1a44879a60>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=0.9,
        max_features=None, min_df=0.1, ngram_range=(1, 1),
        preprocessor=None, stop_words=None, strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [92]:
vect.get_feature_names()

['I',
 'It',
 'The palate',
 'a hint',
 'caramel',
 'cinnamon',
 'honey',
 'it',
 'the finish',
 'the nose',
 'the palate',
 'vanilla',
 'you']

In [101]:
X = vect.transform(train['description'])

In [102]:
rfc.fit(X, train['category'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [103]:
rfc.score(X, train['category'])

0.7495961227786753