In [None]:
import os
# Print the full path of every file under the attached Kaggle input datasets.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
!unzip '/kaggle/input/spooky-author-identification/train.zip'
!unzip '/kaggle/input/spooky-author-identification/test.zip'

In [None]:
# Print the full path of every file below the current working directory
# (this is where the archives above were extracted).
working_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(os.getcwd())
    for name in names
)
for path in working_paths:
    print(path)

In [None]:
import numpy as np
import pandas as pd
import time
import gc

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## Advantages of using pipelines:
1. Train and test set transformations are taken care of automatically
2. Hyperparameter tuning is made easier. Set new parameters on any estimator in the pipeline and refit - all in one line. Or run a grid search (e.g. `GridSearchCV`) on the whole pipeline
3. Model description is easier

## 1. Data preparation

In [None]:
# read data
train = pd.read_csv('/kaggle/working/train.csv')
test = pd.read_csv('/kaggle/working/test.csv')

train.head(10)

In [None]:
# encode labels: 'author'
from sklearn.preprocessing import LabelEncoder

# Only encode while the column still holds the raw author names. Re-running this
# cell after 'author' has been replaced by integer codes would otherwise refit the
# encoder on those codes, and le.classes_ would no longer contain the author names.
if not pd.api.types.is_numeric_dtype(train['author']):
    le = LabelEncoder()
    le.fit(train['author'])
    train['author'] = le.transform(train['author'])

print('Classes encoded by Label encode', le.classes_)

In [None]:
train.head()

__Split the train into train and validation sets__

In [None]:
from sklearn import model_selection, metrics

X, y = train['text'], train['author']

# Hold out 30% of the rows for validation. Stratifying on the label keeps the
# proportion of each author class the same in the train and validation splits.
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    stratify=y,
    random_state=2020,
)

### __Create the first pipeline__

To understand the difference between `TfidfVectorizer` and `TfidfTransformer` refer [here](https://stackoverflow.com/questions/54745482/what-is-the-difference-between-tfidf-vectorizer-and-tfidf-transformer)

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression

pipe1 = Pipeline([
                ('cv', CountVectorizer()),
                ('tfidf_transformer', TfidfTransformer()),
                ('logit', LogisticRegression())              
            ])

### Fitting the model
Since our `pipe1` object behaves like any other estimator, we can treat it as one: it exposes `fit()` and, when the steps define them, `transform()` and `fit_transform()` as well. `LogisticRegression`, the final step of this pipeline, is a classifier, so here we only use the `fit()` method.

In [None]:
pipe1.fit(X_train, y_train)

Steps of the pipeline

In [None]:
pipe1.steps

Accessing each step's parameters by name, methods and properties
* `pipeline.named_steps`: returns a dictionary with the keys as model names

In [None]:
# Coefficient of the Logistic Regression model
pipe1.named_steps['logit'].coef_

**Making predictions:**
- Use predict()
- Use predict_proba()

In [None]:
pred_val = pipe1.predict_proba(X_val)
print('Log loss on pipeline using cv-tfidftransformer-logit is : ', metrics.log_loss(y_val, pred_val))

### Parameter tuning in the pipeline
* We can get the parameters of each step by its name

In [None]:
pipe1.named_steps['logit'].get_params()

### Getting the parameters all at once for all the steps in the pipeline
* Parameter names are prefixed with the step name (`<step>__<parameter>`) for easier identification

In [None]:
pipe1.get_params()

### Set the parameters within the pipeline
* set_params()

In [None]:
%%time
# set params and train
pipe1.set_params(cv__max_df = 0.8, cv__min_df = 6, cv__ngram_range=(1,3)).fit(X_train, y_train)
pred_val = pipe1.predict_proba(X_val)
print(metrics.log_loss(y_val, pred_val))

### Use a Naive Bayes model in the pipeline

In [None]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

pipe2 = Pipeline([
                    ('cv', CountVectorizer()),
                    ('tfidf_transformer', TfidfTransformer()),
                    ('bern_nb', BernoulliNB())
                ])

In [None]:
pipe2.fit(X_train, y_train)
print(metrics.log_loss(y_val, pipe2.predict_proba(X_val)))

### Feature Union
* Helps to combine new features created as part of EDA

- Now we will collect some meta features from the text like punctuation_count, parts of speech tag count

### NLTK - POS(parts of speech) tagger

In [None]:
import nltk
# Fixed seed so the same sentence (and therefore the same output) is shown on every run.
sample = train['text'].sample(1, random_state=2020).values.tolist()[0]
print('text: \n', sample)

# Split the sentence into tokens, then attach a part-of-speech tag to each token.
tokens = nltk.word_tokenize(sample)
pos_tag = nltk.pos_tag(tokens)

print(pos_tag)

In [None]:
# Small random subset (0.1% of the rows) for quick experiments; seeded so that
# the cells below operate on the same rows after a kernel restart.
sample_df = train.sample(frac=0.001, random_state=2020)
sample_df['text']

In [None]:
from collections import Counter
Counter(tag for word, tag in pos_tag)


def cnt_pos(sentence):
    """Count how often each part-of-speech tag occurs in ``sentence``.

    The sentence is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; the returned ``Counter`` maps each tag to its count.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    tag_counts = Counter()
    for _token, tag in tagged_tokens:
        tag_counts[tag] += 1
    return tag_counts

sample_df['text'].apply(cnt_pos).apply(pd.Series).fillna(0)

__More on the tags and what they represent__

In [None]:
nltk.help.upenn_tagset()

### 'Tag' text in the pipeline
In order to 'tag' text within the pipeline, we need to create a `transformer` or `estimator class`, which will `inherit` from some `base classes` and `override` the few methods that the pipeline calls

In [None]:
# import base classes
from sklearn.base import BaseEstimator, TransformerMixin
from collections import Counter

# define a new class, inherit from the base classes
class posTagMatrix(BaseEstimator, TransformerMixin):
    """Transformer that turns each sentence into a row of part-of-speech tag counts.

    Parameters
    ----------
    tokenizer : callable, default=``lambda x: x.split()``
        Maps one sentence to the list of tokens handed to ``nltk.pos_tag``.
    normalize : bool, default=True
        If True, every column is divided by the number of tags in its sentence.

    Attributes
    ----------
    tags_ : list of str
        Sorted tag names seen while fitting. Once fitted, ``transform`` always
        returns exactly these columns followed by ``'total_pos_tags'``, so the
        train and validation matrices have the same width and column order even
        when a tag is missing from (or only present in) one of them.
    """

    def __init__(self, tokenizer = lambda x: x.split(), normalize = True):
        self.tokenizer = tokenizer
        self.normalize = normalize

    def count_pos(self, sentence):
        """Return a ``Counter`` mapping each POS tag in ``sentence`` to its count."""
        return Counter(tag for _word, tag in nltk.pos_tag(self.tokenizer(sentence)))

    def _count_matrix(self, X):
        """Tag every sentence in ``X`` and return one row of raw tag counts per sentence."""
        # Keep the caller's index when X is a Series so rows stay aligned with the input.
        row_index = X.index if isinstance(X, pd.Series) else None
        counts = pd.DataFrame([self.count_pos(sentence) for sentence in X], index=row_index)
        # A tag that does not occur in a sentence has a count of 0, not NaN.
        return counts.fillna(0)

    def _to_features(self, counts):
        """Align raw counts to the fitted tag columns, add the total and normalize."""
        # Total over *all* tags of the sentence, computed before columns are aligned,
        # so tags that were not seen during fit still count towards the sentence length.
        totals = counts.sum(axis=1)

        # If fit() was never called, fall back to the tags present in this batch.
        tags = getattr(self, 'tags_', None)
        if tags is None:
            tags = list(counts.columns)

        features = counts.reindex(columns=tags, fill_value=0)
        features['total_pos_tags'] = totals

        if self.normalize:
            # Sentences without any token have a total of 0; divide those rows by 1
            # instead so they become all-zero rows rather than NaN.
            features = features.divide(totals.where(totals != 0, 1), axis=0)

        return features

    def fit(self, X, y=None):
        """Learn the set of POS tags occurring in ``X`` and return ``self``."""
        self.tags_ = sorted(self._count_matrix(X).columns)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``X`` and return its feature matrix, tagging every sentence only once."""
        counts = self._count_matrix(X)
        self.tags_ = sorted(counts.columns)
        return self._to_features(counts)

    def transform(self, X):
        """Return a DataFrame of tag counts (or proportions) for an iterable of sentences."""
        return self._to_features(self._count_matrix(X))

### New pipeline with the pos tags included

In [None]:
from sklearn.pipeline import FeatureUnion

pipe3 = Pipeline([
                    ('u1', FeatureUnion([
                                            ('tfidf_features', Pipeline([
                                                                        ('cv', CountVectorizer()),
                                                                        ('tfidf_transformer', TfidfTransformer())
                                                                        ])),
                                            ('pos_features', Pipeline([
                                                                        ('pos', posTagMatrix(tokenizer= nltk.word_tokenize))
                                                                    ]))
                                        ])),
                
                    ('logit', LogisticRegression())
      
                ])


In [None]:
%%time
pipe3.fit(X_train, y_train)
pred_val = pipe3.predict_proba(X_val)
print(metrics.log_loss(y_val, pred_val))

### Subclass `CountVectorizer` and override its methods
* this way the vocabulary can be learned from the 'train' and 'test' texts together
* refer this [link](https://stackoverflow.com/questions/400739/what-does-asterisk-mean-in-python) on what `*args` and `**kwargs` means. Basically they are used to pack extra arguments to a function

    * def f1(*a): 'a' will be a tuple of extra parameters
    * def f2(**a): 'a' will be a dictionary of extra parameters

In [None]:
# inherit from the base class: CountVectorizer
class CountVectorizerplus(CountVectorizer):
    """``CountVectorizer`` whose vocabulary can also be learned from extra documents.

    All regular ``CountVectorizer`` parameters are accepted, in the same order,
    and passed on unchanged. They are spelled out explicitly (instead of
    ``*args`` / ``**kwargs``) because scikit-learn discovers an estimator's
    parameters from the ``__init__`` signature: varargs make ``get_params()``
    raise, and parameters hidden in ``**kwargs`` are invisible to
    ``get_params()`` / ``set_params()`` / ``clone()``.

    Parameters
    ----------
    add_test : iterable of str or None, default=None (keyword-only)
        Extra documents (e.g. the test set texts) that are added to the
        documents passed to ``fit_transform`` when the vocabulary is learned.
        They only influence the vocabulary; no rows are returned for them.
    """

    def __init__(self, input="content", encoding="utf-8", decode_error="strict",
                 strip_accents=None, lowercase=True, preprocessor=None,
                 tokenizer=None, stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), analyzer="word", max_df=1.0, min_df=1,
                 max_features=None, vocabulary=None, binary=False, dtype=np.int64,
                 *, add_test=None):
        # initialize using the super class or parent class
        super().__init__(
            input=input,
            encoding=encoding,
            decode_error=decode_error,
            strip_accents=strip_accents,
            lowercase=lowercase,
            preprocessor=preprocessor,
            tokenizer=tokenizer,
            stop_words=stop_words,
            token_pattern=token_pattern,
            ngram_range=ngram_range,
            analyzer=analyzer,
            max_df=max_df,
            min_df=min_df,
            max_features=max_features,
            vocabulary=vocabulary,
            binary=binary,
            dtype=dtype,
        )
        self.add_test = add_test

    def fit_transform(self, X, y=None):
        """Learn the vocabulary from ``X`` (plus ``add_test``) and return the counts for ``X``.

        Returns the document-term matrix with one row per document in ``X``;
        documents from ``add_test`` are used for the vocabulary only.
        """
        if self.add_test is None:
            return super().fit_transform(X, y)

        if isinstance(X, str):
            raise ValueError(
                "Iterable over raw text documents expected, string object received."
            )

        # add the test along with train
        train_docs = list(X)
        all_docs = train_docs + list(self.add_test)

        # Call the parent's fit_transform() rather than fit(): the original author
        # noted that going through fit() ran out of memory. If memory is still a
        # problem, limit the vocabulary size with max_features.
        counts = super().fit_transform(all_docs, y)

        # The training documents are the first rows of the combined matrix, so
        # slice them out instead of tokenizing the training text a second time.
        return counts[:len(train_docs)]

In [None]:
pipe4 = Pipeline([
                    ('cv', CountVectorizerplus(add_test = test['text'])),
                    ('tfidf_transformer', TfidfTransformer()),
                    ('ber_nb', BernoulliNB())
                ])

In [None]:
%%time
pipe4.fit(X_train, y_train)
pred_val = pipe4.predict_proba(X_val)

print('Metric log loss', metrics.log_loss(y_val, pred_val))