In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/spooky-author-identification/train.zip
/kaggle/input/spooky-author-identification/sample_submission.zip
/kaggle/input/spooky-author-identification/test.zip


In [2]:
!unzip '/kaggle/input/spooky-author-identification/train.zip'
!unzip '/kaggle/input/spooky-author-identification/test.zip'

Archive:  /kaggle/input/spooky-author-identification/train.zip
  inflating: train.csv               
Archive:  /kaggle/input/spooky-author-identification/test.zip
  inflating: test.csv                


In [3]:
for dirname, _, filenames in os.walk(os.getcwd()):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/working/train.csv
/kaggle/working/test.csv
/kaggle/working/__notebook_source__.ipynb


In [4]:
import numpy as np
import pandas as pd
import time
import gc

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## Advantages of using pipelines:
1. train and test set tranformations are taken care automatically
2. hyperparameter tuning is made easier. Set new parameters on any estimator in the pipeline and refit - all this is one line. Or use GridSearch on pipeline
3. Model description is easier

## 1. Data preparation

In [5]:
# read data
train = pd.read_csv('/kaggle/working/train.csv')
test = pd.read_csv('/kaggle/working/test.csv')

train.head(10)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL
5,id22965,"A youth passed in solitude, my best years spen...",MWS
6,id09674,"The astronomer, perhaps, at this point, took r...",EAP
7,id13515,The surcingle hung in ribands from my body.,EAP
8,id19322,I knew that you could not say to yourself 'ste...,EAP
9,id00912,I confess that neither the structure of langua...,MWS


In [6]:
# encode labels: 'author'
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(train['author'])
print('Classes encoded by Label encode', le.classes_)

train['author'] = le.transform(train['author'])

Classes encoded by Label encode ['EAP' 'HPL' 'MWS']


In [7]:
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",0
1,id17569,It never once occurred to me that the fumbling...,1
2,id11008,"In his left hand was a gold snuff box, from wh...",0
3,id27763,How lovely is spring As we looked from Windsor...,2
4,id12958,"Finding nothing else, not even gold, the Super...",1


__Split the train into train and validation sets__

In [8]:
from sklearn import model_selection, metrics

X = train['text']
y = train['author']

# use stratify to have equal proportion of author classes in the train and validation splits
X_train, X_val, y_train, y_val =  model_selection.train_test_split(X, y, test_size=0.3, shuffle=True, stratify = train['author'], random_state= 2020)

### __Create the first pipeline__

To understand the difference between `TfidfVectorizer` and `TfidfTransformer` refer [here](https://stackoverflow.com/questions/54745482/what-is-the-difference-between-tfidf-vectorizer-and-tfidf-transformer)

In [9]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression

pipe1 = Pipeline([
                ('cv', CountVectorizer()),
                ('tfidf_transformer', TfidfTransformer()),
                ('logit', LogisticRegression())              
            ])

### Fitting the model
Since our `pipe1` object has all the properties of an estimator, we can treat it as one. Hence we can use the `fit()`, `transform` and `fit_transform` method if it is defined for the `class`. For `Logistic Regression`, which is the final model in the pipeline, we will use only the fit method

In [10]:
pipe1.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf_transformer',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('logit',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_s

Steps of the pipeline

In [11]:
pipe1.steps

[('cv',
  CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                  dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                  lowercase=True, max_df=1.0, max_features=None, min_df=1,
                  ngram_range=(1, 1), preprocessor=None, stop_words=None,
                  strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                  tokenizer=None, vocabulary=None)),
 ('tfidf_transformer',
  TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
 ('logit',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False))]

Accessing each step's parameters by name, methods and properties
* `pipeline.named_steps`: returns a dictionary with the keys as model names

In [12]:
# Coefficient of the Logistic Regression model
pipe1.named_steps['logit'].coef_

array([[ 0.032849  ,  0.01721665,  0.17807575, ...,  0.06896037,
        -0.01142085, -0.23908224],
       [-0.02068407, -0.00707508, -0.08608624, ..., -0.0125657 ,
         0.02025188,  0.24707146],
       [-0.01216493, -0.01014157, -0.09198951, ..., -0.05639467,
        -0.00883103, -0.00798922]])

**Making predictions:**
- Use predict()
- Use predict_proba()

In [13]:
pred_val = pipe1.predict_proba(X_val)
print('Log loss on pipeline using cv-tfidftransformer-logit is : ', metrics.log_loss(y_val, pred_val))

Log loss on pipeline using cv-tfidftransformer-logit is :  0.5532037740068139


### Parameter tuning in the pipeline
* We can get the parameters in each step by it's name

In [14]:
pipe1.named_steps['logit'].get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

### Getting the parameters all at once for all the steps in the pipeline
* paramters are prepended with the step name for easier identification

In [15]:
pipe1.get_params()

{'memory': None,
 'steps': [('cv',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                   lowercase=True, max_df=1.0, max_features=None, min_df=1,
                   ngram_range=(1, 1), preprocessor=None, stop_words=None,
                   strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, vocabulary=None)),
  ('tfidf_transformer',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('logit',
   LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'cv': CountVectorizer(analyzer='word', binary=

### Set the parameters within the pipeline
* set_params()

In [16]:
%%time
# set params and train
pipe1.set_params(cv__max_df = 0.8, cv__min_df = 6, cv__ngram_range=(1,3)).fit(X_train, y_train)
pred_val = pipe1.predict_proba(X_val)
print(metrics.log_loss(y_val, pred_val))

0.5650860396812747
CPU times: user 9.76 s, sys: 622 ms, total: 10.4 s
Wall time: 8.36 s


### Use a Naive Bayes model in the pipeline

In [17]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

pipe2 = Pipeline([
                    ('cv', CountVectorizer()),
                    ('tfidf_transformer', TfidfTransformer()),
                    ('bern_nb', BernoulliNB())
                ])

In [18]:
pipe2.fit(X_train, y_train)
print(metrics.log_loss(y_val, pipe2.predict_proba(X_val)))

0.5409585440753816


### Feature Union
* Helps to combine new features created as part of EDA

- Now we will collect some meta features from the text like punctuation_count, parts of speech tag count

### NLTK - POS(parts of speech) tagger

In [19]:
import nltk
sample = train['text'].sample(1).values.tolist()[0]
print('text: \n', sample)
tokens = nltk.word_tokenize(sample)
pos_tag = nltk.pos_tag(tokens)

print(pos_tag)

text: 
 Yet I thank God that I have lived I thank God, that I have beheld his throne, the heavens, and earth, his footstool.
[('Yet', 'RB'), ('I', 'PRP'), ('thank', 'VBP'), ('God', 'NNP'), ('that', 'IN'), ('I', 'PRP'), ('have', 'VBP'), ('lived', 'VBN'), ('I', 'PRP'), ('thank', 'VBP'), ('God', 'NNP'), (',', ','), ('that', 'IN'), ('I', 'PRP'), ('have', 'VBP'), ('beheld', 'VBN'), ('his', 'PRP$'), ('throne', 'NN'), (',', ','), ('the', 'DT'), ('heavens', 'NNS'), (',', ','), ('and', 'CC'), ('earth', 'NN'), (',', ','), ('his', 'PRP$'), ('footstool', 'NN'), ('.', '.')]


In [20]:
sample_df = train.sample(frac=0.001)
sample_df['text']

3322     It was a long dog trot to the ruined station, ...
1733     I hear a noise at the door, as of some immense...
13517                    I was then only twenty years old.
13474    I would have escorted her myself, but felt tha...
10821    We were all equal now; magnificent dwellings, ...
12508    With this, she took my arm, and I attended her...
411      Then too, we had spoken to him in advance of o...
5106     It is rather fortunate that Elwood was not in ...
4414     Most of these, however, soon shewed their pove...
18441    Into the granite city of Teloth wandered the y...
13107    He was not, as the other traveller seemed to b...
6790     Why need I paint, Charmion, the now disenchain...
6731            I often said to myself, my father is dead.
4504     There was a hideous screaming which echoed abo...
8453     "That is, sailors that didn't hail from Innsmo...
15933    P. Still, there is one of your expressions whi...
10292    It had not been their intention to return; but.

In [21]:
from collections import Counter
Counter(tag for word, tag in pos_tag)


def cnt_pos(sentence):
    return Counter(tag for word, tag in nltk.pos_tag(nltk.word_tokenize(sentence)))

sample_df['text'].apply(cnt_pos).apply(pd.Series).fillna(0)

Unnamed: 0,PRP,VBD,DT,JJ,NN,TO,VBN,",",CC,VBZ,...,WRB,:,POS,NNP,JJS,EX,WDT,``,'',PDT
3322,2.0,2.0,4.0,4.0,5.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1733,2.0,0.0,3.0,1.0,4.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13517,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13474,4.0,3.0,2.0,1.0,2.0,1.0,1.0,3.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10821,1.0,2.0,2.0,3.0,0.0,1.0,1.0,3.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12508,3.0,2.0,1.0,0.0,2.0,0.0,0.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
411,5.0,1.0,1.0,2.0,3.0,1.0,1.0,1.0,2.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5106,1.0,1.0,2.0,4.0,2.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
4414,2.0,2.0,4.0,3.0,7.0,1.0,0.0,3.0,3.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
18441,0.0,1.0,4.0,3.0,10.0,0.0,2.0,2.0,1.0,1.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


__More on the tags and what they represent__

In [22]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

### 'Tag' text in the pipeline
Inorder to allow for 'tag' of text within the pipeline, we need to create a `transformer` or `estimator class`, which will `inherit` some `base classes` and `overload` a few functions that will need to be used

In [23]:
# import base classes
from sklearn.base import BaseEstimator, TransformerMixin
from collections import Counter

# define a new class, inherit from the base classes
class posTagMatrix(BaseEstimator, TransformerMixin):
    
    # use a custom tokenizer if not passed
    # normalize = True, divide by the total number of tags in the sentence
    
    def __init__(self, tokenizer = lambda x: x.split(), normalize = True):
        self.tokenizer = tokenizer
        self.normalize = normalize
        
    # helper function to tokenizer and count pos tags
    def count_pos(self, sentence):
        return Counter(tag for word, tag in nltk.pos_tag(self.tokenizer(sentence)))
    
    
    # this doesn't do anything and this makes it possible to use the fit() method
    def fit(self, X, y=None):
        return self
    
    
    # all the transformation is done here
    def transform(self, X):
        # the fit method needs to be applied to a series on text
        # this returns the count for individual tags in each sentence
        X_tagged = X.apply(self.count_pos).apply(pd.Series).fillna(0)
        # sum the count of all tags in the sentence
        X_tagged['total_pos_tags'] = X_tagged.apply(sum, axis=1)
        if self.normalize:
            X_tagged = X_tagged.divide(X_tagged['total_pos_tags'], axis=0)
            
        return X_tagged

### New pipeline with the pos tags included

In [24]:
from sklearn.pipeline import FeatureUnion

pipe3 = Pipeline([
                    ('u1', FeatureUnion([
                                            ('tfidf_features', Pipeline([
                                                                        ('cv', CountVectorizer()),
                                                                        ('tfidf_transformer', TfidfTransformer())
                                                                        ])),
                                            ('pos_features', Pipeline([
                                                                        ('pos', posTagMatrix(tokenizer= nltk.word_tokenize))
                                                                    ]))
                                        ])),
                
                    ('logit', LogisticRegression())
      
                ])


In [25]:
%%time
pipe3.fit(X_train, y_train)
pred_val = pipe3.predict_proba(X_val)
print(metrics.log_loss(y_val, pred_val))

0.6174722434255739
CPU times: user 1min 6s, sys: 879 ms, total: 1min 7s
Wall time: 1min 5s


### Overload the 'CountVectorizer' class
* this way the transformations can be applied to both 'train' and 'test' sets together
* refer this [link](https://stackoverflow.com/questions/400739/what-does-asterisk-mean-in-python) on what `*args` and `**kwargs` means. Basically they are used to pack extra arguments to a function

    * def f1(*a): 'a' will be a tuple of extra parameters
    * def f2(**a): 'a' will be a dictionary of extra parameters

In [26]:
# inherit from the base class: CountVectorizer
class CountVectorizerplus(CountVectorizer):
    
    def __init__(self, *args, add_test=None, **kwargs):
        self.add_test = add_test
        # initialize using the super class or parent class
        super().__init__(*args, **kwargs)   
    
    
    def transform(self, X):
        # use the transform method from the super class or parent class
        U = super().transform(X)
        return U
    
    def fit_transform(self, X , y=None):
        if self.add_test is not None:
            # add the test along with train
            X_new = pd.concat([X, self.add_test])
        else:
            X_new = X
            
        # Call the CountVectorizer.fit_transform() method
        # or using the method from the super or parent class
        
        # apply the fit_transform to both train and test, if test is also provided
        # using only the fit() method cause out of memory probaly because of the huge vocabulary from train and test combined
        # if that is the case, the other options to limit the number of max_features learned
        #super().fit_transform(X_new, y)
        super().fit_transform(X_new, y)
        
        U = self.transform(X)
        
        return U

In [27]:
pipe4 = Pipeline([
                    ('cv', CountVectorizerplus(add_test = test['text'])),
                    ('tfidf_transformer', TfidfTransformer()),
                    ('ber_nb', BernoulliNB())
                ])

In [28]:
%%time
pipe4.fit(X_train, y_train)
pred_val = pipe4.predict_proba(X_val)

print('Metric log loss', metrics.log_loss(y_val, pred_val))

Metric log loss 0.5614716252475621
CPU times: user 2.15 s, sys: 0 ns, total: 2.15 s
Wall time: 2.15 s


__Limiting the number of max_features learned__

    * By limiting the number of `max_features' in CountVectorizer, we get a slightly better score
    * This parameters is passed and intiliazed using the based class of Countvectorizer and not is not blanketed under args and kwargs

In [29]:
%%time
pipe5 = Pipeline([
                    ('cv', CountVectorizerplus(add_test = test['text'], max_features =15000)),
                    ('tfidf_transformer', TfidfTransformer()),
                    ('ber_nb', BernoulliNB())
                ])


pipe5.fit(X_train, y_train)
pred_val = pipe5.predict_proba(X_val)

print('Metric log loss', metrics.log_loss(y_val, pred_val))

Metric log loss 0.517943439619949
CPU times: user 2.12 s, sys: 3.87 ms, total: 2.12 s
Wall time: 2.12 s


### Stacking pipelines
Pipelines cannot have more than one final estimator. So the below cannot execute

In [None]:
# pipe6 = Pipeline([
#                     ('u1', FeatureUnion([
#                                             ('tfidf_features', Pipeline([
#                                                                         ('cv', CountVectorizer()),
#                                                                         ('tfidf', TfidfTransformer()),
#                                                                         ('tfidf_logit', LogisticRegression())
#                                                                         ])),
#                                             ('pos_features', Pipeline([
#                                                                         ('pos', PosTagMatrix(tokenizer = nltk.word_tokenize)),
#                                                                         ('pos_logit', LogistiRegression())
#                                                                         ])),
#                     ('xgb', XGBClassifier())

#                                         ]))
#         ])

### Stacking trick:

To makes use of the classifier in multiple stages of pipeline, create a `wrapper` around the `classifier`. By doing so we will convert it the `classifier` into a `transformer` class, which will transform the input data into predictions

In [31]:
class Classifierwrapper(BaseEstimator, TransformerMixin):
    
    def __init__(self, estimator, verbose = None, fit_params = None, use_proba = True, scoring = None):
        
        self.estimator = estimator
        
        # control verbosity: # 0 means no verbose, 1 - moderately verbose, 2 - more verbose
        # self.verbose = verbose
        
        if verbose is None:
            self.verbose = 0
        else:
            self.verbose = verbose
        
        # if need to fit parameters
        self.fit_params = fit_params
        
        # if need to use predict_proba in transform method
        self.use_proba = use_proba
        
        # calculate the validation score based on the scoring function
        self.scoring = scoring
        
        # variable to store the store if the scoring function is set
        self.score = None
        
        
    def fit(self, X, y):
        fp = self.fit_params
        
        if self.verbose == 2:
            print('X shape', X.shape, "\n fit params:", self.fit_params)
        
        if fp is not None:
            self.estimator.fit(X, y, **fp)
        else:
            self.estimator.fit(X, y)
        
        return self
    
    
    def transform(self, X):
        # is the estimator needs to use the predict_proba method
        if self.use_proba:
            return self.estimator.predict_proba(X)
        else:
            return self.estimator.predict(X)
        
        
    def fit_transform(self, X, y, **kwargs):
        self.fit(X, y)
        p = self.transform(X)
        return p
    
    
    def predict(self, X):
        return self.estimator.predict(X)
    
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

Using XGBClassifier as the final estimator in the pipeline

In [34]:
from xgboost import XGBClassifier

xgb_params = {
                'objective': 'multi:softprob',
                'max_depth': 3,
                'eta': 0.1,
                'silent': 1,
                'min_child_weight': 1,
                'subsample': 0.8,
                'colsample_by_tree': 0.7,
                'seed': 2020,
                'num_rounds': 2000
}

All estimators within the `FeatureUnion` of  2 pipelines should be able to implement fit and transform method.XGBClassifier doesn't have a transform method. So it has to be added as part of the pipeline but outside the `FeatureUnion`

In [37]:
pipe7 = Pipeline([
                    ('u1', FeatureUnion([
                                            ('tfidf_features', Pipeline([
                                                                        ('cv', CountVectorizer()),
                                                                        ('tfidf', TfidfTransformer()),
                                                                        ('tfidf_logit', Classifierwrapper(LogisticRegression()))
                                                                        ])),
                                            ('pos_features', Pipeline([
                                                                        ('pos', posTagMatrix(tokenizer = nltk.word_tokenize)),
                                                                        ('pos_logit', Classifierwrapper(LogisticRegression()))
                                                                        ]))
                                        ])),
    
                    ('xgb', XGBClassifier(**xgb_params)),

                                        
        ])

In [38]:
pipe7.fit(X_train, y_train)
pred_val = pipe7.predict_proba(X_val)

print('Metric log loss', metrics.log_loss(y_val, pred_val))

Metric log loss 0.5661913624741971


### Submission

Now that we have create a pipeline it becomes easier to train the model on the full train set without data leakage to test

In [41]:
%%time

# fit the model again on the enire train set
pipe7.fit(train['text'], le.fit_transform(train['author']))

# predictions on the actual test
pred = pipe7.predict_proba(test['text'])

CPU times: user 1min 42s, sys: 1.26 s, total: 1min 44s
Wall time: 1min 32s


In [48]:
submission = pd.DataFrame(pred, columns = ['EAP','HPL', 'MWS'], index = test.id)
submission.to_csv('submission_pipelines.csv', index=id)

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.115672,0.007282,0.877046
id24541,0.987783,0.010618,0.001599
id00134,0.024235,0.970655,0.005111
id27757,0.948460,0.046909,0.004631
id04081,0.877755,0.014504,0.107741
...,...,...,...
id11749,0.952656,0.003652,0.043691
id10526,0.001740,0.000468,0.997792
id13477,0.990142,0.005216,0.004643
id13761,0.099599,0.007766,0.892635
