In [167]:
import gc
import numpy as np
import pandas as pd
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm
import lightgbm as lgb

df = pd.read_csv("regulation_cleaned.csv")

del df["label"]

df["class"] = df["class"].map(str.strip)  ## remove excess whitespace 

print(df["class"].value_counts().median()) ### 148 Perfect

## Makes sure that there are at least five samples per label
count_class = 5  ## all are above 5 in this case 
list_two = list(df["class"].value_counts()[df["class"].value_counts()>count_class].index.values)
df = df[df["class"].isin(list_two)].reset_index(drop=True)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['class'] = le.fit_transform(df['class'])  


148.0


I quickly created a test for punctuation, stop words, spelling, stemming, lemmatisation and combinations. Stem and lem together seems to lead to the best performance. All other combinations seem to be inferior. 

In [168]:
def test_preprocessing(data, punct=False, stop=False, stem=False, lem=False, stem_lem=False, spell_before=False, spell_after=False):
    import pandas as pd
    from bs4 import BeautifulSoup
    from nltk.corpus import stopwords
    import re
    import string
    from nltk.stem import PorterStemmer
    from nltk.stem import WordNetLemmatizer
    from autocorrect import spell

    def removePunctuation(x):
        # Lowercasing all words
        x = x.lower()
        # Removing non ASCII chars
        x = re.sub(r'[^\x00-\x7f]',r' ',x)
        # Removing (replacing with empty spaces actually) all the punctuations
        return re.sub("["+string.punctuation+"]", " ", x)

    stops = set(stopwords.words("english"))
    def removeStopwords(x):
        # Removing all the stopwords
        filtered_words = [word for word in x.split() if word not in stops]
        return " ".join(filtered_words)
    
    p = PorterStemmer()
    def Stemmer(sentence):
        return ' '.join([p.stem(x.lower()) for x in re.split('\W', sentence) if len(x) >= 1])

    L = WordNetLemmatizer()
    def Lemma(sentence):
        return ' '.join([L.lemmatize(x.lower()) for x in re.split('\W', sentence) if len(x) > 1])

    p = PorterStemmer()
    L = WordNetLemmatizer()
    def StemAndLem(sentence):
        return ' '.join([p.stem(L.lemmatize(x.lower())) for x in re.split('\W', sentence) if len(x) > 1])

    def Spelling(sentence):
        return ' '.join([spell(x.lower()) for x in re.split('\W', sentence) if len(x) > 1])

    if spell_before:
        data["documents"] = data["documents"].map(Spelling)
    if punct:
        data["documents"] = data["documents"].map(removePunctuation)
    if stop:
        data["documents"] = data["documents"].map(removeStopwords)
    if stem:
        data["documents"] = data["documents"].map(Stemmer)
    if lem:
        data["documents"] = data["documents"].map(Lemma)
    if stem_lem:
        data["documents"] = data["documents"].map(StemAndLem)
    if spell_after:
        data["documents"] = data["documents"].map(Spelling)
        
    from sklearn.model_selection import train_test_split

    train, test, y_train, y_test = train_test_split(data.drop(["class"],axis=1), data["class"], test_size=0.4, random_state=0, stratify= data["class"])

    print(train.head())
    # print(test.head())
    print(train.shape, test.shape)


    # Extract features
    def extract_features(data):
        data['documents_len'] = data['documents'].apply(lambda x: len(str(x)))
        data['documents_wc'] = data['documents'].apply(lambda x: len(str(x).split(' ')))

    extract_features(train)
    extract_features(test)


    df_all = pd.concat([train, test], axis=0)
    gc.collect()

    df_all.head()

    # Preprocess text
    print('Preprocessing text...')
    cols = [
        'documents', 
    ]
    n_features = [
        240, 
    ]

    for c_i, c in tqdm(enumerate(cols)):
        tfidf = TfidfVectorizer(
            max_features=n_features[c_i],
            norm='l2',
            )
        tfidf.fit(df_all[c])
        tfidf_train = np.array(tfidf.transform(train[c]).toarray(), dtype=np.float16)
        tfidf_test = np.array(tfidf.transform(test[c]).toarray(), dtype=np.float16)

        for i in range(n_features[c_i]):
            train[c + '_tfidf_' + str(i)] = tfidf_train[:, i]
            test[c + '_tfidf_' + str(i)] = tfidf_test[:, i]

        del tfidf, tfidf_train, tfidf_test
        gc.collect()

    print('Done.')
    del df_all
    gc.collect()

    # Prepare data

    cols_to_drop = [
        'class',
        'documents',
        'is_test',
        'index'
    ]
    X = train.drop(cols_to_drop, axis=1, errors='ignore').reset_index(drop=True)

    y = y_train.reset_index(drop=True)
    X_test = test.drop(cols_to_drop, axis=1, errors='ignore').reset_index(drop=True)

    ## drop is_test

    pd.options.display.max_columns = None

    X.head()

    print(len(y.unique()), len(y_test.unique()))

    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report
    import pandas as pd
    from sklearn.metrics import precision_recall_fscore_support


    id_test = test.index.values
    feature_names = list(X.columns)
    print(X.shape, X_test.shape)

    #del train, test
    gc.collect()

    # Build the model
    cnt = 0
    p_buf = []
    n_splits = 3
    n_repeats = 1
    kf = RepeatedKFold(
        n_splits=n_splits, 
        n_repeats=n_repeats, 
        random_state=0)
    auc_buf = []   

    for train_index, valid_index in kf.split(X):
        print('Fold {}/{}'.format(cnt + 1, n_splits))
        params = {
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            #'objective': 'multiclassova',
            'num_class': 46, #len(y.unique())
            # 'metric': ['multi_logloss'],
            'metric': ['multi_error'],## 
            'max_depth': 14,
            'num_leaves': 31,
            'learning_rate': 0.025,
            'feature_fraction': 0.85,
            'bagging_fraction': 0.85,
            'bagging_freq': 5,
            'verbose': 0,
            'num_threads': 1,
            'lambda_l2': 1.0,
            'min_gain_to_split': 0,
        }  

        lgb_train = lgb.Dataset(
            X.loc[train_index], 
            y.loc[train_index], 
            feature_name=feature_names,
            )
        lgb_train.raw_data = None

        lgb_valid = lgb.Dataset(
            X.loc[valid_index], 
            y.loc[valid_index],
            )
        lgb_valid.raw_data = None

        model = lgb.train(
            params,
            lgb_train,
            num_boost_round=10000,
            valid_sets=[lgb_train, lgb_valid],
            early_stopping_rounds=100,
            verbose_eval=100,
        )

        if cnt == 0:
            importance = model.feature_importance()
            model_fnames = model.feature_name()
            tuples = sorted(zip(model_fnames, importance), key=lambda x: x[1])[::-1]
            tuples = [x for x in tuples if x[1] > 0]
            print('Important features:')
            for i in range(60):
                if i < len(tuples):
                    print(tuples[i])
                else:
                    break

            del importance, model_fnames, tuples

        y_pred = model.predict(X.loc[valid_index], num_iteration=model.best_iteration)

        pred_class = [np.argmax(line) for line in y_pred]

        #cm = confusion_matrix(y.loc[valid_index], pred_class)

        cm = classification_report(y.loc[valid_index], pred_class, target_names=list(le.inverse_transform(y.loc[valid_index].sort_values().unique())))
        print(cm)

        report = pd.DataFrame(list(precision_recall_fscore_support(y.loc[valid_index], pred_class)),
                    index=['Precision', 'Recall', 'F1-score', 'Support']).T

        # Now add the 'Avg/Total' row
        report.loc['Avg/Total', :] = precision_recall_fscore_support(y.loc[valid_index], pred_class,
            average='weighted')
        report.loc['Avg/Total', 'Support'] = report['Support'].sum()

        p = model.predict(X_test, num_iteration=model.best_iteration)
        if len(p_buf) == 0:
            p_buf = np.array(p, dtype=np.float16)
        else:
            p_buf += np.array(p, dtype=np.float16)

        cnt += 1
        if cnt > 0: # Comment this to run several folds
            break

        del model, lgb_train, lgb_valid, p
        gc.collect

    preds = p_buf/cnt

    preds = [np.argmax(line) for line in preds]

    # Prepare submission
    subm = pd.DataFrame()
    subm['id'] = id_test
    subm['label'] = preds
    subm.to_csv('submission.csv', index=False)



    list_ind = list(le.inverse_transform(y.loc[valid_index].sort_values().unique()))

    list_ind.append("Avg/Total")

    report.index = list_ind

    return report


In [169]:
report_dict= {}
## spell takes very long, be cautious when running 
report_dict["normal"] = test_preprocessing(df)
report_dict["punct"] = test_preprocessing(df, punct=True)
report_dict["stop"] = test_preprocessing(df, stop=True)
report_dict["punct and stop"] = test_preprocessing(df, punct= True, stop=True)
report_dict["stem"] = test_preprocessing(df, stem= True)
report_dict["lem"] = test_preprocessing(df, lem= True)
report_dict["stem_lem"] = test_preprocessing(df, stem_lem = True)
report_dict["stop and stem"] = test_preprocessing(df, stem = True, stop=True)
report_dict["stem spell before"] = test_preprocessing(df, stem= True, spell_before=True)
report_dict["stem spell after"] = test_preprocessing(df, stem= True, spell_after=True)

### See lest detailed output in the next codeblock.

                                             documents
164  Governing payment institutions and registered ...
882               27 Dealings by Employees of a Dealer
846  Directive 2006/43/EC of the European Parliamen...
824                                          Murabahah
481  Royal Decree Regulating on Electronic Payment ...
(739, 1) (494, 1)


0it [00:00, ?it/s]

Preprocessing text...


1it [00:00,  1.34it/s]


Done.
6 6
(739, 242) (494, 242)
Fold 1/3
Training until validation scores don't improve for 100 rounds.
[100]	training's multi_error: 0.252033	valid_1's multi_error: 0.392713
[200]	training's multi_error: 0.182927	valid_1's multi_error: 0.392713
[300]	training's multi_error: 0.132114	valid_1's multi_error: 0.384615
Early stopping, best iteration is:
[229]	training's multi_error: 0.164634	valid_1's multi_error: 0.384615
Important features:
('documents_len', 4999)
('documents_wc', 2058)
('documents_tfidf_161', 1483)
('documents_tfidf_163', 1309)
('documents_tfidf_32', 1057)
('documents_tfidf_26', 814)
('documents_tfidf_102', 725)
('documents_tfidf_159', 608)
('documents_tfidf_228', 450)
('documents_tfidf_229', 429)
('documents_tfidf_192', 346)
('documents_tfidf_208', 295)
('documents_tfidf_118', 262)
('documents_tfidf_127', 260)
('documents_tfidf_176', 221)
('documents_tfidf_130', 202)
('documents_tfidf_48', 155)
('documents_tfidf_99', 139)
('documents_tfidf_172', 138)
('documents_tfidf_

  if diff:


             precision    recall  f1-score   support

    banking       0.64      0.78      0.71       100
  companies       0.42      0.36      0.39        36
      funds       0.41      0.28      0.33        25
  insurance       0.69      0.66      0.67        44
    payment       0.78      0.44      0.56        16
 securities       0.67      0.69      0.68        26

avg / total       0.61      0.62      0.60       247



  if diff:


                                             documents
164  governing payment institutions and registered ...
882               27 dealings by employees of a dealer
846  directive 2006 43 ec of the european parliamen...
824                                          murabahah
481  royal decree regulating on electronic payment ...
(739, 1) (494, 1)


0it [00:00, ?it/s]

Preprocessing text...


1it [00:00,  1.49it/s]


Done.
6 6
(739, 242) (494, 242)
Fold 1/3
Training until validation scores don't improve for 100 rounds.
[100]	training's multi_error: 0.25813	valid_1's multi_error: 0.404858
[200]	training's multi_error: 0.182927	valid_1's multi_error: 0.396761
[300]	training's multi_error: 0.130081	valid_1's multi_error: 0.380567
Early stopping, best iteration is:
[263]	training's multi_error: 0.148374	valid_1's multi_error: 0.376518
Important features:
('documents_len', 5413)
('documents_wc', 3347)
('documents_tfidf_161', 1631)
('documents_tfidf_163', 1380)
('documents_tfidf_31', 1196)
('documents_tfidf_25', 824)
('documents_tfidf_102', 752)
('documents_tfidf_159', 650)
('documents_tfidf_228', 518)
('documents_tfidf_229', 487)
('documents_tfidf_192', 329)
('documents_tfidf_208', 317)
('documents_tfidf_118', 292)
('documents_tfidf_127', 283)
('documents_tfidf_176', 233)
('documents_tfidf_130', 221)
('documents_tfidf_48', 193)
('documents_tfidf_99', 161)
('documents_tfidf_126', 160)
('documents_tfidf_1

  if diff:


             precision    recall  f1-score   support

    banking       0.66      0.79      0.72       100
  companies       0.44      0.42      0.43        36
      funds       0.40      0.24      0.30        25
  insurance       0.68      0.64      0.66        44
    payment       0.78      0.44      0.56        16
 securities       0.66      0.73      0.69        26

avg / total       0.61      0.62      0.61       247



  if diff:


                                             documents
164  governing payment institutions registered paym...
882                       27 dealings employees dealer
846  directive 2006 43 ec european parliament counc...
824                                          murabahah
481  royal decree regulating electronic payment ser...
(739, 1)

0it [00:00, ?it/s]

 (494, 1)
Preprocessing text...


1it [00:00,  1.30it/s]


Done.
6 6
(739, 242) (494, 242)
Fold 1/3
Training until validation scores don't improve for 100 rounds.
[100]	training's multi_error: 0.286585	valid_1's multi_error: 0.417004
Early stopping, best iteration is:
[7]	training's multi_error: 0.424797	valid_1's multi_error: 0.404858
Important features:
('documents_len', 137)
('documents_wc', 79)
('documents_tfidf_28', 39)
('documents_tfidf_163', 26)
('documents_tfidf_177', 21)
('documents_tfidf_212', 15)
('documents_tfidf_133', 14)
('documents_tfidf_130', 14)
('documents_tfidf_122', 13)
('documents_tfidf_194', 10)
('documents_tfidf_48', 7)
('documents_tfidf_129', 5)
('documents_tfidf_103', 4)
('documents_tfidf_49', 1)
             precision    recall  f1-score   support

    banking       0.54      0.91      0.68       100
  companies       0.55      0.31      0.39        36
      funds       0.56      0.20      0.29        25
  insurance       0.93      0.57      0.70        44
    payment       0.00      0.00      0.00        16
 securiti

  if diff:
  'precision', 'predicted', average, warn_for)
  if diff:


                                             documents
164  governing payment institutions registered paym...
882                       27 dealings employees dealer
846  directive 2006 43 ec european parliament counc...
824                                          murabahah
481  royal decree regulating electronic payment ser...
(739, 1) (494, 1)

0it [00:00, ?it/s]


Preprocessing text...


1it [00:00,  2.00it/s]


Done.
6 6
(739, 242) (494, 242)
Fold 1/3
Training until validation scores don't improve for 100 rounds.
[100]	training's multi_error: 0.286585	valid_1's multi_error: 0.417004
Early stopping, best iteration is:
[7]	training's multi_error: 0.424797	valid_1's multi_error: 0.404858
Important features:
('documents_len', 137)
('documents_wc', 79)
('documents_tfidf_28', 39)
('documents_tfidf_163', 26)
('documents_tfidf_177', 21)
('documents_tfidf_212', 15)
('documents_tfidf_133', 14)
('documents_tfidf_130', 14)
('documents_tfidf_122', 13)
('documents_tfidf_194', 10)
('documents_tfidf_48', 7)
('documents_tfidf_129', 5)
('documents_tfidf_103', 4)
('documents_tfidf_49', 1)
             precision    recall  f1-score   support

    banking       0.54      0.91      0.68       100
  companies       0.55      0.31      0.39        36
      funds       0.56      0.20      0.29        25
  insurance       0.93      0.57      0.70        44
    payment       0.00      0.00      0.00        16
 securiti

  if diff:
  if diff:


                                             documents
164  govern payment institut regist payment servic ...
882                             27 deal employe dealer
846  direct 2006 43 ec european parliament council ...
824                                          murabahah
481  royal decre regul electron payment servic spec...
(739, 1) (494, 1)


0it [00:00, ?it/s]

Preprocessing text...


1it [00:00,  2.09it/s]


Done.
6 6
(739, 242) (494, 242)
Fold 1/3
Training until validation scores don't improve for 100 rounds.
[100]	training's multi_error: 0.243902	valid_1's multi_error: 0.340081
Early stopping, best iteration is:
[40]	training's multi_error: 0.292683	valid_1's multi_error: 0.327935
Important features:
('documents_len', 924)
('documents_wc', 307)
('documents_tfidf_158', 121)
('documents_tfidf_113', 115)
('documents_tfidf_27', 114)
('documents_tfidf_194', 90)
('documents_tfidf_210', 81)
('documents_tfidf_122', 76)
('documents_tfidf_45', 60)
('documents_tfidf_174', 59)
('documents_tfidf_125', 44)
('documents_tfidf_162', 40)
('documents_tfidf_106', 39)
('documents_tfidf_100', 33)
('documents_tfidf_66', 31)
('documents_tfidf_169', 22)
('documents_tfidf_120', 9)
('documents_tfidf_83', 9)
('documents_tfidf_31', 2)
             precision    recall  f1-score   support

    banking       0.63      0.87      0.73       100
  companies       0.67      0.39      0.49        36
      funds       0.60  

  if diff:
  if diff:


                                             documents
164  govern payment institut regist payment servic ...
882                             27 deal employe dealer
846  direct 2006 43 ec european parliament council ...
824                                          murabahah
481  royal decre regul electron payment servic spec...
(739, 1) (494, 1)


0it [00:00, ?it/s]

Preprocessing text...


1it [00:00,  1.97it/s]


Done.
6 6
(739, 242) (494, 242)
Fold 1/3
Training until validation scores don't improve for 100 rounds.
[100]	training's multi_error: 0.252033	valid_1's multi_error: 0.34413
Early stopping, best iteration is:
[28]	training's multi_error: 0.29878	valid_1's multi_error: 0.331984
Important features:
('documents_len', 546)
('documents_wc', 194)
('documents_tfidf_28', 93)
('documents_tfidf_159', 91)
('documents_tfidf_114', 75)
('documents_tfidf_194', 64)
('documents_tfidf_124', 63)
('documents_tfidf_208', 54)
('documents_tfidf_174', 50)
('documents_tfidf_46', 40)
('documents_tfidf_127', 33)
('documents_tfidf_163', 28)
('documents_tfidf_107', 27)
('documents_tfidf_169', 23)
('documents_tfidf_101', 23)
('documents_tfidf_66', 23)
('documents_tfidf_122', 6)
('documents_tfidf_83', 1)


  if diff:
  if diff:


             precision    recall  f1-score   support

    banking       0.61      0.86      0.71       100
  companies       0.64      0.39      0.48        36
      funds       0.67      0.48      0.56        25
  insurance       0.91      0.68      0.78        44
    payment       0.67      0.38      0.48        16
 securities       0.71      0.65      0.68        26

avg / total       0.69      0.67      0.66       247

                                             documents
164  govern payment institut regist payment servic ...
882                              27 deal employ dealer
846  direct 2006 43 ec european parliament council ...
824                                          murabahah
481  royal decr regul electron payment servic speci...
(739, 1) (494, 1)


0it [00:00, ?it/s]

Preprocessing text...


1it [00:00,  2.12it/s]


Done.
6 6
(739, 242) (494, 242)
Fold 1/3
Training until validation scores don't improve for 100 rounds.
[100]	training's multi_error: 0.247967	valid_1's multi_error: 0.331984
[200]	training's multi_error: 0.211382	valid_1's multi_error: 0.319838
Early stopping, best iteration is:
[199]	training's multi_error: 0.213415	valid_1's multi_error: 0.315789
Important features:
('documents_len', 5275)
('documents_wc', 1858)
('documents_tfidf_27', 722)
('documents_tfidf_157', 703)
('documents_tfidf_113', 507)
('documents_tfidf_194', 497)
('documents_tfidf_208', 362)
('documents_tfidf_100', 352)
('documents_tfidf_45', 331)
('documents_tfidf_174', 316)
('documents_tfidf_122', 276)
('documents_tfidf_125', 263)
('documents_tfidf_106', 195)
('documents_tfidf_161', 188)
('documents_tfidf_121', 158)
('documents_tfidf_66', 144)
('documents_tfidf_168', 141)
('documents_tfidf_84', 84)
('documents_tfidf_77', 79)
('documents_tfidf_32', 77)
('documents_tfidf_226', 12)
('documents_tfidf_141', 10)
('documents_

  if diff:
  if diff:


                                             documents
164  govern payment institut regist payment servic ...
882                              27 deal employ dealer
846  direct 2006 43 ec european parliament council ...
824                                          murabahah
481  royal decr regul electron payment servic speci...
(739, 1) (494, 1)


0it [00:00, ?it/s]

Preprocessing text...


1it [00:00,  2.23it/s]


Done.
6 6
(739, 242) (494, 242)
Fold 1/3
Training until validation scores don't improve for 100 rounds.
[100]	training's multi_error: 0.25813	valid_1's multi_error: 0.348178
[200]	training's multi_error: 0.211382	valid_1's multi_error: 0.327935
Early stopping, best iteration is:
[197]	training's multi_error: 0.215447	valid_1's multi_error: 0.327935
Important features:
('documents_len', 5344)
('documents_wc', 1726)
('documents_tfidf_27', 731)
('documents_tfidf_156', 682)
('documents_tfidf_112', 501)
('documents_tfidf_191', 443)
('documents_tfidf_207', 373)
('documents_tfidf_99', 328)
('documents_tfidf_43', 328)
('documents_tfidf_171', 323)
('documents_tfidf_124', 279)
('documents_tfidf_121', 278)
('documents_tfidf_105', 186)
('documents_tfidf_160', 184)
('documents_tfidf_119', 163)
('documents_tfidf_65', 151)
('documents_tfidf_166', 137)
('documents_tfidf_82', 90)
('documents_tfidf_31', 72)
('documents_tfidf_75', 64)
('documents_tfidf_225', 10)
('documents_tfidf_194', 4)
('documents_tfi

  if diff:
  if diff:


                                             documents
164  govern payment institut resist payment servic ...
882                              of deal employ dealer
846  direct 2006 of ec european parliament council ...
824                                          murabahah
481  royal dear regal electron payment servic speci...
(739, 1) (494, 1)


0it [00:00, ?it/s]

Preprocessing text...


1it [00:00,  1.93it/s]


Done.
6 6
(739, 242) (494, 242)
Fold 1/3
Training until validation scores don't improve for 100 rounds.
[100]	training's multi_error: 0.237805	valid_1's multi_error: 0.352227
Early stopping, best iteration is:
[17]	training's multi_error: 0.317073	valid_1's multi_error: 0.336032
Important features:
('documents_len', 316)
('documents_wc', 90)
('documents_tfidf_152', 54)
('documents_tfidf_105', 52)
('documents_tfidf_20', 52)
('documents_tfidf_186', 35)
('documents_tfidf_204', 32)
('documents_tfidf_166', 32)
('documents_tfidf_116', 30)
('documents_tfidf_119', 25)
('documents_tfidf_92', 21)
('documents_tfidf_153', 20)
('documents_tfidf_37', 20)
('documents_tfidf_98', 19)
('documents_tfidf_163', 15)
('documents_tfidf_156', 14)
('documents_tfidf_57', 14)
('documents_tfidf_114', 4)
('documents_tfidf_68', 1)
             precision    recall  f1-score   support

    banking       0.60      0.89      0.72       100
  companies       0.72      0.36      0.48        36
      funds       0.65      

  if diff:
  if diff:


                                             documents
164  govern payment institute resist payment servic...
882                              of deal employ dealer
846  direct 2006 of ec european parliament council ...
824                                          murabahah
481  royal dear regal electron payment service spec...
(739, 1) (494, 1)


0it [00:00, ?it/s]

Preprocessing text...


1it [00:00,  2.11it/s]


Done.
6 6
(739, 242) (494, 242)
Fold 1/3
Training until validation scores don't improve for 100 rounds.
[100]	training's multi_error: 0.243902	valid_1's multi_error: 0.348178
Early stopping, best iteration is:
[17]	training's multi_error: 0.313008	valid_1's multi_error: 0.336032
Important features:
('documents_len', 322)
('documents_wc', 98)
('documents_tfidf_152', 55)
('documents_tfidf_104', 53)
('documents_tfidf_20', 53)
('documents_tfidf_204', 33)
('documents_tfidf_186', 33)
('documents_tfidf_115', 32)
('documents_tfidf_166', 30)
('documents_tfidf_118', 25)
('documents_tfidf_91', 22)
('documents_tfidf_37', 20)
('documents_tfidf_153', 19)
('documents_tfidf_97', 18)
('documents_tfidf_163', 15)
('documents_tfidf_156', 14)
('documents_tfidf_58', 14)
('documents_tfidf_113', 3)
('documents_tfidf_68', 1)
             precision    recall  f1-score   support

    banking       0.60      0.89      0.72       100
  companies       0.72      0.36      0.48        36
      funds       0.61      

  if diff:
  if diff:


In [171]:
report_dict["stem_lem_stop"] = test_preprocessing(df, stem_lem= True, stop=True)
report_dict["stem_lem_punct"] = test_preprocessing(df, stem_lem= True, punct=True)

                                             documents
164  govern payment institut resist payment servic ...
882                                 deal employ dealer
846  direct 2006 ec european parliament council may...
824                                          murabahah
481  royal dear regal electron payment servic speci...
(739, 1) (494, 1)


0it [00:00, ?it/s]

Preprocessing text...


1it [00:00,  3.05it/s]


Done.
6 6
(739, 242) (494, 242)
Fold 1/3
Training until validation scores don't improve for 100 rounds.
[100]	training's multi_error: 0.25	valid_1's multi_error: 0.331984
Early stopping, best iteration is:
[29]	training's multi_error: 0.284553	valid_1's multi_error: 0.323887
Important features:
('documents_len', 682)
('documents_wc', 195)
('documents_tfidf_20', 106)
('documents_tfidf_153', 92)
('documents_tfidf_106', 81)
('documents_tfidf_117', 57)
('documents_tfidf_187', 54)
('documents_tfidf_167', 52)
('documents_tfidf_38', 50)
('documents_tfidf_204', 42)
('documents_tfidf_120', 41)
('documents_tfidf_156', 29)
('documents_tfidf_58', 28)
('documents_tfidf_93', 27)
('documents_tfidf_99', 26)
('documents_tfidf_163', 19)
('documents_tfidf_115', 11)
('documents_tfidf_26', 7)
             precision    recall  f1-score   support

    banking       0.62      0.90      0.73       100
  companies       0.67      0.33      0.44        36
      funds       0.63      0.48      0.55        25
  in

  if diff:
  if diff:


                                             documents
164  govern payment institut resist payment servic ...
882                                 deal employ dealer
846  direct 2006 ec european parliament council may...
824                                          murabahah
481  royal dear regal electron payment servic speci...
(739, 1) (494, 1)


0it [00:00, ?it/s]

Preprocessing text...


1it [00:00,  3.24it/s]


Done.
6 6
(739, 242) (494, 242)
Fold 1/3
Training until validation scores don't improve for 100 rounds.
[100]	training's multi_error: 0.25	valid_1's multi_error: 0.331984
Early stopping, best iteration is:
[19]	training's multi_error: 0.300813	valid_1's multi_error: 0.327935
Important features:
('documents_len', 436)
('documents_wc', 132)
('documents_tfidf_20', 70)
('documents_tfidf_153', 62)
('documents_tfidf_106', 52)
('documents_tfidf_117', 38)
('documents_tfidf_167', 36)
('documents_tfidf_38', 29)
('documents_tfidf_187', 26)
('documents_tfidf_120', 25)
('documents_tfidf_204', 24)
('documents_tfidf_99', 19)
('documents_tfidf_58', 19)
('documents_tfidf_156', 18)
('documents_tfidf_93', 17)
('documents_tfidf_163', 14)
('documents_tfidf_115', 8)
('documents_tfidf_26', 6)
             precision    recall  f1-score   support

    banking       0.62      0.89      0.73       100
  companies       0.67      0.33      0.44        36
      funds       0.57      0.48      0.52        25
  insu

  if diff:
  if diff:


In [172]:
### For this dataset a simple stem and lem performs the best. 

for key in report_dict.keys():
    print(key,":")
    print("_________________________")
    print(report_dict[key].iloc[-1,:])
    print("=========================")

normal :
_________________________
Precision      0.607338
Recall         0.615385
F1-score       0.603994
Support      247.000000
Name: Avg/Total, dtype: float64
punct :
_________________________
Precision      0.614561
Recall         0.623482
F1-score       0.611282
Support      247.000000
Name: Avg/Total, dtype: float64
stop :
_________________________
Precision      0.591105
Recall         0.595142
F1-score       0.552185
Support      247.000000
Name: Avg/Total, dtype: float64
punct and stop :
_________________________
Precision      0.591105
Recall         0.595142
F1-score       0.552185
Support      247.000000
Name: Avg/Total, dtype: float64
stem :
_________________________
Precision      0.686222
Recall         0.672065
F1-score       0.660029
Support      247.000000
Name: Avg/Total, dtype: float64
lem :
_________________________
Precision      0.686850
Recall         0.668016
F1-score       0.657279
Support      247.000000
Name: Avg/Total, dtype: float64
stem_lem :
___________

In [173]:
## Full Analysis  
for key in report_dict.keys():
    print(key,":")
    print("_________________________")
    print(report_dict[key])
    print("=========================")

normal :
_________________________
            Precision    Recall  F1-score  Support
banking      0.644628  0.780000  0.705882    100.0
companies    0.419355  0.361111  0.388060     36.0
funds        0.411765  0.280000  0.333333     25.0
insurance    0.690476  0.659091  0.674419     44.0
payment      0.777778  0.437500  0.560000     16.0
securities   0.666667  0.692308  0.679245     26.0
Avg/Total    0.607338  0.615385  0.603994    247.0
punct :
_________________________
            Precision    Recall  F1-score  Support
banking      0.663866  0.790000  0.721461    100.0
companies    0.441176  0.416667  0.428571     36.0
funds        0.400000  0.240000  0.300000     25.0
insurance    0.682927  0.636364  0.658824     44.0
payment      0.777778  0.437500  0.560000     16.0
securities   0.655172  0.730769  0.690909     26.0
Avg/Total    0.614561  0.623482  0.611282    247.0
stop :
_________________________
            Precision    Recall  F1-score  Support
banking      0.538462  0.910000