In [5]:
import gc
import numpy as np
import pandas as pd
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm
import lightgbm as lgb

# Load the cleaned regulation corpus. "class" is the prediction target; the
# "label" column is not used and is removed.
df = pd.read_csv("regulation_cleaned.csv")
y = "class"
df = df.drop(columns=["label"])

# Strip stray whitespace around the class names.
df["class"] = df["class"].map(str.strip)

# Number of samples per class (computed once and reused below).
class_counts = df["class"].value_counts()
print(class_counts.median())  # 148.0 in the recorded run

# Keep only the classes that have strictly more than `count_class` samples
# (every class passes this filter in the recorded run).
count_class = 5
list_two = list(class_counts[class_counts > count_class].index.values)
df = df[df["class"].isin(list_two)].reset_index(drop=True)

# Encode the class names as integers; `le` is kept so that the integer codes
# can be mapped back to names later.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['class'] = le.fit_transform(df['class'])


148.0


I quickly tested punctuation removal, stop-word removal, spell correction, stemming, lemmatisation, and combinations of these. Lemmatising and then stemming (`stem_lem`) seems to give the best performance; all other combinations appear to be inferior.

In [32]:
### y_test is still kept safe for later generalised performance.

def test_preprocessing(data, param, df_return=False, n_feat=240, punct=False, stop=False, stem=False, lem=False, stem_lem=False, spell_before=False, spell_after=False):
    """Preprocess the text, build TF-IDF features and optionally train/evaluate LightGBM.

    The enabled text-cleaning steps are applied to ``data["documents"]`` in a
    fixed order (spell_before, punct, stop, stem, lem, stem_lem, spell_after).
    The rows are then split 60/40 (stratified on ``data["class"]``) into a
    train and a test part, two length features and up to ``n_feat`` TF-IDF
    columns are added, and -- unless ``df_return`` is set -- a LightGBM model
    is trained on the first fold of a 3-fold split of the train part.

    The function works on a copy of ``data``; the caller's frame is never
    modified, so repeated calls on the same frame do not stack preprocessing.

    Relies on names defined at notebook level: ``gc``, ``np``, ``lgb``,
    ``tqdm``, ``TfidfVectorizer``, ``RepeatedKFold`` and the fitted label
    encoder ``le``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``"documents"`` text column and a ``"class"`` column.
    param : dict
        Parameters forwarded to ``lgb.train``. Not used when ``df_return`` is True.
    df_return : bool
        If True, stop after feature engineering and return the matrices.
    n_feat : int
        ``max_features`` for the TF-IDF vectorizer.
    punct, stop, stem, lem, stem_lem, spell_before, spell_after : bool
        Switches for the individual text-cleaning steps.

    Returns
    -------
    (X, y, X_test) when ``df_return`` is True, otherwise a DataFrame with
    per-class precision / recall / F1 / support on the validation fold plus an
    ``"Avg/Total"`` row.

    Side effects
    ------------
    Prints progress, sets ``pd.options.display.max_columns = None`` and, when
    training, writes ``submission.csv`` with the test-part predictions.
    """
    import re
    import string

    import pandas as pd
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from nltk.stem import WordNetLemmatizer
    from sklearn.metrics import classification_report
    from sklearn.metrics import precision_recall_fscore_support
    from sklearn.model_selection import train_test_split

    if spell_before or spell_after:
        # Imported lazily so that autocorrect is only required when spell
        # correction is actually requested.
        from autocorrect import spell

    # re.escape is required: without it the "\]" inside string.punctuation is
    # read as an escaped "]" and the backslash itself is never removed.
    punctuation_re = re.compile("[" + re.escape(string.punctuation) + "]")

    def removePunctuation(x):
        # Lowercasing all words
        x = x.lower()
        # Removing non ASCII chars
        x = re.sub(r'[^\x00-\x7f]', r' ', x)
        # Replacing every punctuation character with a space
        return punctuation_re.sub(" ", x)

    stops = set(stopwords.words("english"))

    def removeStopwords(x):
        # Removing all the stopwords
        return " ".join(word for word in x.split() if word not in stops)

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    def _tokens(sentence, min_len):
        # Split on non-word characters and keep tokens of at least `min_len` chars.
        return [tok for tok in re.split(r'\W', sentence) if len(tok) >= min_len]

    def Stemmer(sentence):
        # Unlike the other steps, stemming keeps single-character tokens.
        return ' '.join(stemmer.stem(tok.lower()) for tok in _tokens(sentence, 1))

    def Lemma(sentence):
        return ' '.join(lemmatizer.lemmatize(tok.lower()) for tok in _tokens(sentence, 2))

    def StemAndLem(sentence):
        # Lemmatise first, then stem the lemma.
        return ' '.join(stemmer.stem(lemmatizer.lemmatize(tok.lower())) for tok in _tokens(sentence, 2))

    def Spelling(sentence):
        return ' '.join(spell(tok.lower()) for tok in _tokens(sentence, 2))

    # Work on a copy so the caller's DataFrame is left untouched.
    data = data.copy()

    pipeline = [
        (spell_before, Spelling),
        (punct, removePunctuation),
        (stop, removeStopwords),
        (stem, Stemmer),
        (lem, Lemma),
        (stem_lem, StemAndLem),
        (spell_after, Spelling),
    ]
    for enabled, step in pipeline:
        if enabled:
            data["documents"] = data["documents"].map(step)

    train, test, y_train, y_test = train_test_split(data.drop(["class"], axis=1), data["class"], test_size=0.4, random_state=0, stratify=data["class"])
    # Explicit copies: columns are added below and must not write into views of `data`.
    train = train.copy()
    test = test.copy()

    print(train.head())
    print(train.shape, test.shape)

    # Extract features
    def extract_features(frame):
        frame['documents_len'] = frame['documents'].apply(lambda x: len(str(x)))
        frame['documents_wc'] = frame['documents'].apply(lambda x: len(str(x).split(' ')))

    extract_features(train)
    extract_features(test)

    # NOTE(review): the vectorizer is fitted on train AND test text, so the
    # vocabulary / idf weights see the held-out documents. Kept as in the
    # original; confirm this is intended.
    df_all = pd.concat([train, test], axis=0)
    gc.collect()

    # Preprocess text
    print('Preprocessing text...')
    cols = [
        'documents',
    ]
    n_features = [
        n_feat,
    ]

    for c_i, c in tqdm(enumerate(cols)):
        tfidf = TfidfVectorizer(
            max_features=n_features[c_i],
            norm='l2',
            )
        tfidf.fit(df_all[c])
        tfidf_train = np.array(tfidf.transform(train[c]).toarray(), dtype=np.float16)
        tfidf_test = np.array(tfidf.transform(test[c]).toarray(), dtype=np.float16)

        # The vocabulary can be smaller than max_features, so iterate over the
        # columns that actually exist instead of assuming exactly n_feat.
        for i in range(tfidf_train.shape[1]):
            train[c + '_tfidf_' + str(i)] = tfidf_train[:, i]
            test[c + '_tfidf_' + str(i)] = tfidf_test[:, i]

        del tfidf, tfidf_train, tfidf_test
        gc.collect()

    print('Done.')
    del df_all
    gc.collect()

    # Prepare data: drop the raw text / bookkeeping columns if present.
    cols_to_drop = [
        'class',
        'documents',
        'is_test',
        'index'
    ]
    X = train.drop(cols_to_drop, axis=1, errors='ignore').reset_index(drop=True)
    y = y_train.reset_index(drop=True)
    X_test = test.drop(cols_to_drop, axis=1, errors='ignore').reset_index(drop=True)

    pd.options.display.max_columns = None

    if df_return:
        return X, y, X_test

    print(len(y.unique()), len(y_test.unique()))

    id_test = test.index.values
    feature_names = list(X.columns)
    print(X.shape, X_test.shape)

    gc.collect()

    # Build the model
    cnt = 0
    p_buf = None
    n_splits = 3
    n_repeats = 1
    kf = RepeatedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats,
        random_state=0)

    for train_index, valid_index in kf.split(X):
        print('Fold {}/{}'.format(cnt + 1, n_splits))

        X_valid = X.loc[valid_index]
        y_valid = y.loc[valid_index]

        lgb_train = lgb.Dataset(
            X.loc[train_index],
            y.loc[train_index],
            feature_name=feature_names,
            )
        lgb_train.raw_data = None

        lgb_valid = lgb.Dataset(
            X_valid,
            y_valid,
            )
        lgb_valid.raw_data = None

        model = lgb.train(
            param,
            lgb_train,
            num_boost_round=10000,
            valid_sets=[lgb_train, lgb_valid],
            early_stopping_rounds=100,
            verbose_eval=100,
        )

        if cnt == 0:
            # Show up to 60 features with non-zero importance, most important first.
            ranked = sorted(zip(model.feature_name(), model.feature_importance()), key=lambda x: x[1])[::-1]
            ranked = [pair for pair in ranked if pair[1] > 0]
            print('Important features:')
            for pair in ranked[:60]:
                print(pair)
            del ranked

        y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
        pred_class = np.argmax(y_pred, axis=1)

        # Classes present in this validation fold. Passing them as `labels`
        # keeps the metric rows aligned with the names even when the model
        # predicts a class that does not occur in the fold.
        fold_classes = y_valid.sort_values().unique()
        fold_names = list(le.inverse_transform(fold_classes))

        cm = classification_report(y_valid, pred_class, labels=fold_classes, target_names=fold_names)
        print(cm)

        report = pd.DataFrame(list(precision_recall_fscore_support(y_valid, pred_class, labels=fold_classes)),
                    index=['Precision', 'Recall', 'F1-score', 'Support']).T

        # Now add the 'Avg/Total' row
        report.loc['Avg/Total', :] = precision_recall_fscore_support(y_valid, pred_class, labels=fold_classes,
            average='weighted')
        report.loc['Avg/Total', 'Support'] = report['Support'].sum()
        report.index = fold_names + ["Avg/Total"]

        # Accumulate the test-part class probabilities over the folds that were run.
        p_test = np.array(model.predict(X_test, num_iteration=model.best_iteration), dtype=np.float16)
        p_buf = p_test if p_buf is None else p_buf + p_test

        cnt += 1

        del model, lgb_train, lgb_valid, p_test
        gc.collect()

        if cnt > 0: # Comment this to run several folds
            break

    preds = np.argmax(p_buf / cnt, axis=1)

    # Prepare submission
    subm = pd.DataFrame()
    subm['id'] = id_test
    subm['label'] = preds
    subm.to_csv('submission.csv', index=False)

    # Report of the last fold that was run.
    return report


In [41]:
# Quick look at the frozen distribution object used in the search space below.
# The import is repeated here because this cell sits above the cell that
# originally imported `sp_randint`, which made it fail on a fresh kernel.
from scipy.stats import randint as sp_randint

sp_randint(100, 500)

<scipy.stats._distn_infrastructure.rv_frozen at 0x10d64bcf8>

In [76]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


# Search space for RandomizedSearchCV. Distribution objects are sampled, lists
# are drawn from uniformly.
# NOTE(review): according to the LightGBM parameter documentation,
# `feature_fraction`/`colsample_bytree`, `bagging_fraction`/`subsample` and
# `min_data`/`min_child_samples` are aliases of the same setting, so each pair
# is specified twice here -- confirm which value LightGBM actually uses.
param_test = {
    ### This top list is here for varying parameters
    'num_leaves': sp_randint(6, 50),
    'min_child_samples': sp_randint(100, 500),
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    'subsample': sp_uniform(loc=0.2, scale=0.8),
    'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
    'feature_fraction': [0.75, 0.85, 0.95],
    'bagging_fraction': [0.75, 0.85, 0.95],

    ### The following variables are unchanging
    'boosting_type': ['gbdt'],
    'num_class': [len(df[y].unique())],
    'objective': ['multiclass'],
    'metric': ['multi_error'],
    'learning_rate': [0.025],
    'min_data': [1],
}

# This parameter defines the number of HP points to be tested
n_HP_points_to_test = 10

# n_estimators is the number of trees built per fit. No early stopping is
# configured for the search (`gs.fit` is called without an eval_set), so every
# candidate trains the full 1000 trees.
clf = lgb.LGBMClassifier(random_state=314, n_jobs=6, n_estimators=1000)
gs = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_test,
    n_iter=n_HP_points_to_test,
    cv=3,
    random_state=314,
    verbose=True)

# Only the engineered feature matrices are needed here (df_return=True), in
# which case the LightGBM `param` argument is never read -- so None is passed
# instead of `parama`, which is not defined yet at this point of the notebook.
# A copy of df is passed so the shared frame is not altered by preprocessing.
X_train, y_train, X_test = test_preprocessing(df.copy(), None, n_feat=240, df_return=True, stem_lem=True)

                                             documents
164  govern payment institut and regist payment ser...
882                        27 deal by employ of dealer
846  direct 2006 43 ec of the european parliament a...
824                                          murabahah
481  royal decr regul on electron payment servic of...
(739, 1) (494, 1)


0it [00:00, ?it/s]

Preprocessing text...


1it [00:00,  3.14it/s]

Done.





In [70]:
# Preview the first rows of the engineered training matrix returned by
# test_preprocessing (length, word-count and TF-IDF columns).
X_train.head()

Unnamed: 0,documents_len,documents_wc,documents_tfidf_0,documents_tfidf_1,documents_tfidf_2,documents_tfidf_3,documents_tfidf_4,documents_tfidf_5,documents_tfidf_6,documents_tfidf_7,documents_tfidf_8,documents_tfidf_9,documents_tfidf_10,documents_tfidf_11,documents_tfidf_12,documents_tfidf_13,documents_tfidf_14,documents_tfidf_15,documents_tfidf_16,documents_tfidf_17,documents_tfidf_18,documents_tfidf_19,documents_tfidf_20,documents_tfidf_21,documents_tfidf_22,documents_tfidf_23,documents_tfidf_24,documents_tfidf_25,documents_tfidf_26,documents_tfidf_27,documents_tfidf_28,documents_tfidf_29,documents_tfidf_30,documents_tfidf_31,documents_tfidf_32,documents_tfidf_33,documents_tfidf_34,documents_tfidf_35,documents_tfidf_36,documents_tfidf_37,documents_tfidf_38,documents_tfidf_39,documents_tfidf_40,documents_tfidf_41,documents_tfidf_42,documents_tfidf_43,documents_tfidf_44,documents_tfidf_45,documents_tfidf_46,documents_tfidf_47,documents_tfidf_48,documents_tfidf_49,documents_tfidf_50,documents_tfidf_51,documents_tfidf_52,documents_tfidf_53,documents_tfidf_54,documents_tfidf_55,documents_tfidf_56,documents_tfidf_57,documents_tfidf_58,documents_tfidf_59,documents_tfidf_60,documents_tfidf_61,documents_tfidf_62,documents_tfidf_63,documents_tfidf_64,documents_tfidf_65,documents_tfidf_66,documents_tfidf_67,documents_tfidf_68,documents_tfidf_69,documents_tfidf_70,documents_tfidf_71,documents_tfidf_72,documents_tfidf_73,documents_tfidf_74,documents_tfidf_75,documents_tfidf_76,documents_tfidf_77,documents_tfidf_78,documents_tfidf_79,documents_tfidf_80,documents_tfidf_81,documents_tfidf_82,documents_tfidf_83,documents_tfidf_84,documents_tfidf_85,documents_tfidf_86,documents_tfidf_87,documents_tfidf_88,documents_tfidf_89,documents_tfidf_90,documents_tfidf_91,documents_tfidf_92,documents_tfidf_93,documents_tfidf_94,documents_tfidf_95,documents_tfidf_96,documents_tfidf_97,documents_tfidf_98,documents_tfidf_99,documents_tfidf_100,documents_tfidf_101,documents_tfidf_102,documents_tf
idf_103,documents_tfidf_104,documents_tfidf_105,documents_tfidf_106,documents_tfidf_107,documents_tfidf_108,documents_tfidf_109,documents_tfidf_110,documents_tfidf_111,documents_tfidf_112,documents_tfidf_113,documents_tfidf_114,documents_tfidf_115,documents_tfidf_116,documents_tfidf_117,documents_tfidf_118,documents_tfidf_119,documents_tfidf_120,documents_tfidf_121,documents_tfidf_122,documents_tfidf_123,documents_tfidf_124,documents_tfidf_125,documents_tfidf_126,documents_tfidf_127,documents_tfidf_128,documents_tfidf_129,documents_tfidf_130,documents_tfidf_131,documents_tfidf_132,documents_tfidf_133,documents_tfidf_134,documents_tfidf_135,documents_tfidf_136,documents_tfidf_137,documents_tfidf_138,documents_tfidf_139,documents_tfidf_140,documents_tfidf_141,documents_tfidf_142,documents_tfidf_143,documents_tfidf_144,documents_tfidf_145,documents_tfidf_146,documents_tfidf_147,documents_tfidf_148,documents_tfidf_149,documents_tfidf_150,documents_tfidf_151,documents_tfidf_152,documents_tfidf_153,documents_tfidf_154,documents_tfidf_155,documents_tfidf_156,documents_tfidf_157,documents_tfidf_158,documents_tfidf_159,documents_tfidf_160,documents_tfidf_161,documents_tfidf_162,documents_tfidf_163,documents_tfidf_164,documents_tfidf_165,documents_tfidf_166,documents_tfidf_167,documents_tfidf_168,documents_tfidf_169,documents_tfidf_170,documents_tfidf_171,documents_tfidf_172,documents_tfidf_173,documents_tfidf_174,documents_tfidf_175,documents_tfidf_176,documents_tfidf_177,documents_tfidf_178,documents_tfidf_179,documents_tfidf_180,documents_tfidf_181,documents_tfidf_182,documents_tfidf_183,documents_tfidf_184,documents_tfidf_185,documents_tfidf_186,documents_tfidf_187,documents_tfidf_188,documents_tfidf_189,documents_tfidf_190,documents_tfidf_191,documents_tfidf_192,documents_tfidf_193,documents_tfidf_194,documents_tfidf_195,documents_tfidf_196,documents_tfidf_197,documents_tfidf_198,documents_tfidf_199,documents_tfidf_200,documents_tfidf_201,documents_tfidf_202,documents_tf
idf_203,documents_tfidf_204,documents_tfidf_205,documents_tfidf_206,documents_tfidf_207,documents_tfidf_208,documents_tfidf_209,documents_tfidf_210,documents_tfidf_211,documents_tfidf_212,documents_tfidf_213,documents_tfidf_214,documents_tfidf_215,documents_tfidf_216,documents_tfidf_217,documents_tfidf_218,documents_tfidf_219,documents_tfidf_220,documents_tfidf_221,documents_tfidf_222,documents_tfidf_223,documents_tfidf_224,documents_tfidf_225,documents_tfidf_226,documents_tfidf_227,documents_tfidf_228,documents_tfidf_229,documents_tfidf_230,documents_tfidf_231,documents_tfidf_232,documents_tfidf_233,documents_tfidf_234,documents_tfidf_235,documents_tfidf_236,documents_tfidf_237,documents_tfidf_238,documents_tfidf_239
0,56,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.330566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.259277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.606445,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.388428,0.0,0.0,0.0,0.0,0.0,0.0,0.416748,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,27,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.600586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.757812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.254883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,220,42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.296387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.279053,0.0,0.0,0.0,0.0,0.107361,0.0,0.249023,0.139526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.453613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.343506,0.0,0.0,0.0,0.0,0.174683,0.535645,0.0,0.0,0.0,0.160645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222534,0.0,0.0,0.059906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,75,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.536621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.288574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.328857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183228,0.0,0.0,0.197144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.384766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.271973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.410645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.240601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
# Run the randomized hyper-parameter search on the training features
# (n_iter candidates x cv folds). No eval_set / early-stopping arguments are
# passed to fit here.
gs.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  3.1min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=1000,
        n_jobs=6, num_leaves=31, objective=None, random_state=314,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'num_leaves': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1049727b8>, 'min_child_samples': <scipy.stats._distn_infrastructure.rv_frozen object at 0x104972358>, 'min_child_weight': [1e-05, 0.001, 0.01, 0.1, 1, 10.0, 100.0, 1000.0, 10000.0], 'subsample': <scipy.stats...., 'objective': ['multiclass'], 'metric': ['multi_error'], 'learning_rate': [0.025], 'min_data': [1]},
          pre_dispatch='2*n_jobs', random_state=314, refit=Tru

In [78]:
# Report the best cross-validated score of the search together with the
# hyper-parameters that achieved it.
best_score, best_params = gs.best_score_, gs.best_params_
print('Best score reached: {} with params: {} '.format(best_score, best_params))

Best score reached: 0.7401894451962111 with params: {'bagging_fraction': 0.75, 'boosting_type': 'gbdt', 'colsample_bytree': 0.9827253740513708, 'feature_fraction': 0.95, 'learning_rate': 0.025, 'metric': 'multi_error', 'min_child_samples': 142, 'min_child_weight': 1e-05, 'min_data': 1, 'num_class': 6, 'num_leaves': 13, 'objective': 'multiclass', 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 0.9642224534031838} 


In [80]:
# Evaluation reports keyed by "<preprocessing>_<n_feat>".
report_dict = {}
## spell takes very long, be cautious when running

# The former `parama = params` assignment was removed: `params` is not defined
# anywhere in this notebook (NameError on a fresh kernel) and the value was
# not used in this cell.
# A copy of df is passed so that repeated runs always start from the same,
# un-preprocessed text.
for i in [240]:
    report_dict["stem_lem_" + str(i)] = test_preprocessing(df.copy(), gs.best_params_, n_feat=i, stem_lem=True)

### See less detailed output in the next code block.

                                             documents
164  govern payment institut and regist payment ser...
882                        27 deal by employ of dealer
846  direct 2006 43 ec of the european parliament a...
824                                          murabahah
481  royal decr regul on electron payment servic of...
(739, 1) (494, 1)


0it [00:00, ?it/s]

Preprocessing text...


1it [00:00,  2.66it/s]


Done.
6 6
(739, 242) (494, 242)
Fold 1/3
Training until validation scores don't improve for 100 rounds.
[100]	training's multi_error: 0.130081	valid_1's multi_error: 0.279352
Early stopping, best iteration is:
[14]	training's multi_error: 0.164634	valid_1's multi_error: 0.263158
Important features:
('documents_len', 50)
('documents_tfidf_119', 44)
('documents_tfidf_167', 33)
('documents_tfidf_25', 32)
('documents_wc', 30)
('documents_tfidf_43', 29)
('documents_tfidf_204', 28)
('documents_tfidf_65', 27)
('documents_tfidf_117', 26)
('documents_tfidf_58', 25)
('documents_tfidf_159', 24)
('documents_tfidf_154', 24)
('documents_tfidf_227', 23)
('documents_tfidf_104', 22)
('documents_tfidf_202', 21)
('documents_tfidf_51', 19)
('documents_tfidf_32', 19)
('documents_tfidf_223', 16)
('documents_tfidf_143', 16)
('documents_tfidf_98', 16)
('documents_tfidf_79', 15)
('documents_tfidf_218', 14)
('documents_tfidf_183', 14)
('documents_tfidf_72', 14)
('documents_tfidf_66', 14)
('documents_tfidf_21', 

  if diff:
  if diff:


In [81]:
### For this dataset a simple stem and lem performs the best.

# Show only the last row ("Avg/Total") of every stored report.
for key, report in report_dict.items():
    print(key, ":")
    print("_________________________")
    summary_row = report.iloc[-1, :]
    print(summary_row)
    print("=========================")

stem_lem_240 :
_________________________
Precision      0.772534
Recall         0.736842
F1-score       0.727613
Support      247.000000
Name: Avg/Total, dtype: float64


In [82]:
## Full Analysis
# Show every stored report in full (per-class rows plus the "Avg/Total" row).
for key, report in report_dict.items():
    print(key, ":")
    print("_________________________")
    print(report)
    print("=========================")

stem_lem_240 :
_________________________
            Precision    Recall  F1-score  Support
banking      0.659420  0.910000  0.764706    100.0
companies    0.875000  0.388889  0.538462     36.0
funds        0.739130  0.680000  0.708333     25.0
insurance    1.000000  0.727273  0.842105     44.0
payment      0.727273  0.500000  0.592593     16.0
securities   0.740741  0.769231  0.754717     26.0
Avg/Total    0.772534  0.736842  0.727613    247.0
