In [1]:
import pandas as pd
pd.set_option('display.max_row', 500)
pd.set_option('display.max_columns', 100)
import numpy as np
from IPython.display import display
from tqdm import tqdm
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.preprocessing import sequence, text
from xgboost import XGBClassifier 
# import nltk
# nltk.download('stopwords')
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
stop_words = stopwords.words('english')


In [2]:
df = pd.read_csv('./spooky_data/train.csv')
print('Length of Data : ', len(df))
display(df.head())
display(df.author.value_counts().reset_index())

Length of Data :  19579


Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


Unnamed: 0,index,author
0,EAP,7900
1,MWS,6044
2,HPL,5635


### ***TEXT Pattern***
- 파생변수 생성

In [3]:
def pattern_extract(df, stopwords=stop_words):
    
    df_re = df.copy()
    
    ## Number of words in the text : 단어 수 ##
    df_re["num_words"] = df_re["text"].apply(lambda x: len(str(x).split()))

    ## Number of unique words in the text : 유니크 단어 수 ##
    df_re["num_unique_words"] = df_re["text"].apply(lambda x: len(set(str(x).split())))

    ## Number of characters in the text : 문자 수 ##
    df_re["num_chars"] = df_re["text"].apply(lambda x: len(str(x)))

    ## Number of stopwords in the text : 불용어 수 ##
    df_re["num_stopwords"] = df_re["text"].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords]))

    ## Number of punctuations in the text : 특수문자 수 ##
    df_re["num_punctuations"] =df_re['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

    ## Number of title case words in the text : 대문자 수 ##
    df_re["num_words_upper"] = df_re["text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

    ## Number of title case words in the text : 단어의 첫글자가 대문자인 단어 수 ##
    df_re["num_words_title"] = df_re["text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

    ## Average length of the words in the text : 평균 문자 수 ##
    df_re["avg_word_len"] = df_re["text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    
    ## Average length of the words(except stop words) in the text : 불용어가 아닌 단어의 평균 문자 수 ##
    df_re['avg_word_len_not_stopword'] = df_re['text'].apply(lambda x: np.mean([len(t) for t in x.split(' ') if t not in stopwords]))
    
    ## Number of comma
    df_re['num_commas'] = df_re['text'].apply(lambda x: x.count(','))
    
    return df_re

df_re = pattern_extract(df)
display(df_re.head(5))

Unnamed: 0,id,text,author,num_words,num_unique_words,num_chars,num_stopwords,num_punctuations,num_words_upper,num_words_title,avg_word_len,avg_word_len_not_stopword,num_commas
0,id26305,"This process, however, afforded me no means of...",EAP,41,35,231,19,7,2,3,4.658537,6.0,4
1,id17569,It never once occurred to me that the fumbling...,HPL,14,14,71,8,1,0,1,4.142857,5.714286,0
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,36,32,200,16,5,0,1,4.583333,5.952381,4
3,id27763,How lovely is spring As we looked from Windsor...,MWS,34,32,206,13,4,0,4,5.088235,6.304348,3
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,27,25,174,11,4,0,2,5.481481,7.4375,2


In [23]:
df_re.groupby('author').agg(['mean','median','std'])

Unnamed: 0_level_0,num_words,num_words,num_words,num_unique_words,num_unique_words,num_unique_words,num_chars,num_chars,num_chars,num_stopwords,num_stopwords,num_stopwords,num_punctuations,num_punctuations,num_punctuations,num_words_upper,num_words_upper,num_words_upper,num_words_title,num_words_title,num_words_title,avg_word_len,avg_word_len,avg_word_len,avg_word_len_not_stopword,avg_word_len_not_stopword,avg_word_len_not_stopword,num_commas,num_commas,num_commas
Unnamed: 0_level_1,mean,median,std,mean,median,std,mean,median,std,mean,median,std,mean,median,std,mean,median,std,mean,median,std,mean,median,std,mean,median,std,mean,median,std
author,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2
EAP,25.442405,21.0,18.567706,21.894937,19.0,13.727397,142.225949,115.0,105.751334,12.626456,10.0,9.546129,4.096329,3.0,3.573788,0.553291,0.0,0.892966,2.102405,1.0,2.052241,4.644952,4.6,0.63134,6.033637,6.036376,0.944184,2.227089,2.0,2.445522
HPL,27.799645,26.0,14.123252,24.437977,23.0,11.053739,155.843478,142.0,82.020647,12.970186,12.0,6.853415,3.206921,3.0,2.108637,0.500266,0.0,0.852313,2.334694,2.0,2.041579,4.625193,4.6,0.554917,5.889127,5.909091,0.873821,1.522804,1.0,1.345381
MWS,27.417273,23.0,23.13444,23.544672,21.0,14.925835,151.659828,130.0,126.305008,13.742224,12.0,12.080172,3.833719,3.0,2.840625,0.751489,0.0,1.203636,2.124255,2.0,1.759572,4.598182,4.560791,0.561558,5.947698,5.947368,0.860704,1.992886,2.0,2.100672


In [4]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota


def XGBoostModel(xtrain, xtest, ytrain, ytest, opt=False):
    if opt: 
        xgb = XGBClassifier()
        param_grid = {
                "n_estimators":[100,200,300],
                "max_depth":[4, 10, 20]
            }

        xgb_tuned = GridSearchCV(estimator=xgb,
                                param_grid=param_grid,
                                cv=3, 
                                n_jobs=-1,
                                verbose=0)

        xgb_tuned.fit(X=xtrain,
                    y=ytrain)
        
        print('XGB MODEL Best score : {:.2f}%'.format(xgb_tuned.best_score_*100))

        xgb_model = xgb_tuned.best_estimator_
        
    else:
        xgb_model = XGBClassifier()
        xgb_model.fit(X=xtrain,
                y=ytrain)
        
    ypred = xgb_model.predict(xtest)
    ypred_p = xgb_model.predict_proba(xtest)
    
    print('ACCURACY SCORE : ',accuracy_score(ytest.values.reshape(-1), ypred))
    print ("LOG LOSS: %0.3f " % multiclass_logloss(ytest.values.reshape(-1), ypred_p))
    
    return xgb_model


# Sample Modeling
df_re['target'] = LabelEncoder().fit_transform(df_re['author'])
x_train, x_test, y_train, y_test = train_test_split(df_re.drop(['id','author','target'], axis=1), df_re[['target']], test_size=0.3, random_state=123, shuffle=True)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

xgb = XGBoostModel(x_train.drop('text', axis=1), x_test.drop('text', axis=1), y_train, y_test)

(13705, 11) (13705, 1) (5874, 11) (5874, 1)
ACCURACY SCORE :  0.5187265917602997
LOG LOSS: 0.979 


-  ***A Deep Dive Into Sklearn Pipelines***
> https://www.kaggle.com/code/baghern/a-deep-dive-into-sklearn-pipelines

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Transformer to select a single column from the data frame to perform additional transformations on
class TextSelector(BaseEstimator, TransformerMixin):
    """
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    

text = Pipeline([
                ('selector', TextSelector(key='text')),
                ('tfidf', TfidfVectorizer(stop_words='english'))
            ])

text.fit_transform(X_train)

### ***TEXT Preprocessing***

> 텍스트 전처리
1. Tokenization : word tokenization
2. Cleaning : remove noise (stopword)
3. Normalization : data integration (stemming, lemmatization)

### ***TEXT Conversion***
---
1. TF-IDF
2. SVD
3. Word2Vec
---


1. TF-IDF
    1) TF-IDF TEXT VECTOR
    2) TF-IDF TEXT VECTOR + TEXT INFO

In [19]:
# 1. TF-IDF Parameter
# 1-1 TF-IDF Modeling
import re
def text_processing(tab):
    for i in range(tab.shape[0]):
        # remove special VW symbols
        text = tab["text"].loc[i].strip().replace('|', '').replace(':', '').lower() 
        words = re.findall("\w{3,}", text) 
        new_text = " ".join(words) 
        tab.loc[i, 'new_text'] = new_text
    return tab

def tfidf(min_df=3, max_features=None, ngram=3):
    tfv = TfidfVectorizer(min_df=min_df, # 최소 빈도값 설정 : 3개 이상의 문서에 출현한 단어만 사용
                        max_features=max_features, # unique 단어 수 제한
                        strip_accents='unicode', # 문자 정규화 {'ascii', 'unicode', None}
                        analyzer='word', # {'word' : 학습 단위 = 단어, 'char' : 학습 단위 = 글자}
                        token_pattern=r'\w{1,}',
                        ngram_range=(1, ngram), # 단어 묶음 설정 -> if (1, 3) : 'go', 'go back', 'go back to'
                        use_idf=True,
                        smooth_idf=True,
                        sublinear_tf=True, # TF값의 smoothing 여부 -> if smoothin True : TF -> 1 + ln(TF)
                        stop_words = 'english') # 불용어 제거 {'english', list : 사용자 설정, None}
    return tfv

# df_re = text_processing(df_re)
x_train, x_test, y_train, y_test = train_test_split(df_re.drop(['id','author','target','text'], axis=1), df_re[['target']], test_size=0.3, random_state=123, shuffle=True)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

tfv = tfidf()
tfv.fit(df_re.new_text.values)

xtrain_tfv =  tfv.transform(x_train.new_text.values) 
xvalid_tfv = tfv.transform(x_test.new_text.values)
print(xtrain_tfv.toarray().shape, xvalid_tfv.toarray().shape)

xgb = XGBoostModel(xtrain_tfv.tocsc(), xvalid_tfv.tocsc(), y_train, y_test)

(13705, 11) (13705, 1) (5874, 11) (5874, 1)
(13705, 14604) (5874, 14604)
ACCURACY SCORE :  0.672454885938032
LOG LOSS: 0.774 


In [6]:
# # 1-2 TF-IDF + Feature Modeling
# tfv = tfidf(min_df=5, max_features=100, ngram=2)
# tfv.fit(df_re.text.values)

# xtrain_tfv =  tfv.transform(x_train.text.values) 
# xvalid_tfv = tfv.transform(x_test.text.values)
# print(xtrain_tfv.toarray().shape, xvalid_tfv.toarray().shape)

# x_train_re = pd.concat([pd.DataFrame(xtrain_tfv.toarray()), x_train.drop('text', axis=1).reset_index(drop=True)], axis=1)
# x_test_re = pd.concat([pd.DataFrame(xvalid_tfv.toarray()), x_test.drop('text', axis=1).reset_index(drop=True)], axis=1)
# print(x_train_re.shape, x_test_re.shape)

# xgb = XGBoostModel(x_train_re, x_test_re, y_train, y_test)

(13705, 100) (5874, 100)
(13705, 110) (5874, 110)
ACCURACY SCORE :  0.5888661899897855
LOG LOSS: 0.881 


In [52]:
# 1-3 
from sklearn import ensemble, metrics, model_selection, naive_bayes
import xgboost as xgb
def runXGB(train_X, train_y, test_X, test_y=None, seed_val=0, child=1, colsample=0.3, verbose_eval=100):
    param = {'objective':'multi:softprob',
             'eta':0.1,
             'max_depth':3,
             'silent':1,
             'num_class':3,
             'eval_metric':'mlogloss',
             'min_child_weight':child,
             'subsample':0.8,
             'colsample_bytree':colsample,
             'seed':seed_val}

    num_rounds = 2000

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=verbose_eval)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest, ntree_limit = model.best_ntree_limit)
    # if test_X2 is not None:
    #     xgtest2 = xgb.DMatrix(test_X2)
    #     pred_test_y2 = model.predict(xgtest2, ntree_limit = model.best_ntree_limit)
    return pred_test_y, model


# test
pred_test_y, model = runXGB(x_train.drop('new_text', axis=1), y_train['target'],
                            x_test.drop('new_text', axis=1), y_test['target'], seed_val=0, child=1, colsample=0.3)


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-mlogloss:1.09257	test-mlogloss:1.09271




[100]	train-mlogloss:0.95783	test-mlogloss:0.97700
[200]	train-mlogloss:0.92573	test-mlogloss:0.96237
[300]	train-mlogloss:0.90443	test-mlogloss:0.95772
[400]	train-mlogloss:0.88823	test-mlogloss:0.95611
[500]	train-mlogloss:0.87493	test-mlogloss:0.95614
[522]	train-mlogloss:0.87209	test-mlogloss:0.95637




In [53]:
# TF-IDF
tfv = tfidf()
tfv.fit(df_re.new_text.values.tolist())
xtrain_tfv =  tfv.transform(x_train.new_text.values.tolist()) 
xtest_tfv = tfv.transform(x_test.new_text.values.tolist())

pred_test_y, model = runXGB(xtrain_tfv, y_train['target'],
                            xtest_tfv, y_test['target'], seed_val=0, child=1, colsample=0.3, verbose_eval=300)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-mlogloss:1.09270	test-mlogloss:1.09272




[300]	train-mlogloss:0.78036	test-mlogloss:0.84007
[600]	train-mlogloss:0.67268	test-mlogloss:0.76910
[900]	train-mlogloss:0.60080	test-mlogloss:0.72761
[1200]	train-mlogloss:0.54690	test-mlogloss:0.70075
[1500]	train-mlogloss:0.50372	test-mlogloss:0.68127
[1800]	train-mlogloss:0.46784	test-mlogloss:0.66751
[1999]	train-mlogloss:0.44719	test-mlogloss:0.65998




2. SVD

In [54]:
from sklearn.decomposition import TruncatedSVD
tfv = tfidf()
tfv.fit(df_re.new_text.values)

xtrain_tfv =  tfv.transform(x_train.new_text.values) 
xvalid_tfv = tfv.transform(x_test.new_text.values)
print(xtrain_tfv.toarray().shape, xvalid_tfv.toarray().shape)

svd = TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# # Scale the data obtained from SVD. Renaming variable to reuse without scaling.
# scl = StandardScaler()
# scl.fit(xtrain_svd)
# xtrain_svd_scl = scl.transform(xtrain_svd)
# xvalid_svd_scl = scl.transform(xvalid_svd)

pred_test_y, model = runXGB(xtrain_svd, y_train['target'],
                            xvalid_svd, y_test['target'], seed_val=0, child=1, colsample=0.3, verbose_eval=300)

(13705, 14604) (5874, 14604)
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-mlogloss:1.08521	test-mlogloss:1.08637




[300]	train-mlogloss:0.60832	test-mlogloss:0.77876
[600]	train-mlogloss:0.49041	test-mlogloss:0.75930
[900]	train-mlogloss:0.40461	test-mlogloss:0.75322
[935]	train-mlogloss:0.39610	test-mlogloss:0.75312




3. Word2Vec