In [98]:
import pandas as pd
import warnings
warnings.filterwarnings(action = 'ignore')
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re

In [99]:
# 파일 불러오기
train = pd.read_csv('/content/drive/My Drive/Semi-project_01/open/train.csv', encoding = 'utf-8')
test = pd.read_csv('/content/drive/My Drive/Semi-project_01/open/test_x.csv', encoding = 'utf-8')
sample_submission = pd.read_csv('/content/drive/My Drive/Semi-project_01/open/sample_submission.csv', encoding = 'utf-8')

In [100]:
# 부호를 제거해주는 함수
def alpha_num(text) :
  return re.sub(r'[^A-Za-z0-9 ]', ' ', text)

train['text'] = train['text'].apply(alpha_num)

In [101]:
# 이상치 제거
train = train[train.text != "     "]
train = train.reset_index(drop=True)
train

Unnamed: 0,index,text,author
0,0,He was almost choking There was so much so m...,3
1,1,Your sister asked for it I suppose,2
2,2,She was engaged one day as she walked in per...,1
3,3,The captain was in the porch keeping himself ...,4
4,4,Have mercy gentlemen odin flung up his han...,3
...,...,...,...
54874,54874,Is that you Mr Smith odin whispered I h...,2
54875,54875,I told my plan to the captain and between us ...,4
54876,54876,Your sincere well wisher friend and sister...,1
54877,54877,Then you wanted me to lend you money,3


In [102]:
import nltk
nltk.download('stopwords')
stopwords=nltk.corpus.stopwords.words('english')
stopwords.append('odin')
stopwords.extend(['could', 'ought', 'would'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [103]:
# 불용어 제거해주는 함수
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)

# 불용어
# stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
#              "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
#              "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
#              "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
#              "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
#              "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
#              "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
#              "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
#              "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
#              "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
#              "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [104]:
#전처리 적용
train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()
train['text'] = train['text'].apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].apply(alpha_num).apply(remove_stopwords)

In [105]:
# logloss 함수
def multiclass_logloss(actual, predicted, eps=1e-15):

    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [106]:
# from sklearn.feature_extraction.text import CountVectorizer

# vec=CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,3),
#                     stop_words='english')

# vec.fit(list(train['text'])+list(test['text']))

# xtrain_ctv=vec.transform(train['text'])
# xtest_ctv=vec.transform(test['text'])

In [107]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [108]:
scorer=make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True)

In [109]:
print("train['text'] max 길이: ",max(len(i) for i in train['text']))
print("test['text'] max 길이: ",max(len(i) for i in test['text']))

print("train['text'] min 길이: ",min(len(i) for i in train['text']))
print("test['text'] min 길이: ",min(len(i) for i in test['text']))

train['text'] max 길이:  1325
test['text'] max 길이:  1380
train['text'] min 길이:  0
test['text'] min 길이:  33


In [110]:
# TF-IDF 모델 생성
tfv=TfidfVectorizer(min_df=3, smooth_idf=1, sublinear_tf=1, use_idf=1)
tfv.fit(list(train['text'])+list(test['text']))
xtrain_tfv=tfv.transform(train['text'])
xtest_tfv=tfv.transform(test['text'])

In [111]:
y=train['author']

In [112]:
X_train, X_test, y_train, y_test = train_test_split(xtrain_tfv, y, 
                                                    test_size=0.3, random_state=2020)

In [113]:
model=MultinomialNB()
model.fit(xtrain_tfv, y)
predictions=model.predict_proba(X_test)

print('TF-IDF, logloss: %0.3f' % multiclass_logloss(y_test, predictions))

TF-IDF, logloss: 0.802


In [114]:
nb_model = MultinomialNB()

# Create the pipeline 
clf = Pipeline([('nb', nb_model)])

# parameter grid
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=scorer,
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(xtrain_tfv, y)
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    1.2s


Best score: -0.782
Best parameters set:
	nb__alpha: 0.01


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    1.4s finished


In [115]:
model=MultinomialNB(alpha=0.01)
model.fit(xtrain_tfv, y)
predictions=model.predict_proba(X_test)

print('TF-IDF & gridsearch, logloss: %0.3f' % multiclass_logloss(y_test, predictions))

TF-IDF & gridsearch, logloss: 0.526


In [116]:
pred=model.predict_proba(xtest_tfv)

In [117]:
pred

array([[1.75074488e-02, 2.34128998e-01, 1.74210783e-01, 5.67755703e-01,
        6.39706728e-03],
       [5.39191211e-02, 5.11502737e-01, 1.80526012e-02, 3.82251443e-01,
        3.42740973e-02],
       [9.60953655e-01, 7.76227287e-04, 1.51865886e-03, 1.45451134e-02,
        2.22063459e-02],
       ...,
       [1.03875190e-02, 9.85619614e-01, 7.54454210e-04, 2.01267230e-03,
        1.22574043e-03],
       [6.13500510e-03, 9.86412119e-01, 2.48742369e-03, 4.88446543e-03,
        8.09865478e-05],
       [9.22965196e-01, 3.08281555e-04, 5.73134271e-03, 2.01404286e-02,
        5.08547509e-02]])

In [118]:
# submission
sample_submission[['0', '1', '2', '3', '4']] = pred
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.017507,0.234129,0.174211,0.567756,0.006397
1,1,0.053919,0.511503,0.018053,0.382251,0.034274
2,2,0.960954,0.000776,0.001519,0.014545,0.022206
3,3,0.003937,0.000001,0.987568,0.004023,0.004470
4,4,0.637587,0.006554,0.134021,0.193042,0.028795
...,...,...,...,...,...,...
19612,19612,0.006740,0.990554,0.000144,0.001077,0.001484
19613,19613,0.608450,0.017057,0.192923,0.024823,0.156746
19614,19614,0.010388,0.985620,0.000754,0.002013,0.001226
19615,19615,0.006135,0.986412,0.002487,0.004884,0.000081


In [119]:
sample_submission.to_csv('/content/drive/My Drive/Semi-project_01/submission_naiveBayes02+stopwords.csv', index = False, encoding = 'utf-8')