In [13]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
import language_tool_python
## Import StratifiedKFold and cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score


In [14]:
# Load the competition files: the essays to score, the sample submission
# template, and the original training essays.
test = pd.read_csv('../input/test_essays.csv')
sub = pd.read_csv('../input/sample_submission.csv')
org_train = pd.read_csv('../input/train_essays.csv')

# Second training file; this is the frame the model is actually fitted on.
train = pd.read_csv("../input/train_v2_drcat_02.csv", sep=',')

# Drop rows whose essay text is duplicated, then renumber the rows.
# reset_index returns a new frame, so its result has to be assigned back;
# calling it without assignment leaves the gapped index in place.
train = train.drop_duplicates(subset=["text"]).reset_index(drop=True)

x_train = train["text"].values
y_train = train["label"].values



In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words sanity check: fit a default CountVectorizer on the training
# texts and report the dimensions of the document-term matrix.
count = CountVectorizer()
docs = x_train

bag = count.fit_transform(docs)
# fit_transform returns a scipy sparse matrix, so read .shape from it directly.
# Calling .toarray() first would allocate the full dense
# (n_documents x vocabulary_size) array only to look at its dimensions.
print(bag.shape)


(44868, 77326)


In [16]:
from nltk.stem.porter import PorterStemmer

# Shared Porter stemmer instance used by tokenizer_porter below.
porter = PorterStemmer()

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens."""
    tokens = text.split()
    return tokens


def tokenizer_porter(text):
    """Split ``text`` on whitespace and Porter-stem every token.

    Uses the module-level ``porter`` stemmer and returns the stems as a list,
    in the order the tokens appear in ``text``.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems


In [17]:
import nltk

# Fetch the NLTK stop-word corpus needed by stopwords.words() in the next cell.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matsuisouta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
from nltk.corpus import stopwords

# English stop-word list from NLTK.
# NOTE(review): `stop` is not referenced by any later cell visible here --
# confirm whether it was meant to be passed to the vectorizer.
stop = stopwords.words('english')


In [19]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Porter stemmer instance used by the stemming tokenizer defined in this cell.
stemmer = PorterStemmer()

# Custom tokenizer: NLTK word tokenization followed by Porter stemming.
# NOTE(review): a later cell (In [20]) defines another `my_tokenizer` that only
# splits on whitespace; that definition replaces this one before the
# vectorizer is built.
def my_tokenizer(text):
    """Tokenize ``text`` with ``word_tokenize`` and return the Porter stem of each token."""
    return [stemmer.stem(tok) for tok in word_tokenize(text)]

def custom_preprocessor(text):
    """Lower-case ``text`` and delete every period character from it."""
    return text.lower().replace(".", "")


In [20]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Custom tokenizer handed to the TF-IDF vectorizer.
# NOTE(review): this reuses the name of the stemming `my_tokenizer` defined in
# an earlier cell and replaces it; from here on tokens are plain whitespace splits.
def my_tokenizer(text):
    """Return the whitespace-separated tokens of ``text``."""
    return text.split()

# Logistic-regression classifier (L2 penalty, C=1.0, fixed seed).
lr = LogisticRegression(penalty='l2', C=1.0, random_state=0)

# TF-IDF vectorizer driven by the custom preprocessor and tokenizer.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=custom_preprocessor,
    tokenizer=my_tokenizer,
)

# Chain vectorizer and classifier so they are always fitted together.
lr_tfidf = Pipeline(steps=[('vect', tfidf), ('clf', lr)])

# Fit the whole pipeline on the training texts and labels.
lr_tfidf.fit(x_train, y_train)




In [21]:
# Stratified 5-fold splitter, shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Score the pipeline with ROC AUC on every fold.
scores = cross_val_score(
    lr_tfidf,
    x_train,
    y_train,
    cv=skf,
    scoring='roc_auc',
)

# Report the AUC of each fold, numbering folds from 1.
for fold, score in zip(range(1, len(scores) + 1), scores):
    print(f'Fold {fold}: AUC = {score:.4f}')




Fold 1: AUC = 0.9992
Fold 2: AUC = 0.9990
Fold 3: AUC = 0.9988
Fold 4: AUC = 0.9992
Fold 5: AUC = 0.9992


In [22]:
# Print the mean AUC across the cross-validation folds.
mean_auc = scores.mean()
print('Mean AUC = {:.4f}'.format(mean_auc))


Mean AUC = 0.9991


In [23]:
# Predict class probabilities for the test essays with the fitted pipeline.
test_texts = test['text']
y_test = lr_tfidf.predict_proba(test_texts)
print(y_test)


[[0.0747683  0.9252317 ]
 [0.07182115 0.92817885]
 [0.07182115 0.92817885]]


In [24]:
# Predict on the original training essays and show the second-column
# probability for the first 30 of them.
y_org_train = lr_tfidf.predict_proba(org_train['text'])
org_preview = y_org_train[:30, 1]
print(org_preview)


[0.00178684 0.06851632 0.05113438 0.01517225 0.00603028 0.00183517
 0.01222978 0.01222705 0.01238608 0.01105128 0.01311632 0.14083271
 0.0061343  0.00496021 0.002615   0.12489131 0.02201697 0.08991179
 0.00094293 0.00126712 0.01880021 0.25851241 0.00136356 0.04287428
 0.00440583 0.09281496 0.0238726  0.00372949 0.02561396 0.01007964]


In [25]:
# Build the Kaggle submission file: the second predict_proba column is written
# to the `generated` column of the submission template.
# NOTE(review): assumes the rows of `sub` are in the same order as `test` -- confirm.
generated_proba = y_test[:, 1]
sub['generated'] = generated_proba
sub.to_csv('submission.csv', index=False)
