In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
import language_tool_python
##  StratifiedKFold、cross_val_scoreをインポート
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score


In [2]:
test = pd.read_csv('../input/test_essays.csv')
sub = pd.read_csv('../input/sample_submission.csv')
external_train = pd.read_csv("../input/train_essays.csv")
external_train.rename(columns={"generated": "label"}, inplace=True)

train = pd.read_csv("../input/train_v2_drcat_02.csv")
train = pd.concat([train, external_train])

train = train.drop_duplicates(subset=["text"])

train.reset_index(drop=True).sample(frac=1)




Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,id,prompt_id
41656,The author effectively supports the idea that...,1,Exploring Venus,cohere-command,True,,
24773,"When you are seeking for advice, why do you of...",0,Seeking multiple opinions,persuade_corpus,False,,
33686,The pursuit of progress and the constant purs...,1,Summer projects,mistral7binstruct_v2,False,,
11472,"I do think this could be helpful in many ways,...",0,Facial action coding system,persuade_corpus,True,,
9731,"""The Challenge of exploring venus"" it talks ab...",0,Exploring Venus,persuade_corpus,True,,
...,...,...,...,...,...,...,...
18818,"Dear State Senator,\n\nAfter much research, I ...",0,Does the electoral college work?,persuade_corpus,True,,
22111,Dear Principal\n\nYou should take Policy 1 bec...,0,Cell phones at school,persuade_corpus,False,,
685,Phones and Driving\n\nEveryday people die beca...,0,Phones and driving,persuade_corpus,False,,
13810,"Dear Principal,\n\nThere has been a lot of con...",0,Community service,persuade_corpus,False,,


In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matsuisouta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matsuisouta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

# ストップワードのロード
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # HTMLタグの削除
    text = re.sub(r'<.*?>', '', text)
    # 句読点、数字、特殊記号の削除
    text = re.sub(r'[^\w\s]', '', text)
    # 小文字化
    text = text.lower()
    return text

# カスタムトークナイザー
def tokenize(text):
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words]

def calculate_length(data):
    # pandas.Seriesの各要素に対して長さを計算
    return [[len(text.split())] for text in data]


# テキストデータのクリーニング
train['clean_text'] = train['text'].apply(clean_text)
test['clean_text'] = test['text'].apply(clean_text)

In [16]:
print(train.columns)
print(test.columns)


Index(['text', 'label', 'prompt_name', 'source', 'RDizzl3_seven', 'id',
       'prompt_id', 'clean_text', 'text_length'],
      dtype='object')
Index(['id', 'prompt_id', 'text', 'clean_text', 'text_length'], dtype='object')


In [20]:
# 特徴量を組み合わせるためのTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# 数値特徴量用のスケーラー
scaler = StandardScaler()

# TF-IDFベクトル化器を作成
tfidf = TfidfVectorizer(
    ngram_range=(3, 5),
    tokenizer=tokenize,
    strip_accents="unicode"
    max_features=10000
)

# 文の長さの特徴量変換器
length_transformer = Pipeline([
    ('length', FunctionTransformer(calculate_length, validate=False)),
    ('scale', StandardScaler())
])

# ColumnTransformerの設定
preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', tfidf, 'clean_text'),  # 'clean_text' 列に TF-IDF ベクトル化を適用
        ('length', length_transformer, 'text')  # 'text' 列に文の長さ計算を適用
    ]
)

# 特徴量変換の実行
train_x = preprocessor.fit_transform(train)
test_x = preprocessor.transform(test)



In [21]:
print(train_x.shape)
print(test_x.shape)

(44868, 19803183)
(3, 19803183)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB

lr = LogisticRegression()
clf = MultinomialNB(alpha=0.02)
sgd_model = SGDClassifier(max_iter=5000, tol=1e-3, loss="modified_huber")   
sgd_model2 = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber", class_weight="balanced") 
sgd_model3 = SGDClassifier(max_iter=10000, tol=5e-4, loss="modified_huber", early_stopping=True)

ensemble = VotingClassifier(
    estimators=[
        ("lr", lr),
        ("mnb", clf),
        ("sgd", sgd_model),
        ("sgd2", sgd_model2),
        ("sgd3", sgd_model3),
    ],
    voting="soft",
)



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# データをトレーニングセットとテストセットに分割
train_data, test_data, train_labels, test_labels = train_test_split(train['clean_text'], train.label, test_size=0.2, random_state=42)

## tfidfをfit
train_data = tfidf.fit_transform(train_data)
test_data = tfidf.transform(test_data)


# モデルをトレーニング
ensemble.fit(train_data, train_labels)

# テストセットでモデルを評価
predicted_probs = ensemble.predict_proba(test_data)[:, 1]

# AUCを計算
auc = roc_auc_score(test_labels, predicted_probs )
print(f'AUC: {auc}')


AUC: 0.998313252772911


In [None]:
ensemble.fit(train_x, train.label)

In [None]:
test["generated"] = ensemble.predict_proba(test_x)[:, 1]


In [None]:
#kaggleの提出用ファイルを作成
test[["id", "generated"]].to_csv("submission.csv", index=False)
