# Detection of Large-Language Model (LLM) Generated Text

# Импорт библиотек

In [None]:
import pandas as pd
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns


# Загружаем и предобрабатываем данные

In [None]:
# Load the training dataset; later cells read its 'text' and 'label' columns.
data = pd.read_csv(filepath_or_buffer='data/train_v2_drcat_02.csv')

In [None]:
# Drop duplicate rows first, then any row that contains a missing value.
data = data.drop_duplicates().dropna()

# Векторизуем текст

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer that keeps at most 5000 terms (tune max_features as needed).
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split further down, so held-out rows contribute to the
# vocabulary and IDF weights.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary from the 'text' column and encode every row.
tfidf_vectors = tfidf_vectorizer.fit_transform(data['text'])

# Expand the result into a dense DataFrame with one column per term.
tfidf_df = pd.DataFrame(
    data=tfidf_vectors.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Preview the first rows of the TF-IDF table.
print(tfidf_df.head())

# Делим датасет на тренировочные и тестовые данные

In [None]:
# Feature matrix and target vector as NumPy arrays.
X = tfidf_df.to_numpy()
y = data['label'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Кодируем

In [None]:
# Fit the encoder on the training labels only, then map both splits through
# the same fitted encoder (labels are presumably already 0/1 - verify).
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Простая нейронная сеть для классификации текста

In [None]:
# Feed-forward binary classifier: two ReLU hidden layers, each followed by
# dropout, and a single sigmoid output unit.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


# Обучение

In [None]:
# Train for 10 epochs with mini-batches of 32. The held-out split is passed
# as validation data so its loss/accuracy are reported alongside training.
history = model.fit(
    X_train,
    y_train_encoded,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test_encoded),
    verbose=1,
)

In [None]:
# Loss and accuracy on the held-out split.
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test_encoded)
print(f'Test Accuracy: {test_accuracy:.4f}')

# Сохраняем модель

In [None]:
import pickle

# Pickle the fitted vectorizer so inference can reuse the same vocabulary.
with open('tfidf_tokenizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

# Persist the trained network to an HDF5 file.
model.save('text_classification_model.h5')


# Usage Code

In [None]:
import pickle
from tensorflow.keras.models import load_model

# Restore the pickled TF-IDF vectorizer.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('tfidf_tokenizer.pkl', 'rb') as vectorizer_file:
    tfidf_tokenizer = pickle.load(vectorizer_file)

# Restore the trained Keras model from disk.
loaded_model = load_model('text_classification_model.h5')

# Функция предсказания 
def predict_outcome(text):
    """Score a single text with the loaded vectorizer and model.

    Args:
        text: Raw text to score.

    Returns:
        Element ``[0][0]`` of the model's prediction array for this text
        (presumably the positive-class probability, given the sigmoid
        output layer used when the model was trained - verify).
    """
    # Vectorize as a one-document batch and densify for the model.
    dense_features = tfidf_tokenizer.transform([text]).toarray()
    return loaded_model.predict(dense_features)[0][0]

In [None]:
# Sample essay used to try out predict_outcome.
input_text = '''
Some people think that the only way to learn foreign languages is to start very early. Others believe that foreign languages can be learned at any age. In this essay I will address pluses and mines of learning foreign languages at an early age.
 On the one hand, since 4 years children talk with their parents very well. And if in family all speak on 2 languages, child will can speak on 2 languages too. It is the reason, because some people think that studying many languages is simply only at an early age. When I was younger my family often travel to different countries, when people speak in only English. And I learning to speak English to them. It was easy for me.
 On the other hand, many people move to another country. For them they must to learn another language. And they do it. That is an example of how people can learn another language at any age. I think, if people a person really wants to learn something, he can do it.
 In conclusion, somebody think that peoples can learn foreign languages at only an early age, but another think it can be done an any age. I think that everybody is right. All people can to learn foreign languages, but maybe it is easier to do it an early age.
'''

# Score the sample and print the result with 20 decimal places.
predicted_label = predict_outcome(input_text)
print("Predicted Label:", f"{predicted_label:.20f}")

# Делаем предсказание на тестовых данных

In [None]:
# Load the competition test set and display it.
test = pd.read_csv(filepath_or_buffer='/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
test

In [None]:
# Score every test text in one batched pass instead of calling
# Model.predict once per row: vectorize the whole 'text' column, densify it,
# and let Keras split it into batches internally.
test_texts = test['text']
if len(test_texts) > 0:
    test_features = tfidf_tokenizer.transform(test_texts).toarray()
    # predict returns one row per input; flatten to one score per text.
    test['generated'] = loaded_model.predict(test_features).ravel()
else:
    # Nothing to score; skip the model call (predict may reject an empty
    # batch) and still create the column.
    test['generated'] = pd.Series(dtype='float32')

# Keep only the columns written to the submission file.
submission = test[['id', 'generated']]
submission

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)