# Работа с датасетами 

In [None]:
from datasets import load_dataset, Dataset, concatenate_datasets

# Stream both corpora instead of downloading them in full.
# NOTE(review): trust_remote_code=True runs the loading script shipped with
# each dataset repository on this machine — confirm both sources are trusted.
real_news_full = load_dataset(
    'IlyaGusev/ru_news',
    split="train",
    streaming=True,
    trust_remote_code=True,
)
fake_news_full = load_dataset(
    "its5Q/panorama",
    split="train",
    streaming=True,
    trust_remote_code=True,
)

# Keep only the first 10000 examples of each stream.
real_news = real_news_full.take(10000)
fake_news = fake_news_full.take(10000)


def _label_real(x):
    """Keep the article text and tag the example as real (label 1)."""
    return {'text': x['text'], "label": 1}


def _label_fake(x):
    """Expose the 'body' field as 'text' and tag the example as fake (label 0)."""
    return {'text': x['body'], "label": 0}


# Attach labels and drop every source column except "text", so both datasets
# end up with the same two columns: text and label.
real_extra_columns = [col for col in real_news.features if col != "text"]
real_news = real_news.map(_label_real, remove_columns=real_extra_columns)

fake_extra_columns = [col for col in fake_news.features if col != "text"]
fake_news = fake_news.map(_label_fake, remove_columns=fake_extra_columns)

# Merge the two labelled datasets into one.
combined_dataset = concatenate_datasets([real_news, fake_news])


In [None]:
import pandas as pd
# Pull every example of the combined dataset into a DataFrame
# and preview the first rows.
df = pd.DataFrame(data=combined_dataset)
df.head()

In [None]:
import numpy as np

# Fix the NumPy seed so the row permutation is reproducible.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)

# Reorder the rows by the permuted index, then persist them without the index column.
df = df.reindex(shuffled_index)
df.to_csv(
    'all_news_data.csv',
    index=False,
    encoding='utf-8',
)

In [None]:
# Reload the shuffled data from the CSV written earlier and preview it.
df = pd.read_csv(
    'all_news_data.csv',
    encoding='utf-8',
)
df.head()

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds 

# Build the raw tf.data source from the DataFrame.
# Force the text column to str first so every value is a plain string.
df['text'] = df['text'].astype(str)

# Split the labels off; whatever columns remain in df become the features.
target = df.pop('label')

# df.values is a 2-D array (one row per example), which is why later cells
# index each feature tensor with [0] to reach the text.
feature_rows = df.values
label_values = target.values
ds_raw = tf.data.Dataset.from_tensor_slices((feature_rows, label_values))


# Разделение данных на выборки

In [None]:
tf.random.set_seed(1)

# Shuffle once with a 20000-element buffer. reshuffle_each_iteration=False
# keeps the order fixed across iterations, so the take/skip splits below
# stay disjoint.
shuffle_buffer = 20000
ds_raw = ds_raw.shuffle(shuffle_buffer, reshuffle_each_iteration=False)

test_size = 10000
train_size = 6000

# First 10000 examples -> test; the remainder is shared by train and validation.
ds_raw_test = ds_raw.take(test_size)
ds_raw_train_valid = ds_raw.skip(test_size)

# Of the remainder: first 6000 -> train, everything after -> validation.
ds_raw_train, ds_raw_valid = (
    ds_raw_train_valid.take(train_size),
    ds_raw_train_valid.skip(train_size),
)

# Предобработка данных

In [None]:
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize  # was missing: the call below raised NameError
import pymorphy2

nltk.download('stopwords')
nltk.download('punkt')

# NLTK's Russian stop words plus a few extra frequent lemmas, lower-cased and
# stored as a set so the membership test in the loop below is O(1).
stop_words = (stopwords.words('russian')) + ['который', 'это', 'наш', 'свой', 'также', 'всё', 'весь']
stop_words = {word.lower() for word in stop_words}

morph = pymorphy2.MorphAnalyzer()

# Concatenate all training texts, each preceded by a single space.
# str.join builds the result in one pass; the previous `all_text += ...` loop
# re-copied the growing string on every iteration.
all_text = "".join(
    " " + example[0].numpy()[0].decode('utf-8')
    for example in ds_raw_train
)

tokens = wordpunct_tokenize(all_text.lower())

# Lemmatize alphabetic tokens and drop stop words.
# Each distinct word form is analysed only once: the same surface form always
# yields the same first parse, so caching does not change the output.
lemma_cache = {}
lemmatized_tokens = []
for word in tokens:
    if not word.isalpha():
        continue
    lemma = lemma_cache.get(word)
    if lemma is None:
        lemma = morph.parse(word)[0].normal_form
        lemma_cache[word] = lemma
    if lemma not in stop_words:
        lemmatized_tokens.append(lemma)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

# Vocabulary cap and the placeholder token for out-of-vocabulary words.
VOCAB_SIZE = 30000
OOV_TOKEN = "<UNK>"


def _decode_text(example):
    """Return the UTF-8 decoded text stored in the feature tensor of one example."""
    return example[0].numpy()[0].decode('utf-8')


# The vocabulary is fitted on the training split only.
all_texts = [_decode_text(example) for example in ds_raw_train]

tokenizer = Tokenizer(
    num_words=VOCAB_SIZE,
    oov_token=OOV_TOKEN,
    # Characters stripped from the text before it is split into words.
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)
tokenizer.fit_on_texts(all_texts)

In [None]:
def encode(text_tensor, label):
    """Convert one example's text into a list of tokenizer word indices.

    Args:
        text_tensor: Eager tensor whose first element is the UTF-8 encoded text.
        label: Label of the example; returned unchanged.

    Returns:
        A ``(indices, label)`` tuple, where ``indices`` is the integer sequence
        the module-level ``tokenizer`` produces for the text.
    """
    raw_bytes = text_tensor.numpy()[0]
    # texts_to_sequences works on a batch, so wrap the text and unwrap the result.
    sequences = tokenizer.texts_to_sequences([raw_bytes.decode('utf-8')])
    return sequences[0], label

In [None]:
def encode_map_fn(text, label):
    """Wrap ``encode`` in ``tf.py_function`` so it can be used in ``Dataset.map``.

    ``encode`` calls ``.numpy()`` and the Python-side tokenizer, so it has to
    run as an eager Python function. Both outputs are declared as int64.
    """
    output_dtypes = (tf.int64, tf.int64)
    return tf.py_function(func=encode, inp=[text, label], Tout=output_dtypes)

In [None]:
# Encode the text of all three splits into integer index sequences.
ds_train, ds_valid, ds_test = (
    split.map(encode_map_fn)
    for split in (ds_raw_train, ds_raw_valid, ds_raw_test)
)

In [None]:
# Split all three datasets into mini-batches of 32 examples.
# padded_shapes=([-1], []) pads each sequence to the longest one in its batch
# and leaves the scalar label untouched.
batch_size = 32
train_data, valid_data, test_data = (
    encoded.padded_batch(batch_size, padded_shapes=([-1], []))
    for encoded in (ds_train, ds_valid, ds_test)
)

# Построение модели на основе RNN

In [None]:
embedding_dim = 20

# Size of the embedding table, derived from the fitted tokenizer.
# (The previous `len(token_counts) + 2` referenced a name that is never
# defined in this notebook and raised NameError.)
# Index 0 is the padding value, and word_index (which includes the OOV token)
# starts at 1, hence the +1. With num_words=VOCAB_SIZE the tokenizer never
# emits an index >= VOCAB_SIZE, so the table is capped at that size.
vocab_size = min(VOCAB_SIZE, len(tokenizer.word_index) + 1)
tf.random.set_seed(1)

bi_lstm_model = tf.keras.Sequential([
    # padded_batch pads to the longest sequence of each batch, so the sequence
    # length varies between batches: the time axis must be None. The former
    # value 32 was the batch size, not the sequence length.
    tf.keras.layers.Input(shape=(None,)),
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        name='embed-layer'
    ),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, name='lstm-layer'),
        name='bidir-lstm'
    ),
    tf.keras.layers.Dense(64, activation='relu'),
    # Single sigmoid unit: the output is a probability for the binary label.
    tf.keras.layers.Dense(1, activation='sigmoid')
])
bi_lstm_model.summary()

In [None]:
# Compile and train.
# The model ends in a sigmoid, so the loss receives probabilities
# (from_logits=False).
optimizer = tf.keras.optimizers.Adam(1e-3)
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)
bi_lstm_model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

history = bi_lstm_model.fit(
    train_data,
    validation_data=valid_data,
    epochs=7,
)

# Evaluate on the test split; index 1 of the result is the accuracy metric
# (index 0 is the loss).
test_results = bi_lstm_model.evaluate(test_data)
test_accuracy = test_results[1]
print('Результат на тестовых: {:.2f}%'.format(test_accuracy*100))