<a href="https://colab.research.google.com/github/cyberknopa/ML-in-IS/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import re
import itertools
from collections import Counter


def clean_str(string):
    """
    Tokenization/string cleaning for datasets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [6]:
def load_data_and_labels():
    """
    Loads polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open("news-1.csv", "r", encoding='UTF-8').readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("Fake.csv", "r", encoding='UTF-8').readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]


def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences


def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    vocabulary_inv = list(sorted(vocabulary_inv))
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]


def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]


def load_data():
    """
    Loads and preprocessed data for the dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]

In [7]:
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from sklearn.model_selection import train_test_split

In [8]:
print('Loading data')
x, y, vocabulary, vocabulary_inv = load_data()

# x.shape -> (10662, 56)
# y.shape -> (10662, 2)
# len(vocabulary) -> 18765
# len(vocabulary_inv) -> 18765

X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=42)

# X_train.shape -> (8529, 56)
# y_train.shape -> (8529, 2)
# X_test.shape -> (2133, 56)
# y_test.shape -> (2133, 2)

Loading data


In [9]:
sequence_length = x.shape[1] # 56
vocabulary_size = len(vocabulary_inv) # 18765
embedding_dim = 256
filter_sizes = [3,4,5]
num_filters = 512
drop = 0.5

epochs = 10
batch_size = 30

In [10]:
print("Creating Model...")
inputs = Input(shape=(sequence_length,), dtype='int32')
embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, input_length=sequence_length)(inputs)
reshape = Reshape((sequence_length,embedding_dim,1))(embedding)

conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

maxpool_0 = MaxPool2D(pool_size=(sequence_length - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(sequence_length - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(sequence_length - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=2, activation='softmax')(dropout)

Creating Model...


In [11]:
print(load_data())

[array([[2215, 2749,    5, ..., 1207, 1207, 1207],
       [2215, 2749,    5, ..., 1207, 1207, 1207],
       [2215, 2749,    5, ..., 1207, 1207, 1207],
       ...,
       [   0, 1207, 1207, ..., 1207, 1207, 1207],
       [   5,    5,  465, ..., 1207, 1207, 1207],
       [   5, 1545,    5, ..., 1207, 1207, 1207]]), array([[0, 1],
       [0, 1],
       [0, 1],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]]), {'': 0, '!': 1, "'": 2, "''": 3, "'s": 4, ',': 5, '0': 6, '00': 7, '000': 8, '003': 9, '0060': 10, '008': 11, '01': 12, '012': 13, '014': 14, '01t00': 15, '01t13': 16, '01t14': 17, '01t15': 18, '01t16': 19, '01t17': 20, '01t18': 21, '01t19': 22, '01t20': 23, '01t21': 24, '01t22': 25, '01t23': 26, '02': 27, '02t14': 28, '02t15': 29, '02t16': 30, '02t17': 31, '02t18': 32, '02t19': 33, '02t20': 34, '02t21': 35, '02t22': 36, '02t23': 37, '03': 38, '03t00': 39, '03t15': 40, '03t16': 41, '03t17': 42, '03t18': 43, '03t19': 44, '03t20': 45, '03t21': 46, '03t22': 47, '03t23': 48, '04

In [12]:
# this creates a model that includes
model = Model(inputs=inputs, outputs=output)

checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_accuracy:.4f}.hdf5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='auto')
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
print("Traning Model...")
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[checkpoint], validation_data=(X_test, y_test))  # starts training

Traning Model...
Epoch 1/10


  super(Adam, self).__init__(name, **kwargs)


Epoch 1: val_accuracy improved from -inf to 1.00000, saving model to weights.001-1.0000.hdf5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 1.00000
Epoch 3/10
Epoch 3: val_accuracy did not improve from 1.00000
Epoch 4/10
Epoch 4: val_accuracy did not improve from 1.00000
Epoch 5/10
Epoch 5: val_accuracy did not improve from 1.00000
Epoch 6/10
Epoch 6: val_accuracy did not improve from 1.00000
Epoch 7/10
Epoch 7: val_accuracy did not improve from 1.00000
Epoch 8/10
Epoch 8: val_accuracy did not improve from 1.00000
Epoch 9/10
Epoch 9: val_accuracy did not improve from 1.00000
Epoch 10/10
Epoch 10: val_accuracy did not improve from 1.00000


<keras.callbacks.History at 0x7fc416b46dd0>

In [13]:
sample_text=('Экс-канцлер Австрии Себастьян Курц предложил странам Европейского союза (ЕС) сесть за стол переговоров с Россией для обсуждения вопроса по Украине. Его слова передает Bild am Sonntag.  По словам политика, европейским государствам необходимо мирное решение конфликта в том числе и для обеспечения собственной безопасности.  «Все справедливо хотят, чтобы Украина победила. Проигрыш — не вариант для [президента РФ Владимира] Путина. Ввиду ядерной угрозы Европейский союз должен настаивать на пути переговоров. Самое главное сейчас — остановить кровопролитие и найти мирное решение за столом переговоров, чтобы не допустить тотальной эскалации на нашем континенте», — заявил Курц.  Ранее бывший глава правительства Австрии рассказал о Путине в мемуарах.')
