In [50]:
import fasttext
import numpy as np
import pandas as pd
import matplotlib.pyplot as  plt
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import seaborn as sea
from sklearn.neighbors import KNeighborsClassifier
import catboost as cat
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split


def quality_metrics(marks, preds):
    """
    Функция выводит значения основных показателей качества обучения модели
    """
    print("Accuracy: {0:.9f}".format(accuracy_score(marks, preds)))
    print("Precision: {0:.9f}".format(precision_score(marks, preds, average='micro')))
    print("Recall: {0:.9f}".format(recall_score(marks, preds, average='micro')))
    print("F1: {0:.9f}".format(f1_score(marks, preds, average='micro')))

def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features

In [51]:
dataset = pd.read_csv('/content/selection_main (1).csv', delimiter=',')

In [52]:
dataset = dataset.sample(frac=1)

In [53]:
dataset

Unnamed: 0.1,Unnamed: 0,url,title,tags_names,abstract,class
3360,3360,https://arxiv.org/abs/2007.13926,Intelligent Optimization of Diversified Commun...,Neural and Evolutionary Computing; Artificial ...,Traditional Chinese medicine (TCM) has played ...,Fuzzy
3903,3903,https://arxiv.org/abs/1610.06490,An Ensemble of Adaptive Neuro-Fuzzy Kohonen Ne...,Artificial Intelligence,A new approach to data stream clustering with ...,Fuzzy
7711,7711,https://arxiv.org/abs/2208.14197,A Comprehensive Review of Digital Twin -- Part...,"Computational Engineering, Finance, and Scienc...",As an emerging technology in the era of Indust...,Control Systems
4790,4790,https://arxiv.org/abs/2205.05070,Tensor-based Collaborative Filtering With Smoo...,Information Retrieval; Machine Learning,Conventional collaborative filtering technique...,"Information Retrieval, Recommender Systems"
5797,5797,https://arxiv.org/abs/2202.09110,Iterative Learning for Instance Segmentation,Computer Vision and Pattern Recognition,Instance segmentation is a computer vision tas...,Computer Vision
...,...,...,...,...,...,...
2838,2838,https://arxiv.org/abs/2203.13036,Extending MAPE-K to support Human-Machine Teaming,Robotics; Software Engineering,The MAPE-K feedback loop has been established ...,Robotic
6777,6777,https://arxiv.org/abs/2110.11848,Clustering Market Regimes using the Wasserstei...,Computational Finance; Machine Learning; Mathe...,The problem of rapid and automated detection o...,Text Mining
8561,8561,https://arxiv.org/abs/2205.01355,Predicting Loose-Fitting Garment Deformations ...,Graphics; Computer Vision and Pattern Recognit...,We present a learning algorithm that uses bone...,Database
5853,5853,https://arxiv.org/abs/2202.12165,Transformers in Medical Image Analysis: A Review,Computer Vision and Pattern Recognition,Transformers have dominated the field of natur...,Computer Vision


In [54]:
training, test = dataset.iloc[:7000], dataset.iloc[7000:]

In [55]:
training['class'].value_counts()

Information Retrieval, Recommender Systems    721
Neural Nets                                   720
Cyber Security                                715
Database                                      703
Fuzzy                                         700
Expert                                        698
Text Mining                                   692
Computer Vision                               689
Control Systems                               681
Robotic                                       681
Name: class, dtype: int64

In [56]:
test['class'].value_counts()


Control Systems                               319
Robotic                                       319
Computer Vision                               311
Text Mining                                   308
Expert                                        302
Fuzzy                                         300
Database                                      297
Cyber Security                                285
Neural Nets                                   280
Information Retrieval, Recommender Systems    279
Name: class, dtype: int64

In [57]:
import nltk
nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
tokenizer = WordPunctTokenizer()
training['title'] = [tokenizer.tokenize(x.lower()) for x in training['title']]
training['abstract'] = [tokenizer.tokenize(x.lower()) for x in training['abstract']]
training['tags_names'] = [tokenizer.tokenize(x.lower()) for x in training['tags_names']]
test['title'] = [tokenizer.tokenize(x.lower()) for x in test['title']]
test['abstract'] = [tokenizer.tokenize(x.lower()) for x in test['abstract']]
test['tags_names'] = [tokenizer.tokenize(x.lower()) for x in test['tags_names']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training['title'] = [tokenizer.tokenize(x.lower()) for x in training['title']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training['abstract'] = [tokenizer.tokenize(x.lower()) for x in training['abstract']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training['tags_names'] = [tokenizer.token

In [59]:
for i in ['title', 'abstract', 'tags_names']:
    training[i] = [[x for x in y if x not in stopwords] for y in training[i]]
    test[i] = [[x for x in y if x not in stopwords] for y in test[i]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training[i] = [[x for x in y if x not in stopwords] for y in training[i]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[i] = [[x for x in y if x not in stopwords] for y in test[i]]


In [60]:
training = training.drop(columns=['url', 'Unnamed: 0'])
test = test.drop(columns=['url', 'Unnamed: 0'])

In [61]:
training['text'] = training['title'] + training['tags_names'] + training['abstract']
test['text'] = test['title'] + test['tags_names'] + test['abstract']

In [62]:
training = training.drop(columns=['title', 'tags_names', 'abstract'])
test = test.drop(columns=['title', 'tags_names', 'abstract'])

In [63]:
training = training[['text', 'class']]
test = test[['text', 'class']]

In [64]:
lem = WordNetLemmatizer()
training['text'] = [[lem.lemmatize(x) for x in i] for i in training['text']]
training['text'] = [[x for x in i if x.isalpha()] for i in training['text']]

In [67]:
test['text'] = [[lem.lemmatize(x) for x in i] for i in test['text']]
test['text'] = [[x for x in i if x.isalpha()] for i in test['text']]

In [68]:
test

Unnamed: 0,text,class
6693,"[social, fraud, detection, review, method, cha...",Text Mining
7746,"[motion, planning, control, multi, vehicle, au...",Control Systems
8422,"[nosql, security, data, driven, decision, maki...",Database
9433,"[phishing, attack, website, classification, us...",Cyber Security
9200,"[fixed, point, cyber, space, rethinking, optim...",Cyber Security
...,...,...
2838,"[extending, mape, k, support, human, machine, ...",Robotic
6777,"[clustering, market, regime, using, wasserstei...",Text Mining
8561,"[predicting, loose, fitting, garment, deformat...",Database
5853,"[transformer, medical, image, analysis, review...",Computer Vision


In [69]:
clses = ['Computer Vision', 'Control Systems', 'Cyber Security', 'Database', 'Expert', 'Fuzzy', "Information Retrieval, Recommender Systems", 'Neural Nets', 'Robotic', 'Text Mining']
dic_lbls = {clses[i]:i  for i in range(10)}
training = training.replace(dic_lbls)
training['labels'] = training['class']
training = training.drop(columns=['class'])
dic_lbls

{'Computer Vision': 0,
 'Control Systems': 1,
 'Cyber Security': 2,
 'Database': 3,
 'Expert': 4,
 'Fuzzy': 5,
 'Information Retrieval, Recommender Systems': 6,
 'Neural Nets': 7,
 'Robotic': 8,
 'Text Mining': 9}

In [70]:
training

Unnamed: 0,text,labels
3360,"[intelligent, optimization, diversified, commu...",5
3903,"[ensemble, adaptive, neuro, fuzzy, kohonen, ne...",5
7711,"[comprehensive, review, digital, twin, part, m...",1
4790,"[tensor, based, collaborative, filtering, smoo...",6
5797,"[iterative, learning, instance, segmentation, ...",0
...,...,...
3066,"[robust, fuzzy, q, learning, based, strictly, ...",5
742,"[attack, deidentification, defense, cryptograp...",4
2870,"[transferable, legged, mobile, manipulation, f...",8
8856,"[vldb, designing, hybrid, conference, database...",3


In [30]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [71]:
test['text'] = [[lem.lemmatize(x) for x in i] for i in test['text']]
test['text'] = [[x for x in i if x.isalpha()] for i in test['text']]
test = test.replace(dic_lbls)
test['labels'] = test['class']
test = test.drop(columns=['class'])

In [77]:
test

Unnamed: 0,text,labels
6693,"[social, fraud, detection, review, method, cha...",9
7746,"[motion, planning, control, multi, vehicle, au...",1
8422,"[nosql, security, data, driven, decision, maki...",3
9433,"[phishing, attack, website, classification, us...",2
9200,"[fixed, point, cyber, space, rethinking, optim...",2
...,...,...
2838,"[extending, mape, k, support, human, machine, ...",8
6777,"[clustering, market, regime, using, wasserstei...",9
8561,"[predicting, loose, fitting, garment, deformat...",3
5853,"[transformer, medical, image, analysis, review...",0


In [76]:
training

Unnamed: 0,text,labels
3360,"[intelligent, optimization, diversified, commu...",5
3903,"[ensemble, adaptive, neuro, fuzzy, kohonen, ne...",5
7711,"[comprehensive, review, digital, twin, part, m...",1
4790,"[tensor, based, collaborative, filtering, smoo...",6
5797,"[iterative, learning, instance, segmentation, ...",0
...,...,...
3066,"[robust, fuzzy, q, learning, based, strictly, ...",5
742,"[attack, deidentification, defense, cryptograp...",4
2870,"[transferable, legged, mobile, manipulation, f...",8
8856,"[vldb, designing, hybrid, conference, database...",3


In [80]:
training['text'] = [[lem.lemmatize(x) for x in i] for i in training['text']]
training['text'] = [[x for x in i if x.isalpha()] for i in training['text']]
training = training.replace(dic_lbls)
training['labels'] = training['labels']


In [82]:
training

Unnamed: 0,text,labels
3360,"[intelligent, optimization, diversified, commu...",5
3903,"[ensemble, adaptive, neuro, fuzzy, kohonen, ne...",5
7711,"[comprehensive, review, digital, twin, part, m...",1
4790,"[tensor, based, collaborative, filtering, smoo...",6
5797,"[iterative, learning, instance, segmentation, ...",0
...,...,...
3066,"[robust, fuzzy, q, learning, based, strictly, ...",5
742,"[attack, deidentification, defense, cryptograp...",4
2870,"[transferable, legged, mobile, manipulation, f...",8
8856,"[vldb, designing, hybrid, conference, database...",3


In [83]:
df = pd.concat([training, test])
df['text'] = df['text'].agg(' '.join)

In [84]:
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['labels'], test_size=0.3, stratify=df['labels'], random_state=123
)

In [85]:
corpus = X_train.values

FAST TEXT

In [88]:
with open('train.txt', 'w') as f:
    for each_text, each_label in zip(X_train.values, y_train):
        f.writelines(f'__label__{each_label} {each_text}\n')

with open('test.txt', 'w') as f:
    for each_text, each_label in zip(X_test.values, y_test):
        f.writelines(f'__label__{each_label} {each_text}\n')

In [90]:
!head -n 10 test.txt

__label__9 autohensgnn winning solution autograph challenge kdd cup machine learning artificial intelligence graph neural network gnns become increasingly popular achieved impressive result many graph based application however extensive manual work domain knowledge required design effective architecture result gnn model high variance different training setup limit application existing gnn model paper present autohensgnn framework build effective robust model graph task without human intervention autohensgnn first place autograph challenge kdd cup achieved best rank score five real life datasets final phase given task autohensgnn first applies fast proxy evaluation automatically select pool promising gnn model build hierarchical ensemble framework propose graph self ensemble gse reduce variance weight initialization efficiently exploit information local global neighborhood based gse weighted ensemble different type gnn model used effectively learn discriminative node representation effi

In [98]:
model1 = fasttext.train_supervised('train.txt')

def print_results(sample_size, precision, recall):
    precision   = round(precision, 2)
    recall      = round(recall, 2)
    F1 = 2 * (precision * recall) / (precision + recall)
    print(f'{sample_size=}')
    print(f'{precision=}')
    print(f'{recall=}')
    print(f'{F1=}')

# Применяем функцию
print_results(*model1.test('test.txt'))

sample_size=3000
precision=0.53
recall=0.53
F1=0.53


In [99]:
model2 = fasttext.train_supervised('train.txt', epoch=25)

print_results(*model2.test('test.txt'))

sample_size=3000
precision=0.82
recall=0.82
F1=0.82


In [106]:
model3 = fasttext.train_supervised('train.txt', epoch=50, lr=0.1)

print_results(*model3.test('test.txt'))

sample_size=3000
precision=0.82
recall=0.82
F1=0.82


In [107]:
model4 = fasttext.train_supervised('train.txt', epoch=50, lr=0.1, wordNgrams =2)

print_results(*model4.test('test.txt'))

sample_size=3000
precision=0.82
recall=0.82
F1=0.82


In [108]:
model5 = fasttext.train_supervised('train.txt', autotuneValidationFile='test.txt')

print_results(*model5.test('test.txt'))

sample_size=3000
precision=0.83
recall=0.83
F1=0.83


In [109]:
model6 = fasttext.train_supervised('train.txt', autotuneValidationFile='test.txt', autotuneMetric="f1:__label__1")

print_results(*model6.test('test.txt'))

sample_size=3000
precision=0.83
recall=0.83
F1=0.83


In [1]:
import os
from tensorflow import keras
from keras import layers
from keras.models import Sequential
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from keras.datasets import mnist
from sklearn.model_selection import train_test_split

In [2]:
(X_train, y_train), (X_test, y_test) = mnist.load_data()


In [3]:
# создание своего разбиения датасета
# объединяем в один набор
X = np.concatenate((X_train, X_test))
y = np.concatenate((y_train, y_test))

# разбиваем по вариантам
X_train, X_test, y_train, y_test = train_test_split(X, y,
 test_size = 10000,
 train_size = 60000,
 random_state = 4*9 - 1)

# вывод размерностей
print('Shape of X train:', X_train.shape)
print('Shape of y train:', y_train.shape)
print('Shape of X test:', X_test.shape)
print('Shape of y test:', y_test.shape)

Shape of X train: (60000, 28, 28)
Shape of y train: (60000,)
Shape of X test: (10000, 28, 28)
Shape of y test: (10000,)


In [4]:
# Зададим параметры данных и модели
num_classes = 10
input_shape = (28, 28, 1)

# Приведение входных данных к диапазону [0, 1]
X_train = X_train / 255
X_test = X_test / 255

# Расширяем размерность входных данных, чтобы каждое изображение имело
# размерность (высота, ширина, количество каналов)
X_train = np.expand_dims(X_train, -1)
X_test = np.expand_dims(X_test, -1)
print('Shape of transformed X train:', X_train.shape)
print('Shape of transformed X test:', X_test.shape)

# переведем метки в one-hot
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('Shape of transformed y train:', y_train.shape)
print('Shape of transformed y test:', y_test.shape)


Shape of transformed X train: (60000, 28, 28, 1)
Shape of transformed X test: (10000, 28, 28, 1)
Shape of transformed y train: (60000, 10)
Shape of transformed y test: (10000, 10)


In [5]:
# создаем модель
model = keras.Sequential()
model.add(keras.layers.Conv2D(32, kernel_size=(3, 3), activation="relu", input_shape=input_shape))
model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(keras.layers.Conv2D(64, kernel_size=(3, 3), activation="relu"))
model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(num_classes, activation="softmax"))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 26, 26, 32)        320       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 13, 13, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 11, 11, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 5, 5, 64)         0         
 2D)                                                             
                                                                 
 dropout (Dropout)           (None, 5, 5, 64)          0         
                                                                 
 flatten (Flatten)           (None, 1600)              0

In [None]:
# компилируем и обучаем модель
batch_size = 64
epochs = 15
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

# Оценка качества работы модели на тестовых данных
scores = model.evaluate(X_test, y_test)
print('Loss on test data:', scores[0])
print('Accuracy on test data:', scores[1])

Epoch 1/15
