#Importing datasets

In [3]:
# https://drive.google.com/file/d/1pmNSD1nbYHEAiP065s4akRXHMWFs9Dqw/view?usp=sharing DBpedia train.csv
# https://drive.google.com/file/d/1mKededzdbJsWQnwsu-R_WSILYSvNEY7c/view?usp=sharing DBpedia test.csv
!pip install gdown
!gdown --id 1pmNSD1nbYHEAiP065s4akRXHMWFs9Dqw --output train.csv  #import train.csv from drive
!gdown --id 1mKededzdbJsWQnwsu-R_WSILYSvNEY7c --output test.csv   #import test.csv from drive

Downloading...
From: https://drive.google.com/uc?id=1pmNSD1nbYHEAiP065s4akRXHMWFs9Dqw
To: /content/train.csv
100% 174M/174M [00:02<00:00, 67.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1mKededzdbJsWQnwsu-R_WSILYSvNEY7c
To: /content/test.csv
100% 21.8M/21.8M [00:00<00:00, 45.6MB/s]


In [4]:
import numpy as np
import pandas as pd

train_data = pd.read_csv('train.csv', encoding='utf8',header=None) #read csv to dataframe
test_data = pd.read_csv('test.csv', encoding='utf8',header=None)   #read csv to dataframe

train_data.where(train_data[0] < 6, inplace = True)  #select first 5 categories
train_data = train_data[train_data[0].notnull()]     #remove NaN values

test_data.where(test_data[0] < 6, inplace = True)    #select first 5 categories
test_data = test_data[test_data[0].notnull()]        #remove NaN values

In [5]:
train_data.sample(5) #sample from train data

Unnamed: 0,0,1,2
110354,3.0,Håkon Gebhardt,Håkon Gebhardt (born 21 June 1969 in Tromsø N...
165735,5.0,Canuto E. Celestino,Canuto E. Celestino (died 1903) was an Filipi...
110891,3.0,Ulla Pirttijärvi,Ulla Pirttijärvi is a Sami joik singer from t...
44258,2.0,The King's School Pontefract,The King's School is a coeducational secondar...
24181,1.0,IMK 14. oktobar Kruševac,Industrija mašina i komponenata - IMK 14. okt...


In [6]:
test_data.sample(5) #sample from test data

Unnamed: 0,0,1,2
24666,5.0,Edward J. Smith (American politician),Edward Smith was a member of the Rhode Island...
10902,3.0,I. Dragoslav,I. Dragoslav or Ion Dragoslav pen names of Io...
14153,3.0,Eliphalet Frazer Andrews,Eliphalet Frazer Andrews (11 June 1835 - 15 M...
12123,3.0,Utku Dalmaz,Utku Dalmaz (born September 9 1985) is a Turk...
1657,1.0,Brooks & Doxey,Brook & Doxey was a textile machinery manufac...


In [7]:
train_label = pd.to_numeric(train_data.iloc[:,0]) #select labels (int) from train data
train_text = train_data.iloc[:,1:3] #select article text from train data

In [8]:
test_label = pd.to_numeric(test_data.iloc[:,0]) #select labels (int) from test data
test_text = test_data.iloc[:,1:3] #select article text from train data

#Pre-processing

In [9]:
from sklearn.feature_extraction.text import CountVectorizer #vectorizer for article text data
from nltk.corpus import stopwords #stopwords for desktop usage
import nltk
nltk.download('stopwords') #stopwords for collab notebook

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
vectorizer = CountVectorizer(stop_words=stopwords.words('english'), analyzer='word', ngram_range=(1, 1)) #vectorizer for words, removing stopwords

In [11]:
test_title = test_text.iloc[:,0] #select titles from test text
test_desc = test_text.iloc[:,1]  #select descriptions from test text

In [12]:
train_title = train_text.iloc[:,0] #select titles from train text
train_desc = train_text.iloc[:,1]  #select descriptions from train text

In [13]:
titles = pd.concat([test_title, train_title]) #concat titles for vectorazition
descs = pd.concat([test_desc, train_desc]) #concat descriptions for vectorazition

In [14]:
titles = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(titles)) #vectorizing test_title and adding it to a dataframe
title_feature_names = np.asarray(vectorizer.get_feature_names_out()) #getting all feature names for test_title
descs = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(descs)) #vectorizing test_desc and adding it to a dataframe
desc_feature_names = np.asarray(vectorizer.get_feature_names_out()) #getting all feature names for test_desc

In [15]:
titles = titles.loc[:, titles.sum(axis=0) > 8]

In [16]:
titles.shape

(225000, 7177)

In [18]:
#descs = descs.loc[:, descs.sum(axis=0) > 8]

In [19]:
#descs.shape

In [17]:
test_title = titles[0:len(test_title.index)]
test_title.shape

(25000, 7177)

In [18]:
#First 5 rows and feature names obtained with the vectoritzer
print(test_title[0:5])
print(title_feature_names[0:5])

   5       23      38      212     ...  127570  127653  127803  128028
0       0       0       0       0  ...       0       0       0       0
1       0       0       0       0  ...       0       0       0       0
2       0       0       0       0  ...       0       0       0       0
3       0       0       0       0  ...       0       0       0       0
4       0       0       0       0  ...       0       0       0       0

[5 rows x 7177 columns]
['002' '05' '07' '09' '0verflow']


In [22]:
#test_desc = descs.head(len(test_desc.index))

In [23]:
#test_desc.shape

In [24]:
#First 5 rows and feature names obtained with the vectorizer
#print(test_desc[0:5])

In [19]:
train_title = titles.tail(len(titles) - len(test_title))

In [20]:
train_title.shape

(200000, 7177)

In [21]:
#First 5 rows and feature names obtained with the vectoritzer
print(train_title[0:5])

       5       23      38      212     ...  127570  127653  127803  128028
25000       0       0       0       0  ...       0       0       0       0
25001       0       0       0       0  ...       0       0       0       0
25002       0       0       0       0  ...       0       0       0       0
25003       0       0       0       0  ...       0       0       0       0
25004       0       0       0       0  ...       0       0       0       0

[5 rows x 7177 columns]


In [28]:
#train_desc = descs.tail(len(descs) - len(test_desc.index))

In [29]:
#train_desc.shape

In [30]:
#First 5 rows and feature names obtained with the vectoritzer
#print(train_desc[0:5])

In [22]:
from scipy.sparse import csr_matrix #for min-max scaling sparse matrices

In [23]:
def normalize(df): #function for min-max scaling of dataframes
    result = df.copy()
    for feature_name in df.columns:
        max_value = csr_matrix(df[feature_name]).max()
        min_value = csr_matrix(df[feature_name]).min()
        result[feature_name] = (df[feature_name] - min_value) / max((max_value - min_value), 1)
    return result

In [24]:
test_title = normalize(test_title) #min-max scale test_title

In [25]:
print(test_title[0:5])

   5       23      38      212     ...  127570  127653  127803  128028
0     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
1     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
2     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
3     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
4     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0

[5 rows x 7177 columns]


In [35]:
#test_desc = normalize(test_desc) #min-max scale test_desc

In [36]:
#print(test_desc[0:5])

In [26]:
train_title = normalize(train_title) #min-max scale train_title

In [27]:
print(train_title[0:5])

       5       23      38      212     ...  127570  127653  127803  128028
25000     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
25001     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
25002     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
25003     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
25004     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0

[5 rows x 7177 columns]


In [40]:
#train_desc = normalize(train_desc) #min-max scale train_desc

In [41]:
#print(train_desc[0:5])

In [136]:
input_shape = train_title.iloc[0].shape
input_shape = (input_shape[0], 1)
input_shape

(7177, 1)

In [137]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Flatten, Conv1D, MaxPooling1D, Dropout
from tensorflow.keras.optimizers import SGD
import matplotlib.pyplot as plt

In [162]:
model = Sequential()
model.add(Conv1D(32, 3, activation='relu', input_shape=input_shape))
model.add(MaxPooling1D(pool_size=1))
model.add(Conv1D(32, 3, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [163]:
print(model.summary())

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_23 (Conv1D)          (None, 7175, 32)          128       
                                                                 
 max_pooling1d_18 (MaxPoolin  (None, 7175, 32)         0         
 g1D)                                                            
                                                                 
 conv1d_24 (Conv1D)          (None, 7173, 32)          3104      
                                                                 
 max_pooling1d_19 (MaxPoolin  (None, 7173, 32)         0         
 g1D)                                                            
                                                                 
 flatten_9 (Flatten)         (None, 229536)            0         
                                                                 
 dense_18 (Dense)            (None, 16)              

In [164]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
patience=10
early_stopping=EarlyStopping(patience=patience, verbose=1)
checkpointer=ModelCheckpoint(filepath='model.hdf5', save_best_only=True, verbose=1)
tb = TensorBoard(log_dir='logs', histogram_freq=1, write_graph=1)

In [169]:
from tensorflow.keras.utils import Sequence, to_categorical

class DataGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, labels, batch_size=128, dim=(7177), n_channels=1,
                 n_classes=5, shuffle=True):
        #Initialization
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        #Denotes the number of batches per epoch
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        #Generate one batch of data
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        # Find list of IDs
        #list_IDs_temp = [self.list_IDs.iloc[k] for k in indexes]
        # Generate data
        X, y = self.__data_generation(indexes)

        return X, y

    def on_epoch_end(self):
        #Updates indexes after each epoch
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        #Generates data containing batch_size samples
        # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((self.batch_size, self.dim, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)
        # Generate data
        for i in range(len(list_IDs_temp)):
            # Store sample
            X[i] = self.list_IDs.iloc[i].to_numpy().reshape(self.dim, self.n_channels)

            # Store class
            y[i] = self.labels.iloc[i]

        return X, to_categorical(y, num_classes=self.n_classes)

In [170]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

train_generator = DataGenerator(train_title, train_label)

In [None]:
network_history = model.fit(train_generator, epochs=30, verbose=1, callbacks=[early_stopping, checkpointer, tb])

Epoch 1/30

In [None]:
from keras.models import load_model
model = load_model("model.hdf5")
test_err = model.evaluate(test_title, test_label)
print("Teszt hiba:", test_err[0], "Teszt pontosság:", test_err[1])

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, confusion_matrix
# először is nyerjük ki a predikciókat (valószínüség és hozzá tartozó pontosságot)
y_pred = model.predict(test_title)
y_pred = np.argmax(y_pred,1)
y_true = np.argmax(y_test,1)

print("test accuracy: %g" %(accuracy_score(y_true, y_pred)))
print("Precision", precision_score(y_true, y_pred, average="macro"))
print("Recall", recall_score(y_true, y_pred, average="macro"))
print("f1_score", f1_score(y_true, y_pred, average="macro"))
print("\nKonfúziós mátrix: ")
conf=confusion_matrix(y_true, y_pred)
print(conf)

In [None]:
import seaborn as sns
sns.heatmap(conf, annot=True, fmt='d', vmax=20) # a vmax paraméterrel állítjuk be, hogy milyen értéktartományban jelenítse meg az adatokat

Our **X** will be the sparse matrices and the feature names from the title and the description, and **Y** will be the labels provided in the csv file.

Validation split is obtained later in the model.fit() function from training data