# Importing datasets

In [None]:
# https://drive.google.com/file/d/1pmNSD1nbYHEAiP065s4akRXHMWFs9Dqw/view?usp=sharing DBpedia train.csv
# https://drive.google.com/file/d/1mKededzdbJsWQnwsu-R_WSILYSvNEY7c/view?usp=sharing DBpedia test.csv
!pip install gdown 
!gdown --id 1pmNSD1nbYHEAiP065s4akRXHMWFs9Dqw --output train.csv  #import train.csv from drive
!gdown --id 1mKededzdbJsWQnwsu-R_WSILYSvNEY7c --output test.csv   #import test.csv from drive

Downloading...
From: https://drive.google.com/uc?id=1pmNSD1nbYHEAiP065s4akRXHMWFs9Dqw
To: /content/train.csv
100% 174M/174M [00:01<00:00, 161MB/s]
Downloading...
From: https://drive.google.com/uc?id=1mKededzdbJsWQnwsu-R_WSILYSvNEY7c
To: /content/test.csv
100% 21.8M/21.8M [00:00<00:00, 133MB/s]


In [None]:
import numpy as np
import pandas as pd

NUM_CLASSES = 5  # only classes 1..NUM_CLASSES of the dataset are used


def load_first_classes(path, num_classes=NUM_CLASSES):
    """Read a header-less CSV and keep the rows whose label is at most `num_classes`.

    Column 0 holds the class id. Rows are selected with a boolean mask, so the
    label column keeps its integer dtype (the previous where()/notnull() round
    trip went through NaN and turned the labels into floats) and the original
    row index is preserved. Rows with a missing label are dropped, as before.
    """
    frame = pd.read_csv(path, encoding='utf8', header=None)
    return frame[frame[0] <= num_classes].copy()


train_data = load_first_classes('train.csv')  # training rows of the first NUM_CLASSES classes
test_data = load_first_classes('test.csv')    # test rows of the first NUM_CLASSES classes

In [None]:
train_data.sample(5)  # display 5 random training rows (no random_state is set, so the rows differ between runs)

Unnamed: 0,0,1,2
109465,3.0,Felix Schlag,Felix Oscar Schlag (September 4 1891 – March ...
184911,5.0,Raymond Pryor,Raymond A. Pryor is a former Democratic membe...
169570,5.0,Joe Purcell,Joe Edward Purcell (July 29 1923 – March 5 19...
185117,5.0,Abdulaziz bin Mohieddin Khoja,Abdulaziz bin Mohieddin Khoja (born 1940) is ...
20073,1.0,9 Story Entertainment,9 Story Entertainment is an animation studio ...


In [None]:
test_data.sample(5)  # display 5 random test rows (no random_state is set, so the rows differ between runs)

Unnamed: 0,0,1,2
17522,4.0,Gary Neibauer,Gary Wayne Neibauer (born October 29 1944) is...
2797,1.0,Commercial Bank Chad,Commercial Bank Chad also spelled as Commerc...
21747,5.0,Oliver H. Prince,Oliver Hillhouse Prince (1787 – October 9 183...
19412,4.0,Alf Amos,Herbert Amos (9 February 1893 - 9 March 1964)...
3235,1.0,Siemens Mobile,Siemens Mobile was a mobile phone manufacture...


In [None]:
# Column 0 is the class id; columns 1 and 2 are the article title and description.
train_label = pd.to_numeric(train_data.iloc[:,0])  # numeric class labels of the training rows
train_text = train_data.iloc[:,1:3]  # title and description columns of the training rows

In [None]:
# Column 0 is the class id; columns 1 and 2 are the article title and description.
test_label = pd.to_numeric(test_data.iloc[:,0])  # numeric class labels of the test rows
test_text = test_data.iloc[:,1:3]  # title and description columns of the test rows

# Pre-processing

In [None]:
from sklearn.feature_extraction.text import CountVectorizer  # bag-of-words vectorizer for the article text
from nltk.corpus import stopwords  # NLTK stop-word lists (the corpus itself is downloaded below)
import nltk
nltk.download('stopwords')  # fetch the stop-word corpus; needed on a fresh runtime such as Colab

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Word-level unigram counts with NLTK's English stop words removed.
vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    stop_words=stopwords.words('english'),
)

In [None]:
test_title = test_text.iloc[:,0]  # first text column: test titles
test_desc = test_text.iloc[:,1]   # second text column: test descriptions

In [None]:
train_title = train_text.iloc[:,0]  # first text column: train titles
train_desc = train_text.iloc[:,1]   # second text column: train descriptions

In [None]:
# Test rows are placed first and train rows after them; later cells rely on this order
# to split the vectorized matrices back into test and train parts.
titles = pd.concat([test_title, train_title])  # all titles, for vectorization
descs = pd.concat([test_desc, train_desc])  # all descriptions, for vectorization

In [None]:
# Fit a vocabulary on the combined (test + train) titles and store the counts in a sparse DataFrame.
# NOTE(review): the vocabulary is fitted on test rows as well as train rows — confirm this is intended.
titles = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(titles))
title_feature_names = np.asarray(vectorizer.get_feature_names_out())  # vocabulary terms fitted on the titles
# The same vectorizer object is re-fitted here, so from now on it holds the description vocabulary.
descs = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(descs))
desc_feature_names = np.asarray(vectorizer.get_feature_names_out())  # vocabulary terms fitted on the descriptions

In [None]:
# Keep only the columns whose total count over all rows is greater than 8.
titles = titles.loc[:, titles.sum(axis=0) > 8]

In [None]:
titles.shape  # (number of rows, number of retained title columns)

(225000, 7177)

In [None]:
# Keep only the columns whose total count over all rows is greater than 8.
descs = descs.loc[:, descs.sum(axis=0) > 8]

In [None]:
descs.shape  # (number of rows, number of retained description columns)

(225000, 37127)

In [None]:
# The test titles were concatenated first, so they occupy the leading rows of `titles`.
test_title = titles.iloc[:len(test_title)]
test_title.shape

(25000, 7177)

In [None]:
# First 5 rows of the test title matrix, followed by the vocabulary terms that belong
# to the first 5 retained columns. The column labels are the original vocabulary
# positions, so they index title_feature_names directly; slicing the name array with
# [0:5] would show terms of columns that may have been filtered out.
print(test_title[0:5])
print(title_feature_names[test_title.columns[:5].to_numpy()])

   5       23      38      212     ...  127570  127653  127803  128028
0       0       0       0       0  ...       0       0       0       0
1       0       0       0       0  ...       0       0       0       0
2       0       0       0       0  ...       0       0       0       0
3       0       0       0       0  ...       0       0       0       0
4       0       0       0       0  ...       0       0       0       0

[5 rows x 7177 columns]
['002' '05' '07' '09' '0verflow']


In [None]:
# The test descriptions were concatenated first, so they are the leading rows of `descs`.
test_desc = descs.head(len(test_desc.index))

In [None]:
test_desc.shape  # (number of test rows, number of retained description columns)

(25000, 37127)

In [None]:
# First 5 rows of the test description count matrix.
print(test_desc[0:5])

   0       1       81      83      ...  315123  315294  315478  315503
0       0       0       0       0  ...       0       0       0       0
1       0       0       0       0  ...       0       0       0       0
2       0       0       0       0  ...       0       0       0       0
3       0       0       0       0  ...       0       0       0       0
4       0       0       0       0  ...       0       0       0       0

[5 rows x 37127 columns]


In [None]:
# Everything after the leading test rows of `titles` belongs to the training set.
train_title = titles.tail(len(titles) - len(test_title))

In [None]:
train_title.shape  # (number of train rows, number of retained title columns)

(200000, 7177)

In [None]:
# First 5 rows of the train title count matrix.
print(train_title[0:5])

       5       23      38      212     ...  127570  127653  127803  128028
25000       0       0       0       0  ...       0       0       0       0
25001       0       0       0       0  ...       0       0       0       0
25002       0       0       0       0  ...       0       0       0       0
25003       0       0       0       0  ...       0       0       0       0
25004       0       0       0       0  ...       0       0       0       0

[5 rows x 7177 columns]


In [None]:
# Everything after the leading test rows of `descs` belongs to the training set.
train_desc = descs.tail(len(descs) - len(test_desc.index))

In [None]:
train_desc.shape  # (number of train rows, number of retained description columns)

(200000, 37127)

In [None]:
# First 5 rows of the train description count matrix.
print(train_desc[0:5])

       0       1       81      83      ...  315123  315294  315478  315503
25000       0       0       0       0  ...       0       0       0       0
25001       0       0       0       0  ...       0       0       0       0
25002       0       0       0       0  ...       0       0       0       0
25003       0       0       0       0  ...       0       0       0       0
25004       0       0       0       0  ...       0       0       0       0

[5 rows x 37127 columns]


In [None]:
from scipy.sparse import csr_matrix #for min-max scaling sparse matrices

In [None]:
def normalize(df, reference=None):
    """Min-max scale every column of a DataFrame and return the result as a new frame.

    Each column is mapped to ``(column - min) / (max - min)``. A constant column
    (range 0) is only shifted by its minimum, so it becomes all zeros instead of
    triggering a division by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data to scale (dense or sparse columns). It is not modified.
    reference : pandas.DataFrame, optional
        Frame whose per-column min/max are used for the scaling, e.g. the
        training data when scaling a test set. It must have the same columns
        as ``df``. Defaults to ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        Scaled copy with the same index and columns as ``df``.

    Raises
    ------
    ValueError
        If ``reference`` is given and its columns differ from those of ``df``.
    """
    stats_source = df if reference is None else reference
    if reference is not None and not df.columns.equals(reference.columns):
        raise ValueError("reference must have the same columns as df")
    # Nothing to scale, or no rows to take a minimum/maximum from.
    if df.shape[1] == 0 or len(stats_source) == 0:
        return df.copy()

    scaled_columns = []
    for (_, column), (_, stats_column) in zip(df.items(), stats_source.items()):
        # One dense view per column serves both the minimum and the maximum.
        values = np.asarray(stats_column)
        lowest = values.min()
        span = values.max() - lowest
        # Divide by the true range; only a zero range is replaced by 1.
        scaled_columns.append((column - lowest) / (span if span != 0 else 1))
    # Assemble the frame in one step instead of inserting column by column.
    return pd.concat(scaled_columns, axis=1)

In [None]:
# Min-max scale test_title using the min/max of the test rows themselves.
# NOTE(review): the train matrix is scaled separately with its own min/max — confirm that is intended.
test_title = normalize(test_title)

In [None]:
print(test_title[0:5])  # first 5 rows of the scaled test titles

   5       23      38      212     ...  127570  127653  127803  128028
0     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
1     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
2     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
3     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
4     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0

[5 rows x 7177 columns]


In [None]:
# Min-max scale test_desc using the min/max of the test rows themselves.
# NOTE(review): the train matrix is scaled separately with its own min/max — confirm that is intended.
test_desc = normalize(test_desc)

In [None]:
print(test_desc[0:5])  # first 5 rows of the scaled test descriptions

   0       1       81      83      ...  315123  315294  315478  315503
0     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
1     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
2     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
3     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
4     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0

[5 rows x 37127 columns]


In [None]:
# Min-max scale train_title using the min/max of the train rows.
train_title = normalize(train_title)

In [None]:
print(train_title[0:5])  # first 5 rows of the scaled train titles

       5       23      38      212     ...  127570  127653  127803  128028
25000     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
25001     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
25002     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
25003     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
25004     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0

[5 rows x 7177 columns]


In [None]:
# Min-max scale train_desc using the min/max of the train rows.
train_desc = normalize(train_desc)

In [None]:
print(train_desc[0:5])  # first 5 rows of the scaled train descriptions

       0       1       81      83      ...  315123  315294  315478  315503
25000     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
25001     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
25002     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
25003     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
25004     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0

[5 rows x 37127 columns]


In [None]:
  # Shape of a one-row slice of train_title: (rows in the slice, number of title columns).
  # It is passed as input_shape to the first Conv1D layer when the model is built.
  input_shape = train_title.head(1).shape

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Flatten, Conv1D, MaxPooling1D, Dropout
from tensorflow.keras.optimizers import SGD
import matplotlib.pyplot as plt

In [None]:
# 1D-convolutional classifier over the title features, ending in a 5-way softmax.
model = Sequential()
# input_shape comes from a one-row slice of train_title, so the model expects
# inputs shaped (samples, 1, n_features).
model.add(Conv1D(32, 1, activation='relu', input_shape=input_shape))
model.add(MaxPooling1D(pool_size=1))
model.add(Conv1D(64, 1, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))  # one output per class
# This is a classification model: use a cross-entropy loss and accuracy instead of the
# regression measures 'mse'/'mae'. The sparse variant takes integer class ids, which
# must be 0-based (0..4) to match the 5 softmax outputs.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_2 (Conv1D)           (None, 1, 32)             229696    
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 1, 32)            0         
 1D)                                                             
                                                                 
 conv1d_3 (Conv1D)           (None, 1, 64)             2112      
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 1, 64)            0         
 1D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 128)              

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
# Callbacks handed to the training call.
patience=10  # value passed to EarlyStopping below
early_stopping=EarlyStopping(patience=patience, verbose=1)
# Writes the model to model.hdf5; with save_best_only=True only the best model so far is kept.
checkpointer=ModelCheckpoint(filepath='model.hdf5', save_best_only=True, verbose=1)
# TensorBoard logging into the 'logs' directory.
tb = TensorBoard(log_dir='logs', histogram_freq=1, write_graph=1)

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

#datagen = ImageDataGenerator()
#train_generator = datagen.flow(train_title.head(2000), train_label.head(2000), batch_size=64)

In [None]:
# The labels are integer class ids (1..5), not one-hot vectors, so the sparse
# categorical cross-entropy is used together with 0-based ids.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

BATCH_SIZE = 64
VALIDATION_FRACTION = 0.2

# Keep the features sparse (CSR supports row selection) and densify one batch at a
# time; assumes train_title is an all-sparse DataFrame, as built by the cells above.
train_matrix = train_title.sparse.to_coo().tocsr()
train_targets = train_label.to_numpy().astype('int64') - 1

# Hold out a random part of the training rows for validation. The split is shuffled
# with a fixed seed because train.csv appears to be ordered by class (see the sampled
# row indices), in which case a tail split would validate on a single class only.
shuffled_rows = np.random.default_rng(42).permutation(train_matrix.shape[0])
n_val_rows = int(len(shuffled_rows) * VALIDATION_FRACTION)
val_rows, fit_rows = shuffled_rows[:n_val_rows], shuffled_rows[n_val_rows:]


def batch_generator(rows, shuffle):
    """Yield (features, labels) batches endlessly for model.fit.

    rows: row positions of train_matrix to draw from.
    shuffle: when True, the rows are reshuffled before every pass.
    Features are densified per batch and reshaped to (batch, *input_shape),
    the layout the first Conv1D layer was built for.
    """
    rows = np.array(rows)  # private copy, so shuffling does not touch the caller's array
    shuffler = np.random.default_rng(0)
    while True:
        if shuffle:
            shuffler.shuffle(rows)
        for start in range(0, len(rows), BATCH_SIZE):
            batch = rows[start:start + BATCH_SIZE]
            features = train_matrix[batch].toarray().astype('float32')
            yield features.reshape((-1,) + tuple(input_shape)), train_targets[batch]


# model.fit replaces the deprecated fit_generator, which has no validation_split
# argument; validation data is supplied explicitly instead. -(-a // b) is ceil(a / b).
network_history = model.fit(
    batch_generator(fit_rows, shuffle=True),
    steps_per_epoch=-(-len(fit_rows) // BATCH_SIZE),
    validation_data=batch_generator(val_rows, shuffle=False),
    validation_steps=-(-len(val_rows) // BATCH_SIZE),
    epochs=30,
    verbose=1,
    callbacks=[early_stopping, checkpointer, tb],
)

In [None]:
from tensorflow.keras.models import load_model  # same Keras package the model was built with

# Reload the model file written by the ModelCheckpoint callback.
model = load_model("model.hdf5")
# The model expects inputs shaped (samples, *input_shape) and, with a sparse categorical
# cross-entropy loss, 0-based integer class ids; the stored labels are 1..5.
test_features = test_title.to_numpy(dtype='float32').reshape((-1,) + tuple(input_shape))
test_targets = test_label.to_numpy().astype('int64') - 1
test_err = model.evaluate(test_features, test_targets)
print("Teszt hiba:", test_err[0], "Teszt pontosság:", test_err[1])

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, confusion_matrix
# First obtain the predictions: class probabilities per test row, reduced to the most
# probable class. The features are reshaped to the (samples, *input_shape) layout the
# model was built for.
test_features = test_title.to_numpy(dtype='float32').reshape((-1,) + tuple(input_shape))
y_pred = model.predict(test_features)
y_pred = np.argmax(y_pred,1)
# Ground truth: the labels are stored as class ids 1..5, while argmax yields 0-based
# output positions, so the labels are shifted to 0..4 (they are not one-hot encoded).
y_true = test_label.to_numpy().astype('int64') - 1

print("test accuracy: %g" %(accuracy_score(y_true, y_pred)))
print("Precision", precision_score(y_true, y_pred, average="macro"))
print("Recall", recall_score(y_true, y_pred, average="macro"))
print("f1_score", f1_score(y_true, y_pred, average="macro"))
print("\nKonfúziós mátrix: ")
conf=confusion_matrix(y_true, y_pred)
print(conf)

In [None]:
import seaborn as sns
# The vmax parameter sets the upper end of the value range mapped onto the colour scale.
# NOTE(review): with vmax=20 every cell holding 20 or more samples gets the same top colour — confirm this is intended.
sns.heatmap(conf, annot=True, fmt='d', vmax=20)

Our **X** will be the sparse matrices (together with their feature names) built from the titles and the descriptions, and **Y** will be the class labels provided in the CSV files.

The validation split is obtained later, from the training data, in the `model.fit()` function.