#Importing datasets

In [None]:
# https://drive.google.com/file/d/1pmNSD1nbYHEAiP065s4akRXHMWFs9Dqw/view?usp=sharing DBpedia train.csv
# https://drive.google.com/file/d/1mKededzdbJsWQnwsu-R_WSILYSvNEY7c/view?usp=sharing DBpedia test.csv
!pip install gdown 
!gdown --id 1pmNSD1nbYHEAiP065s4akRXHMWFs9Dqw --output train.csv  #import train.csv from drive
!gdown --id 1mKededzdbJsWQnwsu-R_WSILYSvNEY7c --output test.csv   #import test.csv from drive

Downloading...
From: https://drive.google.com/uc?id=1pmNSD1nbYHEAiP065s4akRXHMWFs9Dqw
To: /content/train.csv
100% 174M/174M [00:01<00:00, 127MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1mKededzdbJsWQnwsu-R_WSILYSvNEY7c
To: /content/test.csv
100% 21.8M/21.8M [00:00<00:00, 133MB/s]


In [1]:
import numpy as np
import pandas as pd

train_data = pd.read_csv('train.csv', encoding='utf8',header=None) #read csv to dataframe
test_data = pd.read_csv('test.csv', encoding='utf8',header=None)   #read csv to dataframe

train_data.where(train_data[0] < 6, inplace = True)  #select first 5 categories
train_data = train_data[train_data[0].notnull()]     #remove NaN values

test_data.where(test_data[0] < 6, inplace = True)    #select first 5 categories
test_data = test_data[test_data[0].notnull()]        #remove NaN values

In [2]:
train_data.sample(5) #sample from train data

Unnamed: 0,0,1,2
32715,1.0,Inmagine,INMAGINE is a supplier and distributor of Roy...
118390,3.0,T. Raumschmiere,Marco Haas (born 1975) is a punk techno DJ kn...
171652,5.0,Percy Bowers,The Venerable Percy Harris Bowers (1856–1922)...
177187,5.0,Benoy Choudhury,Benoy Choudhury (died 2000) was a revolutiona...
99883,3.0,Kutty (cartoonist),Puthukkody Kottuthody Sankaran Kutty Nair (4 ...


In [3]:
test_data.sample(5) #sample from test data

Unnamed: 0,0,1,2
2593,1.0,FLMNE,The Faculté Libre de Médecines Naturelles et ...
12712,3.0,Selden Edwards,Selden Edwards (born 1941) is an American wri...
1801,1.0,Société de transport du Saguenay,Société de transport du Saguenay (STS) is the...
9623,2.0,Webber Independent School,The Webber Independent School (formerly Bury ...
11471,3.0,T. R. Papa,Thiruthuraipoondi Radhakrishnan Pappa (Tamil:...


In [4]:
train_label = pd.to_numeric(train_data.iloc[:,0]) #select labels (int) from train data
train_text = train_data.iloc[:,1:3] #select article text from train data

In [5]:
test_label = pd.to_numeric(test_data.iloc[:,0]) #select labels (int) from test data
test_text = test_data.iloc[:,1:3] #select article text from train data

#Pre-processing

In [6]:
from sklearn.feature_extraction.text import CountVectorizer #vectorizer for article text data
from nltk.corpus import stopwords #stopwords for desktop usage
import nltk
nltk.download('stopwords') #stopwords for collab notebook

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kornel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
vectorizer = CountVectorizer(stop_words=stopwords.words('english'), analyzer='word', ngram_range=(1, 1)) #vectorizer for words, removing stopwords

In [8]:
test_title = test_text.iloc[:,0] #select titles from test text
test_desc = test_text.iloc[:,1]  #select descriptions from test text

In [9]:
train_title = train_text.iloc[:,0] #select titles from train text
train_desc = train_text.iloc[:,1]  #select descriptions from train text

In [10]:
titles = pd.concat([test_title, train_title]) #concat titles for vectorazition
descs = pd.concat([test_desc, train_desc]) #concat descriptions for vectorazition

In [11]:
titles = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(titles)) #vectorizing test_title and adding it to a dataframe
title_feature_names = np.asarray(vectorizer.get_feature_names()) #getting all feature names for test_title
descs = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(descs)) #vectorizing test_desc and adding it to a dataframe
desc_feature_names = np.asarray(vectorizer.get_feature_names()) #getting all feature names for test_desc

In [12]:
test_title = titles[0:len(test_title.index)]

In [13]:
test_title = test_title.loc[:, test_title.sum(axis=0) > 1]

In [14]:
#First 5 rows and feature names obtained with the vectoritzer
print(test_title[0:5])
print(title_feature_names[0:5])

   88      101     177     184     191     206     234     246     267     \
0       0       0       0       0       0       0       0       0       0   
1       0       0       0       0       0       0       0       0       0   
2       0       0       0       0       0       0       0       0       0   
3       0       0       0       0       0       0       0       0       0   
4       0       0       0       0       0       0       0       0       0   

   269     ...  127477  127487  127516  127536  127567  127570  127654  \
0       0  ...       0       0       0       0       0       0       0   
1       0  ...       0       0       0       0       0       0       0   
2       0  ...       0       0       0       0       0       0       0   
3       0  ...       0       0       0       0       0       0       0   
4       0  ...       0       0       0       0       0       0       0   

   127678  127921  128028  
0       0       0       0  
1       0       0       0  
2       

In [15]:
test_desc = descs.head(len(test_desc.index))

In [16]:
test_desc = test_desc.loc[:, test_desc.sum(axis=0) > 1]

In [17]:
#First 5 rows and feature names obtained with the vectorizer
print(test_desc[0:5])

   0       1       56      70      83      84      111     132     147     \
0       0       0       0       0       0       0       0       0       0   
1       0       0       0       0       0       0       0       0       0   
2       0       0       0       0       0       0       0       0       0   
3       0       0       0       0       0       0       0       0       0   
4       0       0       0       0       0       0       0       0       0   

   154     ...  313582  313671  313731  314204  314499  315294  315478  \
0       0  ...       0       0       0       0       0       0       0   
1       0  ...       0       0       0       0       0       0       0   
2       0  ...       0       0       0       0       0       0       0   
3       0  ...       0       0       0       0       0       0       0   
4       0  ...       0       0       0       0       0       0       0   

   315615  315709  315723  
0       0       0       0  
1       0       0       0  
2       

In [18]:
train_title = titles.tail(len(test_title.index))

In [19]:
train_title = train_title.loc[:, train_title.sum(axis=0) > 1]

In [20]:
#First 5 rows and feature names obtained with the vectoritzer
print(train_title[0:5])

        23      37      55      69      81      101     113     125     \
200000       0       0       0       0       0       0       0       0   
200001       0       0       0       0       0       0       0       0   
200002       0       0       0       0       0       0       0       0   
200003       0       0       0       0       0       0       0       0   
200004       0       0       0       0       0       0       0       0   

        137     165     ...  127511  127516  127528  127536  127568  127570  \
200000       0       0  ...       0       0       0       0       0       0   
200001       0       0  ...       0       0       0       0       0       0   
200002       0       0  ...       0       0       0       0       0       0   
200003       0       0  ...       0       0       0       0       0       0   
200004       0       0  ...       0       0       0       0       0       0   

        127572  127709  128021  128028  
200000       0       0       0       0 

In [21]:
train_desc = descs.tail(len(test_desc.index))

In [22]:
train_desc = train_desc.loc[:, train_desc.sum(axis=0) > 1]

In [23]:
#First 5 rows and feature names obtained with the vectoritzer
print(train_desc[0:5])

        0       83      111     132     147     154     169     181     \
200000       0       0       0       0       0       0       0       0   
200001       0       0       0       0       0       0       0       0   
200002       0       0       0       0       0       0       0       0   
200003       0       0       0       0       0       0       0       0   
200004       0       0       0       0       0       0       0       0   

        192     214     ...  313673  314371  314605  314629  314810  315442  \
200000       0       0  ...       0       0       0       0       0       0   
200001       0       0  ...       0       0       0       0       0       0   
200002       0       0  ...       0       0       0       0       0       0   
200003       0       0  ...       0       0       0       0       0       0   
200004       0       0  ...       0       0       0       0       0       0   

        315478  315556  315560  315718  
200000       0       0       0       0 

In [25]:
from scipy.sparse import csr_matrix #for min-max scaling sparse matrices

In [26]:
def normalize(df): #function for min-max scaling of dataframes
    result = df.copy()
    for feature_name in df.columns:
        max_value = csr_matrix(df[feature_name]).max()
        min_value = csr_matrix(df[feature_name]).min()
        result[feature_name] = (df[feature_name] - min_value) / max((max_value - min_value), 1)
    return result

In [27]:
test_title = normalize(test_title) #min-max scale test_title

In [28]:
print(test_title[0:5])

   88      101     177     184     191     206     234     246     267     \
0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

   269     ...  127477  127487  127516  127536  127567  127570  127654  \
0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

   127678  127921  128028  
0     0.0     0.0     0.0  
1     0.0     0.0     0.0  
2     0.

In [29]:
test_desc = normalize(test_desc) #min-max scale test_desc

In [30]:
print(test_desc[0:5])

   0       1       56      70      83      84      111     132     147     \
0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

   154     ...  313582  313671  313731  314204  314499  315294  315478  \
0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

   315615  315709  315723  
0     0.0     0.0     0.0  
1     0.0     0.0     0.0  
2     0.

In [31]:
train_title = normalize(train_title) #min-max scale train_title

In [32]:
print(train_title[0:5])

        23      37      55      69      81      101     113     125     \
200000     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
200001     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
200002     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
200003     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
200004     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

        137     165     ...  127511  127516  127528  127536  127568  127570  \
200000     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
200001     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
200002     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
200003     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
200004     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   

        127572  127709  128021  128028  
200000     0.0     0.0     0.0     0.0 

In [33]:
train_desc = normalize(train_desc) #min-max scale train_desc

In [34]:
print(train_desc[0:5])

        0       83      111     132     147     154     169     181     \
200000     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
200001     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
200002     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
200003     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
200004     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

        192     214     ...  313673  314371  314605  314629  314810  315442  \
200000     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
200001     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
200002     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
200003     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
200004     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   

        315478  315556  315560  315718  
200000     0.0     0.0     0.0     0.0 

In [45]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Flatten, Conv1D, MaxPooling1D, Dropout
from tensorflow.keras.optimizers import SGD
import matplotlib.pyplot as plt

In [46]:
input_shape = train_desc.head(1).shape

model = Sequential()
model.add(Conv1D(32, 1, activation='relu', input_shape=input_shape))
model.add(MaxPooling1D(pool_size=1))
model.add(Conv1D(64, 1, activation='relu'))
model.add(MaxPooling1D(pool_size=1))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))
model.compile(loss='mse', optimizer='adam', metrics=['mae'])

In [47]:
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_4 (Conv1D)            (None, 1, 32)             874112    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 1, 32)             0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 1, 64)             2112      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 1, 64)             0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout (Dropout)            (None, 128)              

In [48]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
patience=10
early_stopping=EarlyStopping(patience=patience, verbose=1)
checkpointer=ModelCheckpoint(filepath='model.hdf5', save_best_only=True, verbose=1)
tb = TensorBoard(log_dir='logs', histogram_freq=1, write_graph=1)

In [50]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

network_history = model.fit(train_desc, train_label, batch_size=128, epochs=30, verbose=1, validation_split=0.2, callbacks=[early_stopping, checkpointer, tb])

MemoryError: Unable to allocate 5.09 GiB for an array with shape (27315, 25000) and data type float64

Our **X** will be the sparse matrices and the feature names from the title and the description, and **Y** will be the labels provided in the csv file.

Validation split is obtained later in the model.fit() function from training data