# Importing datasets

In [None]:
# https://drive.google.com/file/d/1pmNSD1nbYHEAiP065s4akRXHMWFs9Dqw/view?usp=sharing DBpedia train.csv
# https://drive.google.com/file/d/1mKededzdbJsWQnwsu-R_WSILYSvNEY7c/view?usp=sharing DBpedia test.csv
!pip install gdown 
!gdown --id 1pmNSD1nbYHEAiP065s4akRXHMWFs9Dqw --output train.csv  #import train.csv from drive
!gdown --id 1mKededzdbJsWQnwsu-R_WSILYSvNEY7c --output test.csv   #import test.csv from drive

Downloading...
From: https://drive.google.com/uc?id=1pmNSD1nbYHEAiP065s4akRXHMWFs9Dqw
To: /content/train.csv
100% 174M/174M [00:01<00:00, 127MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1mKededzdbJsWQnwsu-R_WSILYSvNEY7c
To: /content/test.csv
100% 21.8M/21.8M [00:00<00:00, 133MB/s]


In [None]:
import numpy as np
import pandas as pd

LABEL_LIMIT = 6  # rows whose label (column 0) is below this value are kept, i.e. the first 5 categories

# The CSV files have no header row, so pandas numbers the columns 0, 1, 2.
train_data = pd.read_csv('train.csv', encoding='utf8', header=None)
test_data = pd.read_csv('test.csv', encoding='utf8', header=None)

# Select the rows with a boolean mask. This keeps exactly the rows that the
# previous where(..., inplace=True) + notnull() combination kept, but it does
# not blank rows to NaN first, so the label column keeps its integer dtype
# instead of being converted to float.
train_data = train_data[train_data[0] < LABEL_LIMIT]
test_data = test_data[test_data[0] < LABEL_LIMIT]

In [None]:
train_data.sample(n=5) #show five randomly drawn rows of the training data

Unnamed: 0,0,1,2
185848,5.0,Joe Bock (academic),Joe Bock is an official with the University o...
2424,1.0,Durtro,Durtro was a British independent record label...
113008,3.0,Adam Gregory,Adam Gregory (born July 12 1985) is a Canadia...
166365,5.0,Max Bradford,Maxwell Robert (Max) Bradford (born 19 Januar...
160100,5.0,George Strahan,Major Sir George Cumine Strahan KCMG (9 Decem...


In [None]:
test_data.sample(n=5) #show five randomly drawn rows of the test data

Unnamed: 0,0,1,2
134,1.0,Pass Transit,PASS Transit is a transit agency in Riverside...
3761,1.0,Just Add Water (company),Just Add Water is an Independent video game d...
10557,3.0,Robert Penn Warren,Robert Penn Warren (April 24 1905 – September...
9548,2.0,Hunter High School (Scotland),Hunter High School was a non-denominational s...
3934,1.0,Visco Corporation,Visco Corporation (株式会社ビスコ) is a Japanese sof...


In [None]:
# Columns 1 and 2 of the training frame carry the article text.
train_text = train_data.iloc[:, 1:3]
# Column 0 carries the class label; convert it to a numeric dtype.
train_label = train_data.iloc[:, 0].pipe(pd.to_numeric)

In [None]:
# Columns 1 and 2 of the test frame carry the article text.
test_text = test_data.iloc[:, 1:3]
# Column 0 carries the class label; convert it to a numeric dtype.
test_label = test_data.iloc[:, 0].pipe(pd.to_numeric)

# Pre-processing

In [None]:
from sklearn.feature_extraction.text import CountVectorizer #vectorizer for article text data
from nltk.corpus import stopwords #stopwords for desktop usage
import nltk
nltk.download('stopwords') #download the NLTK stopword list (needed on a fresh runtime such as a Colab notebook)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Word-level count vectorizer: single words only (1-grams), English stopwords removed.
vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    stop_words=stopwords.words('english'),
)

In [None]:
# First column of test_text is the title, second column is the description.
test_title, test_desc = (test_text.iloc[:, position] for position in (0, 1))

In [None]:
# First column of train_text is the title, second column is the description.
train_title, train_desc = (train_text.iloc[:, position] for position in (0, 1))

In [None]:
# Stack the test rows on top of the train rows so that a single vectorization
# pass covers both sets. The order matters: test rows come first.
titles = pd.concat((test_title, train_title), axis=0)
descs = pd.concat((test_desc, train_desc), axis=0)

In [None]:
def _vocabulary_of(fitted_vectorizer):
    """Return the feature names of a fitted vectorizer as a NumPy array.

    Prefers get_feature_names_out() and falls back to get_feature_names(),
    so the cell runs on scikit-learn versions that provide only one of the two.
    """
    getter = getattr(fitted_vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = fitted_vectorizer.get_feature_names
    return np.asarray(getter())

# NOTE: `titles` and `descs` are rebound here from text Series to sparse count
# DataFrames, so re-running this cell requires re-running the concat cell first.
# Vectorize all titles (test rows followed by train rows) into a sparse DataFrame.
titles = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(titles))
# Read the title vocabulary now: the same vectorizer is re-fitted on the descriptions below.
title_feature_names = _vocabulary_of(vectorizer)
# Vectorize all descriptions (test rows followed by train rows) into a sparse DataFrame.
descs = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(descs))
desc_feature_names = _vocabulary_of(vectorizer)

In [None]:
# The test rows were concatenated first, so they are the leading rows of `titles`.
test_title = titles.iloc[:len(test_title.index)]

In [None]:
# Keep only the word columns whose total count over the test titles exceeds 1.
# NOTE(review): the columns are chosen from this frame alone, and the train title
# frame is filtered separately, so the two column sets may differ - confirm intended.
test_title = test_title.loc[:, lambda frame: frame.sum(axis=0) > 1]

In [None]:
#First 5 rows of the test title counts, then the first 5 feature names from the vectorizer
print(test_title.iloc[:5])
print(title_feature_names[:5])

   88      101     177     184     ...  127654  127678  127921  128028
0       0       0       0       0  ...       0       0       0       0
1       0       0       0       0  ...       0       0       0       0
2       0       0       0       0  ...       0       0       0       0
3       0       0       0       0  ...       0       0       0       0
4       0       0       0       0  ...       0       0       0       0

[5 rows x 6054 columns]
['002' '05' '07' '09' '0verflow']


2536

In [None]:
# The test rows were concatenated first, so they are the leading rows of `descs`.
test_desc = descs.iloc[:len(test_desc.index)]

In [None]:
# Keep only the word columns whose total count over the test descriptions exceeds 1.
# NOTE(review): the columns are chosen from this frame alone, and the train description
# frame is filtered separately, so the two column sets may differ - confirm intended.
test_desc = test_desc.loc[:, lambda frame: frame.sum(axis=0) > 1]

In [None]:
#First 5 rows of the test description counts
print(test_desc.iloc[:5])

   0       1       56      70      ...  315478  315615  315709  315723
0       0       0       0       0  ...       0       0       0       0
1       0       0       0       0  ...       0       0       0       0
2       0       0       0       0  ...       0       0       0       0
3       0       0       0       0  ...       0       0       0       0
4       0       0       0       0  ...       0       0       0       0

[5 rows x 34626 columns]


In [None]:
# `titles` holds the test rows first and the train rows after them, so the train
# part is everything past the test rows. (tail(len(test_title.index)) kept only a
# test-sized number of rows from the end, dropping most of the training set.)
train_title = titles.iloc[len(test_title.index):]

In [None]:
# Keep only the word columns whose total count over the train titles exceeds 1.
# NOTE(review): the test title frame is filtered on its own totals, so its column
# set may differ from this one - confirm intended.
train_title = train_title.loc[:, lambda frame: frame.sum(axis=0) > 1]

In [None]:
#First 5 rows of the train title counts
print(train_title.iloc[:5])

        23      37      55      69      ...  127572  127709  128021  128028
200000       0       0       0       0  ...       0       0       0       0
200001       0       0       0       0  ...       0       0       0       0
200002       0       0       0       0  ...       0       0       0       0
200003       0       0       0       0  ...       0       0       0       0
200004       0       0       0       0  ...       0       0       0       0

[5 rows x 5933 columns]


In [None]:
# `descs` holds the test rows first and the train rows after them, so the train
# part is everything past the test rows. (tail(len(test_desc.index)) kept only a
# test-sized number of rows from the end, dropping most of the training set.)
train_desc = descs.iloc[len(test_desc.index):]

In [None]:
# Keep only the word columns whose total count over the train descriptions exceeds 1.
# NOTE(review): the test description frame is filtered on its own totals, so its
# column set may differ from this one - confirm intended.
train_desc = train_desc.loc[:, lambda frame: frame.sum(axis=0) > 1]

In [None]:
#First 5 rows of the train description counts
print(train_desc.iloc[:5])

        0       83      111     132     ...  315478  315556  315560  315718
200000       0       0       0       0  ...       0       0       0       0
200001       0       0       0       0  ...       0       0       0       0
200002       0       0       0       0  ...       0       0       0       0
200003       0       0       0       0  ...       0       0       0       0
200004       0       0       0       0  ...       0       0       0       0

[5 rows x 27315 columns]


In [None]:
from scipy.sparse import csr_matrix #for min-max scaling sparse matrices

In [None]:
def normalize(df): #function for min-max scaling of dataframes
    """Min-max scale every column of ``df`` and return the result as a new frame.

    Each column is mapped to ``(value - min) / (max - min)`` using that column's
    own minimum and maximum. A constant column (max == min) is divided by 1
    instead, so it becomes all zeros rather than dividing by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric columns (dense or sparse).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with every column rescaled; ``df`` is not modified.
    """
    result = df.copy()
    if len(df.index) == 0:
        # No rows: nothing to scale, and min()/max() of an empty matrix may raise.
        return result
    for feature_name in df.columns:
        column = df[feature_name]
        column_matrix = csr_matrix(column)  # built once and reused for both min and max
        min_value = column_matrix.min()
        value_range = column_matrix.max() - min_value
        # Only a zero range needs the guard. The previous max(range, 1) also
        # replaced ranges between 0 and 1, leaving such columns unscaled.
        if value_range == 0:
            value_range = 1
        result[feature_name] = (column - min_value) / value_range
    return result

In [None]:
# Min-max scale the test title counts, using this frame's own per-column min and max.
test_title = test_title.pipe(normalize)

In [None]:
print(test_title.iloc[:5])  # first 5 rows after scaling

   88      101     177     184     ...  127654  127678  127921  128028
0     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
1     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
2     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
3     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
4     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0

[5 rows x 6054 columns]


In [None]:
# Min-max scale the test description counts, using this frame's own per-column min and max.
test_desc = test_desc.pipe(normalize)

In [None]:
print(test_desc.iloc[:5])  # first 5 rows after scaling

   0       1       56      70      ...  315478  315615  315709  315723
0     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
1     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
2     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
3     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
4     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0

[5 rows x 34626 columns]


In [None]:
# Min-max scale the train title counts, using this frame's own per-column min and max.
train_title = train_title.pipe(normalize)

In [None]:
print(train_title.iloc[:5])  # first 5 rows after scaling

        23      37      55      69      ...  127572  127709  128021  128028
200000     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
200001     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
200002     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
200003     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
200004     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0

[5 rows x 5933 columns]


In [None]:
# Min-max scale the train description counts, using this frame's own per-column min and max.
train_desc = train_desc.pipe(normalize)

In [None]:
print(train_desc.iloc[:5])  # first 5 rows after scaling

        0       83      111     132     ...  315478  315556  315560  315718
200000     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
200001     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
200002     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
200003     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0
200004     0.0     0.0     0.0     0.0  ...     0.0     0.0     0.0     0.0

[5 rows x 27315 columns]


In [None]:
# Write the four processed feature tables to disk. to_csv() only creates a file
# when it is given a path; called without one it just returns the CSV text as a
# string, so the earlier path-less calls saved nothing.
# NOTE(review): these frames are wide, so the files may be large - confirm that a
# CSV export is wanted before running on the full data.
test_title.to_csv('test_title.csv', index=False)
test_desc.to_csv('test_desc.csv', index=False)
train_title.to_csv('train_title.csv', index=False)
train_desc.to_csv('train_desc.csv', index=False)

Our **X** will be the sparse matrices and the feature names from the title and the description, and **Y** will be the labels provided in the csv file.

The validation split is taken from the training data later, inside the `model.fit()` call.