LOADING DATASET

In [20]:
import pandas as pd

# Build the training split: draw 1367 rows from the combined dataset with a
# fixed seed so the same rows are selected on every run.
combined_csv = r'G:\Diksha\7 sem\Final yearproject\NLP-text-doc-classification\Dataset\combined_final_dataset.csv'
df = pd.read_csv(combined_csv).sample(n=1367, random_state=1)
# Written relative to the current working directory; the row index is not stored.
df.to_csv('combined_final_dataset_train.csv', index=False)


In [21]:
# Build the test split from rows that were NOT drawn for training.
# Sampling the full dataset again with the training seed would re-select rows
# that are already in the training CSV, so the model would be evaluated on
# data it has seen. Remove the training rows first, then sample.
df = pd.read_csv(r'G:\Diksha\7 sem\Final yearproject\NLP-text-doc-classification\Dataset\combined_final_dataset.csv')
# Reproduce the training draw (same n and seed as the training cell) to learn
# which rows it took.
train_index = df.sample(n=1367, random_state=1).index
df = df.drop(index=train_index)
if len(df) < 100:
    raise ValueError(
        f'Only {len(df)} rows are left after removing the training sample; '
        'need at least 100 for the test split.'
    )
df = df.sample(n=100, random_state=1)
# Written relative to the current working directory; the row index is not stored.
df.to_csv('combined_final_dataset_test.csv', index=False)


In [13]:
# Load the two splits back from the CSV files written by the cells above
# (train first, then test).
df_train, df_test = map(
    pd.read_csv,
    ('combined_final_dataset_train.csv', 'combined_final_dataset_test.csv'),
)

CLEANING DATA FOR NECESSARY USE

In [14]:
# Keep only the columns used for modelling; the rest are not needed.
# Note: the result is bound back to df_test, so running this cell a second time
# fails because the columns are already gone.
df_test = df_test.drop(columns=['Title', 'category', 'date'])

In [15]:
# Keep only the columns used for modelling; the rest are not needed.
# Note: the result is bound back to df_train, so running this cell a second time
# fails because the columns are already gone.
df_train = df_train.drop(columns=['Title', 'category', 'date'])

In [16]:
# Integer code assigned to each document category in the main_category column.
encoded_dict = {
    'News': 0,
    'Research Paper': 1,
    'Code': 2,
    'Medical': 3,
    'Legal': 4,
    'Financial documents': 5
}

# Replace the category names with their integer codes in both splits.
for split_name, frame in (('train', df_train), ('test', df_test)):
    codes = frame['main_category'].map(encoded_dict)
    # .map() yields NaN for any value missing from encoded_dict -- including
    # values that were already encoded by an earlier run of this cell. Fail
    # loudly instead of passing NaN on to the one-hot encoding step.
    unmapped = frame.loc[codes.isna(), 'main_category'].unique().tolist()
    if unmapped:
        raise ValueError(
            f'Unmapped main_category values in the {split_name} set: {unmapped}'
        )
    frame['main_category'] = codes


In [17]:
from tensorflow.keras.utils import to_categorical
from keras.utils import to_categorical 




PREPROCESSING THE DATA

In [18]:
# One-hot encode the integer labels of both splits.
# The width is pinned to the number of known categories: left to infer it,
# to_categorical sizes the matrix from the largest label present, so a split
# that happens to contain no sample of the highest class would come out
# narrower than the 6 outputs the model produces.
num_classes = len(encoded_dict)
y_train = to_categorical(df_train.main_category, num_classes=num_classes)
y_test = to_categorical(df_test.main_category, num_classes=num_classes)

In [None]:
import transformers
from transformers import AutoTokenizer,TFBertModel #importing the tokenizer and bert model
# Tokenizer and encoder are loaded from the same pretrained checkpoint so the
# vocabulary matches the model weights.
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert = TFBertModel.from_pretrained(checkpoint)

In [None]:
# Look up the current dtype of the description column (not displayed, because
# it is not the last expression in the cell).
df_train['description'].dtype
# The tokenizer needs plain Python strings, so cast the column in both splits.
for frame in (df_train, df_test):
    frame['description'] = frame['description'].astype(str)


In [None]:
def _tokenize_descriptions(texts, max_length=300):
    """Tokenize description strings into fixed-width TensorFlow tensors.

    Args:
        texts: list of strings to encode.
        max_length: number of tokens every sequence is truncated or padded to.

    Returns:
        The tokenizer output holding 'input_ids' and 'attention_mask'.
    """
    return tokenizer(
        text=texts,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        # Pad every sequence to exactly max_length. padding=True only pads to
        # the longest sequence in the call, so the train and test tensors could
        # end up with different widths, or a width other than the one the model
        # input declares.
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        # The attention mask marks real tokens with 1 and padding with 0.
        return_attention_mask=True,
        verbose=True)


# Tokenize the description column of both splits with identical settings.
x_train = _tokenize_descriptions(df_train.description.tolist())
x_test = _tokenize_descriptions(df_test.description.tolist())

In [None]:
# Pull the two tensors out of the tokenizer output: the token ids and the mask
# that lets the model ignore padded positions.
# Note: input_ids is rebound to a Keras Input in the model-definition cell.
input_ids, attention_mask = x_train['input_ids'], x_train['attention_mask']

In [None]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

MODEL FINE TUNING

In [None]:
# Sequence length the model inputs are declared with; the tokenized tensors fed
# to the model must have this width.
max_len = 300
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# First element of the BERT output: one hidden-state vector per token.
embeddings = bert(input_ids, attention_mask=input_mask)[0]
# Max-pool over the token axis to get a single feature vector per document.
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)
# One output unit per document category.
# NOTE(review): sigmoid yields independent per-class scores that do not sum to
# 1; softmax is the usual choice for single-label classification. Any change
# here must be made together with the loss's from_logits setting.
y = Dense(6, activation='sigmoid')(out)
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# Fine-tune the encoder as well. Set the flag on the layer object itself rather
# than looking it up by position in model.layers, which silently targets a
# different layer if the architecture above is edited.
bert.trainable = True

In [None]:
optimizer = Adam(
    learning_rate=5e-05,  # learning rate for the BERT model, taken from the Hugging Face website
    epsilon=1e-08,
    clipnorm=1.0)  # clip each gradient's norm to 1.0
# Set loss and metrics.
# The model's output layer already applies an activation, so it emits scores in
# [0, 1] rather than raw logits. from_logits=True would make the loss apply a
# softmax on top of those already-squashed values.
loss = CategoricalCrossentropy(from_logits=False)
# An explicit list (the original trailing comma produced an accidental tuple).
# NOTE(review): despite its label this is plain categorical accuracy, not a
# class-balanced accuracy; the label is kept so logged metric names do not change.
metric = [CategoricalAccuracy('balanced_accuracy')]
# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric)

In [None]:
# The model takes two named inputs: the token ids and the attention mask.
train_inputs = {
    'input_ids': x_train['input_ids'],
    'attention_mask': x_train['attention_mask'],
}
# NOTE(review): the test split doubles as the validation set here, so the
# validation numbers are not independent of the final evaluation.
validation_inputs = {
    'input_ids': x_test['input_ids'],
    'attention_mask': x_test['attention_mask'],
}
train_history = model.fit(
    x=train_inputs,
    y=y_train,
    validation_data=(validation_inputs, y_test),
    epochs=1,
    batch_size=36,
)

In [None]:
# Score every test document; each row holds one value per category.
test_inputs = {
    'input_ids': x_test['input_ids'],
    'attention_mask': x_test['attention_mask'],
}
predicted_raw = model.predict(test_inputs)
predicted_raw[0]  # scores for the first test row, shown as the cell output

In [None]:
import numpy as np
# Ground-truth integer labels of the test split.
y_true = df_test['main_category']
# Predicted class of every test row: the index of the highest score in each row.
y_predicted = predicted_raw.argmax(axis=1)

MODEL METRICS

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the test split, printed as plain text.
report = classification_report(y_true, y_predicted)
print(report)

PREDICTION OF CLASS CODE

In [None]:
# Classify a single hand-written example and print a score for every category.
texts = 'Arrhythmia or irregular heartbeat is a condition in which the heart  is unable to pump blood to the body efficiently. Symptoms of arrhythmia include: Fluttering in the chest Pounding heartbeat'
x_val = tokenizer(
    text=texts,
    add_special_tokens=True,
    # Must equal the sequence length the model inputs were declared with
    # (max_len, set in the model-definition cell). The previous value of 70
    # produced tensors that do not match the model's input shape.
    max_length=max_len,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True)
# Scale the scores to percentages for display.
validation = model.predict({'input_ids': x_val['input_ids'], 'attention_mask': x_val['attention_mask']}) * 100
# encoded_dict keys are listed in the same order as their integer codes, so
# they line up with the columns of the prediction.
for key, value in zip(encoded_dict.keys(), validation[0]):
    print(key, value)