In [3]:
import pandas as pd
import numpy as np

data = pd.read_csv('../Dataset/combined_final_dataset.csv')
data.head(10)

#check unique categories
data['main_category'].unique()


array(['News', 'Research Paper', 'Code', 'Medical', 'Legal',
       'Financial documents'], dtype=object)

In [4]:
data['main_category'].value_counts()


main_category
Legal                  350
Financial documents    350
Medical                349
News                   324
Research Paper          61
Code                    33
Name: count, dtype: int64

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
#import multinomial naive bayes model
from sklearn.naive_bayes import MultinomialNB

# Drop rows with NaN values in the 'description' column
data = data.dropna(subset=['description'])

# Instantiate CountVectorizer
vectorizer = CountVectorizer()

# Transform the 'description' column
X = vectorizer.fit_transform(data['description'])
y = data['main_category']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifier = MultinomialNB()
classifier.fit(X_train, y_train)

accuracy = classifier.score(X_test, y_test)
print("Accuracy:", accuracy)




Accuracy: 0.9554794520547946


In [9]:
#classification report
from sklearn.metrics import classification_report
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))


                     precision    recall  f1-score   support

               Code       1.00      1.00      1.00         6
Financial documents       0.88      0.95      0.92        63
              Legal       0.95      1.00      0.97        74
            Medical       0.99      1.00      0.99        74
               News       1.00      0.83      0.91        59
     Research Paper       1.00      1.00      1.00        16

           accuracy                           0.96       292
          macro avg       0.97      0.96      0.97       292
       weighted avg       0.96      0.96      0.95       292



In [11]:

new_description = ["Loan approved rs 2000 to pay"]
new_description_vectorized = vectorizer.transform(new_description)
prediction = classifier.predict(new_description_vectorized)
print("Prediction:", prediction)



Prediction: ['Financial documents']


In [24]:

new_description = ["The country’s security cabinet officially declared that the country is at war with Palestine. The conflict has led to the death of over 700 Israelis and thousands of others have been injured. Meanwhile, the death toll in Gaza has also risen to 493, suggest reports. US President Joe Biden "]
new_description_vectorized = vectorizer.transform(new_description)
prediction = classifier.predict(new_description_vectorized)
print("Prediction:", prediction)



Prediction: ['News']


In [25]:

new_description = ["Arrhythmia or irregular heartbeat is a condition in which the heart  is unable to pump blood to the body efficiently. Symptoms of arrhythmia include: Fluttering in the chest Pounding heartbeat "]
new_description_vectorized = vectorizer.transform(new_description)
prediction = classifier.predict(new_description_vectorized)
print("Prediction:", prediction)



Prediction: ['Code']


In [15]:

new_description = ["class FactorialCalculator: def calculate_factorial(self, n):  if n == 0:   return 1 else:  result = 1  for i in range(1, n + 1):   result *= i return result"]
new_description_vectorized = vectorizer.transform(new_description)
prediction = classifier.predict(new_description_vectorized)
print("Prediction:", prediction)



Prediction: ['News']
