Dataset source: https://www.kaggle.com/datasets/yufengdev/bbc-fulltext-and-category

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc, classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
import time
from collections import Counter, defaultdict
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline

In [None]:
# Load the preprocessed BBC articles from CSV, report the frame's
# dimensions, and preview the first rows.
bbc_csv_path = "/Users/sharad/Courses/DATA_5600/Data/bbc_preprocessed_data.csv"
bbc = pd.read_csv(bbc_csv_path)
print(bbc.shape)
bbc.head()

In [None]:
# Tally how many rows fall into each value of the `category` column.
Counter(bbc['category'])

In [None]:
# Fraction of all rows represented by 511 of them.
# NOTE(review): 511 is presumably one of the per-category counts shown by
# the Counter cell (a majority-class baseline?) - confirm before relying on it.
511 / bbc.shape[0]

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible. Features are the `text_final` column, targets the
# string labels in `category`.
texts = bbc['text_final']
labels = bbc['category']
Xtrain, Xtest, ytrain_label, ytest_label = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=4,
)

In [None]:
# Encode the string category labels as integers.
#
# The encoder is fit exactly ONCE, on the union of the train and test labels,
# and then used to transform both splits. Re-fitting it on the test labels (as
# this cell previously did) rebuilds the label -> integer mapping from the
# test split alone: if the two splits do not contain exactly the same set of
# classes, the same integer would mean different categories in `ytrain` and
# `ytest`, and the later `encoder.inverse_transform(...)` would decode model
# predictions with the wrong mapping.
#
# Fitting on the full label set only fixes the class vocabulary; it does not
# expose any test features to the model.
encoder = preprocessing.LabelEncoder()
encoder.fit(pd.concat([ytrain_label, ytest_label]))

ytrain = encoder.transform(ytrain_label)
ytest = encoder.transform(ytest_label)

In [None]:
# Build TF-IDF features, keeping at most 5000 vocabulary terms.
#
# The vectorizer is fit on the TRAINING texts only. Fitting it on the whole
# corpus (bbc['text_final']) would let the held-out test documents influence
# both the selected vocabulary and the IDF weights, i.e. leak test
# information into the features and make the test scores optimistic.
tfidf_vect = TfidfVectorizer(max_features=5000)
tfidf_vect.fit(Xtrain)

# Project both splits into the vocabulary learned from the training texts.
Xtrain_tfidf = tfidf_vect.transform(Xtrain)
Xtest_tfidf = tfidf_vect.transform(Xtest)

# Show the learned term -> column-index mapping.
print(tfidf_vect.vocabulary_)

In [None]:
# Print the TF-IDF representation of the training texts
# (the output of tfidf_vect.transform(Xtrain)).
print(Xtrain_tfidf)

## Training a Naive Bayes baseline and an SVC

In [None]:
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

# Baseline classifier: multinomial Naive Bayes trained on the TF-IDF
# features of the training split.
Naive = naive_bayes.MultinomialNB()
Naive.fit(Xtrain_tfidf, ytrain)
# Predict the encoded labels for the held-out test split.
predictions_NB = Naive.predict(Xtest_tfidf)
# accuracy_score expects (y_true, y_pred); pass the ground truth first so this
# call matches the documented signature and the final accuracy cell of this
# notebook. The printed value is the accuracy as a percentage.
print("Naive Bayes Accuracy Score -> ", accuracy_score(ytest, predictions_NB) * 100)

In [None]:
# Quick sanity check: 5-fold cross-validation of a linear-kernel SVC with
# C=0.01 on the training features.
svc = SVC(kernel='linear', C=0.01)
cross_val_score(estimator=svc, X=Xtrain_tfidf, y=ytrain, cv=5)

In [None]:
# Hyper-parameter grid for the SVC search: one sub-grid per kernel
# ('rbf', then 'linear'), each trying C in {1, 10, 100, 1000}.
params_grid = [
    {'kernel': [kernel_name], 'C': [10 ** exponent for exponent in range(4)]}
    for kernel_name in ('rbf', 'linear')
]

In [None]:
# Search `params_grid` for an SVC using 5-fold cross-validation on the
# training split, with 6 parallel jobs and verbose progress output.
svm_model = GridSearchCV(
    estimator=SVC(),
    param_grid=params_grid,
    cv=5,
    n_jobs=6,
    verbose=4,
)
svm_model.fit(Xtrain_tfidf, ytrain)

In [None]:
# Show the cross-validation results stored on the fitted grid search.
svm_model.cv_results_

In [None]:
# Report the best score found by the grid search (fit on the training split).
best_search_score = svm_model.best_score_
print('Best score for training data:', best_search_score)

In [None]:
# Report the hyper-parameters of the winning estimator from the grid search.
best_svc = svm_model.best_estimator_
for caption, setting in (
    ('Best C:', best_svc.C),
    ('Best Kernel:', best_svc.kernel),
    ('Best Gamma:', best_svc.gamma),
):
    print(caption, setting)

In [None]:
# Use the best estimator from the grid search to predict the test split, then
# map the integer predictions back to their string category labels.
final_model = svm_model.best_estimator_
pred = final_model.predict(Xtest_tfidf)
decoded_predictions = encoder.inverse_transform(pred)
pred_label = list(decoded_predictions)
pred_label

In [None]:
# Confusion matrix of true vs. predicted category labels on the test split.
test_confusion = confusion_matrix(y_true=ytest_label, y_pred=pred_label)
print(test_confusion)

In [None]:
# Text report of classification metrics on the test split, comparing the
# true category labels with the predicted ones.
test_report = classification_report(y_true=ytest_label, y_pred=pred_label)
print(test_report)

In [None]:
# Overall accuracy of the tuned SVC on the test split (true labels first).
accuracy_score(y_true=ytest_label, y_pred=pred_label)