In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import string

In [None]:
# Load the Bangla news CSV and clean it in one idempotent chain.
# 'Headline' is dropped *before* the null/duplicate filters: it is never used
# downstream, so a missing headline should not discard an otherwise usable row,
# and rows that differ only by headline must count as duplicates (otherwise the
# same article text can end up in both the train and the test split).
data = (
    pd.read_csv("/kaggle/input/bangla-news-dataset/final_dataset.csv", encoding="utf-8")
    .drop(columns=['Headline'])
    .dropna()
    .drop_duplicates()
)
data.head()

In [None]:
# Fetch the NLTK resources needed for tokenization and stop-word removal.
nltk.download('punkt')
# Newer NLTK releases load the Punkt models from 'punkt_tab' instead of 'punkt';
# without it nltk.word_tokenize raises LookupError. On older releases this call
# only reports that the package is unknown and returns False.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Characters treated as punctuation: ASCII punctuation plus the danda and
# double danda (U+0964, U+0965), which string.punctuation does not contain.
_PUNCTUATION_CHARS = frozenset(string.punctuation + "\u0964\u0965")

# Stop-word sets keyed by NLTK language name. Filled on first use so the corpus
# is read once rather than once per document.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language):
    """Return the cached NLTK stop-word set for ``language``, loading it on first use."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text, stop_words=None, tokenize=None):
    """Tokenize ``text`` and drop punctuation tokens and stop words.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : collection of str, optional
        Words to remove (compared against the lower-cased token). Defaults to
        NLTK's 'bengali' stop-word list.
    tokenize : callable, optional
        Function mapping a string to a list of tokens. Defaults to
        ``nltk.word_tokenize``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    if tokenize is None:
        tokenize = nltk.word_tokenize
    if stop_words is None:
        stop_words = _get_stop_words('bengali')

    kept = [
        token
        for token in tokenize(text)
        # A token is punctuation when *every* character is a punctuation mark.
        # A substring test against string.punctuation would let multi-character
        # tokens such as "``", "''" or "..." through.
        if not all(ch in _PUNCTUATION_CHARS for ch in token)
        and token.lower() not in stop_words
    ]
    return ' '.join(kept)

# Replace each article's text with its cleaned, space-joined token string.
data['News'] = data['News'].map(preprocess_text)

In [None]:
# Features are the article texts; targets are their category labels.
X, y = data['News'], data['Category']

# Hold out 20% of the rows for evaluation. The split is seeded for
# reproducibility and stratified so both parts keep the label proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Token pattern for Bengali text. scikit-learn's default pattern,
# r"(?u)\b\w\w+\b", relies on \w, which does not match Bengali dependent vowel
# signs or the virama (they are combining marks), so words would be cut into
# fragments at those characters. This pattern keeps runs of word characters,
# any code point in the Bengali block (U+0980-U+09FF) and the zero-width
# (non-)joiners used inside conjuncts, with a minimum length of two code points
# as in the default pattern.
BENGALI_TOKEN_PATTERN = r"(?u)[\w\u0980-\u09FF\u200c\u200d]{2,}"

tfidf_vectorizer = TfidfVectorizer(max_features=5000, token_pattern=BENGALI_TOKEN_PATTERN)
# Learn the vocabulary and IDF weights from the training texts only, then reuse
# them unchanged to encode the test texts.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
from sklearn.svm import SVC

# Build an RBF-kernel support vector classifier (C=1.0) and fit it on the
# TF-IDF training matrix in one step; fit() returns the estimator itself.
svm_classifier = SVC(C=1.0, kernel='rbf').fit(X_train_tfidf, y_train)

# Show the fitted estimator as the cell output.
svm_classifier

In [None]:
# Predict a category label for every held-out document.
y_pred = svm_classifier.predict(X=X_test_tfidf)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Per-class precision / recall / F1 on the held-out split. The report is a
# pre-formatted multi-line string, so it is printed rather than displayed.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)