Import libraries and data loading

In [None]:
!pip install nltk

In [None]:
import os
import pandas as pd
import numpy as np
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [110]:
# Function to load data from multiple folders
def load_data_from_folders(folder_paths):
    data = []
    labels = []
    for folder_path in folder_paths:
        category = os.path.basename(folder_path)  # Extract category name from folder path
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='latin-1') as file:
                text = file.read()
                data.append(text)
                labels.append(category)
    return data, labels

In [111]:
folder_paths = ["bbc/tech", "bbc/sport", "bbc/politics", "bbc/entertainment", "bbc/business"]
data, labels = load_data_from_folders(folder_paths)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
df = pd.DataFrame({'text': data, 'category': labels})

In [112]:
print(df.head())

Unnamed: 0,text,category
0,Ink helps drive democracy in Asia\n\nThe Kyrgy...,tech
1,China net cafe culture crackdown\n\nChinese au...,tech
2,Microsoft seeking spyware trojan\n\nMicrosoft ...,tech
3,Digital guru floats sub-$100 PC\n\nNicholas Ne...,tech
4,Technology gets the creative bug\n\nThe hi-tec...,tech


Preprocessing 

In [113]:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Step 2: Data Preprocessing
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Removing Punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Removing Stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Apply preprocessing to the 'text' column of the DataFrame
df['text'] = df['text'].apply(preprocess_text)

# Display the preprocessed text
print(df.head())


                                                text category
0  ink help drive democraci asia kyrgyz republ sm...     tech
1  china net cafe cultur crackdown chines author ...     tech
2  microsoft seek spywar trojan microsoft investi...     tech
3  digit guru float sub- 100 pc nichola negropont...     tech
4  technolog get creativ bug hi-tech art world ti...     tech


In [133]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

# Step 3: Feature Engineering

# 1. Word Frequency Feature
count_vectorizer = CountVectorizer()
word_freq_features = count_vectorizer.fit_transform(df['text'])

# 2. TF-IDF Feature
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(df['text'])

# 3. N-grams Feature
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
ngram_features = ngram_vectorizer.fit_transform(df['text'])

#Compute Cosine Similarity between documents
cos_sim_features = cosine_similarity(tfidf_features)

# Display the updated feature matrix
print("Word Frequency Feature Matrix Shape:", word_freq_features.shape)
print("TF-IDF Feature Matrix Shape:", tfidf_features.shape)
print("N-grams Feature Matrix Shape:", ngram_features.shape)


Word Frequency Feature Matrix Shape: (2225, 21060)
TF-IDF Feature Matrix Shape: (2225, 21060)
N-grams Feature Matrix Shape: (2225, 339763)


In [134]:
print(cos_sim_features)

[[1.         0.02940013 0.01321327 ... 0.00906117 0.03338321 0.01651119]
 [0.02940013 1.         0.02737986 ... 0.01591812 0.03581663 0.00796929]
 [0.01321327 0.02737986 1.         ... 0.01262462 0.01649719 0.00827267]
 ...
 [0.00906117 0.01591812 0.01262462 ... 1.         0.10009538 0.01908958]
 [0.03338321 0.03581663 0.01649719 ... 0.10009538 1.         0.02502272]
 [0.01651119 0.00796929 0.00827267 ... 0.01908958 0.02502272 1.        ]]


In [135]:
from scipy.sparse import hstack

# 1. Combine Features
# Combine the extracted features into a single feature matrix
# Use hstack() from scipy.sparse to horizontally stack the feature matrices
combined_features = hstack([word_freq_features, tfidf_features, ngram_features, cos_sim_features])

# Check the shape of the combined feature matrix
print("Combined Features Matrix Shape:", combined_features.shape)

Combined Features Matrix Shape: (2225, 384108)


In [146]:
print(combined_features)

  (0, 10082)	25.0
  (0, 9252)	1.0
  (0, 6501)	3.0
  (0, 5870)	2.0
  (0, 2403)	1.0
  (0, 11096)	4.0
  (0, 15868)	4.0
  (0, 17432)	1.0
  (0, 12964)	1.0
  (0, 17921)	1.0
  (0, 7921)	2.0
  (0, 17640)	2.0
  (0, 19869)	12.0
  (0, 10237)	2.0
  (0, 19465)	3.0
  (0, 15529)	3.0
  (0, 5273)	3.0
  (0, 6782)	15.0
  (0, 14216)	3.0
  (0, 15012)	1.0
  (0, 13047)	1.0
  (0, 20208)	1.0
  (0, 13331)	1.0
  (0, 18658)	2.0
  (0, 4239)	4.0
  :	:
  (2224, 384083)	0.03707898166235673
  (2224, 384084)	0.0237400086033047
  (2224, 384085)	0.017983615579618806
  (2224, 384086)	0.022750852156378235
  (2224, 384087)	0.009127948773602065
  (2224, 384088)	0.02782863310988535
  (2224, 384089)	0.027153315759018946
  (2224, 384090)	0.03844869823162835
  (2224, 384091)	0.05445865444742223
  (2224, 384092)	0.011130000010751866
  (2224, 384093)	0.01022090111775944
  (2224, 384094)	0.016418050251233055
  (2224, 384095)	0.012402764882923798
  (2224, 384096)	0.014271460106339163
  (2224, 384097)	0.032760851071717945
  (2224, 38

In [144]:
from sklearn.feature_selection import SelectKBest, chi2

# Perform feature selection using SelectKBest
# Here, we'll select the top 2000 features
k_best_selector = SelectKBest(score_func=chi2, k=3000)
selected_features = k_best_selector.fit_transform(combined_features, y)

# Check the shape of the selected feature matrix
print("Selected Features Matrix Shape:", selected_features.shape)


Selected Features Matrix Shape: (2225, 3000)


In [145]:
X = selected_features  # Selected features matrix
y = df['category']  # Labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Model Selection
classifier = SVC()  # Example classifier, you can use any other classifier

# 3. Model Training
classifier.fit(X_train, y_train)

# 4. Model Evaluation
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9483146067415731


In [147]:
from sklearn.metrics import classification_report

classification_rep = classification_report(y_test, y_pred)
print("Classification Report:")
print(classification_rep)


Classification Report:
               precision    recall  f1-score   support

     business       0.97      0.91      0.94       102
entertainment       1.00      0.92      0.96        76
     politics       0.95      0.94      0.94        78
        sport       0.97      0.98      0.98       104
         tech       0.87      0.99      0.92        85

     accuracy                           0.95       445
    macro avg       0.95      0.95      0.95       445
 weighted avg       0.95      0.95      0.95       445



In [148]:

from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import classification_report

# Assuming you have already trained your classifier
# Let's assume your classifier is named 'classifier'

# Define the number of folds for cross-validation
k_folds = 5

# Define the cross-validation strategy (k-fold)
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Perform cross-validation and get accuracy scores for each fold
cv_scores = cross_val_score(classifier, X_train, y_train, cv=kfold)

# Print the accuracy scores for each fold
print("Cross-Validation Scores:", cv_scores)

# Calculate and print the mean accuracy and standard deviation
print("Mean Accuracy:", np.mean(cv_scores))
print("Standard Deviation of Accuracy:", np.std(cv_scores))

# Generate predictions on the test data using the trained classifier
y_pred = classifier.predict(X_test)

# Generate a classification report
report = classification_report(y_test, y_pred)

# Print the classification report
print("\nClassification Report:")
print(report)


Cross-Validation Scores: [0.95786517 0.94662921 0.95786517 0.95505618 0.96348315]
Mean Accuracy: 0.9561797752808989
Standard Deviation of Accuracy: 0.005504471332097002

Classification Report:
               precision    recall  f1-score   support

     business       0.97      0.91      0.94       102
entertainment       1.00      0.92      0.96        76
     politics       0.95      0.94      0.94        78
        sport       0.97      0.98      0.98       104
         tech       0.87      0.99      0.92        85

     accuracy                           0.95       445
    macro avg       0.95      0.95      0.95       445
 weighted avg       0.95      0.95      0.95       445

