In [1]:
import pandas as pd
import numpy as np
import string
import re
import random
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer


In [2]:
df=pd.read_csv(r"D:\Capstone\Datasets\Data_preparation\preprocessed\Preprocessed_4000_samples.csv")


In [3]:
import ast

df['text_lemmatized'] = df['text_lemmatized'].apply(ast.literal_eval)

# Now you can proceed with concatenating the lists into sentences
df['sentences'] = df['text_lemmatized'].apply(lambda x: ' '.join(x))



In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Count Vectorizer
vect = CountVectorizer()  
vects = vect.fit_transform(df.sentences)
vects.shape

(4072, 31073)

In [5]:
tdm = vects.T

term_document_matrix = pd.DataFrame.sparse.from_spmatrix(tdm, index=vect.get_feature_names_out(), columns=df['docid'].astype(str))

print(term_document_matrix.shape)

(31073, 4072)


## Max Normalization

In [6]:
import numpy as np

max_values = term_document_matrix.max(axis=0)
normalized_term_document_matrix =np.divide(term_document_matrix, max_values)
normalized_term_document_matrix=normalized_term_document_matrix.values


## TruncatedSVD

In [7]:
from sklearn.decomposition import TruncatedSVD
import numpy as np

k = 200 
svd = TruncatedSVD(n_components=k)

U = svd.fit_transform(normalized_term_document_matrix)
S = np.diag(svd.singular_values_)  
VT = svd.components_


S_inv = np.linalg.inv(S)

print("Inverse of S:")
print(S_inv)

Inverse of S:
[[0.01605977 0.         0.         ... 0.         0.         0.        ]
 [0.         0.038631   0.         ... 0.         0.         0.        ]
 [0.         0.         0.0407791  ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.20090246 0.         0.        ]
 [0.         0.         0.         ... 0.         0.20144884 0.        ]
 [0.         0.         0.         ... 0.         0.         0.20252178]]


In [8]:
Xq_prime = normalized_term_document_matrix.T

print("Xq_prime shape: ",Xq_prime.shape)
print("U shape: ",U.shape)
print("S_inv shape: ",S_inv.shape)


Xq_prime shape:  (4072, 31073)
U shape:  (31073, 200)
S_inv shape:  (200, 200)


In [9]:
Dq = Xq_prime.dot(U).dot(S_inv)
Dq.shape

(4072, 200)

In [10]:
a=list(df['encoded_topics'].unique())
len(a)

10

In [11]:
class_mapping = {old_label: new_label for new_label, old_label in enumerate(df['encoded_topics'].unique())}
df['encoded_topics'] = df['encoded_topics'].map(class_mapping)
df['encoded_topics'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64)

In [12]:
reverse_class_mapping = {v: k for k, v in class_mapping.items()}
print("Old Class Categories:")
for encoded_label, old_label in reverse_class_mapping.items():
    print(f"Encoded Label {encoded_label}: Original Label {old_label}")


Old Class Categories:
Encoded Label 0: Original Label 0
Encoded Label 1: Original Label 55
Encoded Label 2: Original Label 4
Encoded Label 3: Original Label 27
Encoded Label 4: Original Label 6
Encoded Label 5: Original Label 13
Encoded Label 6: Original Label 20
Encoded Label 7: Original Label 41
Encoded Label 8: Original Label 69
Encoded Label 9: Original Label 52


## Model

## Cross Validation

In [13]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold

num_folds = 10

# Initialize cross-validation
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize variables to store test accuracies
test_accuracies = []

# Iterate over each fold
for fold, (train_index, test_index) in enumerate(skf.split(Dq, df['encoded_topics'])):
    print(f'Fold {fold + 1}/{num_folds}')

    # Split data into train and test sets for this fold
    X_train, X_test = Dq[train_index], Dq[test_index]
    y_train, y_test = df['encoded_topics'].iloc[train_index], df['encoded_topics'].iloc[test_index]

    num_classes = len(np.unique(np.concatenate((y_train, y_test))))

    y_train_encoded = to_categorical(y_train, num_classes=num_classes)
    y_test_encoded = to_categorical(y_test, num_classes=num_classes)

    X_train_flattened = X_train.reshape((X_train.shape[0], -1))
    X_test_flattened = X_test.reshape((X_test.shape[0], -1))

    # Define the model architecture
    model = Sequential([
        Dense(512, activation='relu', input_shape=(X_train_flattened.shape[1],)),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Define early stopping criteria
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train the model with early stopping
    history = model.fit(X_train_flattened, y_train_encoded, epochs=100, batch_size=64, 
                        validation_split=0.2, callbacks=[early_stopping], verbose=0)

    # Evaluate the model on the test set
    test_loss, test_accuracy = model.evaluate(X_test_flattened, y_test_encoded)
    print(f'Test accuracy for fold {fold + 1}: {test_accuracy*100:.2f}%')

    # Store the test accuracy for this fold
    test_accuracies.append(test_accuracy)

# Calculate and print the average test accuracy across all folds
avg_test_accuracy = np.mean(test_accuracies)
print(f'Average test accuracy: {avg_test_accuracy*100:.2f}%')

Fold 1/10
Test accuracy for fold 1: 79.41%
Fold 2/10
Test accuracy for fold 2: 78.43%
Fold 3/10
Test accuracy for fold 3: 77.89%
Fold 4/10
Test accuracy for fold 4: 80.34%
Fold 5/10
Test accuracy for fold 5: 79.36%
Fold 6/10
Test accuracy for fold 6: 81.82%
Fold 7/10
Test accuracy for fold 7: 82.06%
Fold 8/10
Test accuracy for fold 8: 79.36%
Fold 9/10
Test accuracy for fold 9: 78.13%
Fold 10/10
Test accuracy for fold 10: 81.57%
Average test accuracy: 79.84%


In [14]:
from sklearn.metrics import classification_report
from numpy import argmax, unique
import numpy as np


predictions = model.predict(X_test_flattened)  # Ensure this matches your reshaped test data variable


predictions_int = argmax(predictions, axis=1)


y_test_int = argmax(y_test_encoded, axis=1)


all_classes = unique(np.concatenate((y_test_int, predictions_int)))


target_names = [f'Class {i+1}' for i in all_classes]  


report = classification_report(y_test_int, predictions_int, target_names=target_names)

print(report)


              precision    recall  f1-score   support

     Class 1       0.65      0.65      0.65        23
     Class 2       0.63      0.60      0.62        20
     Class 3       0.82      0.82      0.82        65
     Class 4       0.83      0.89      0.86        44
     Class 5       0.83      0.56      0.67        34
     Class 6       0.78      0.96      0.86        67
     Class 7       0.84      0.70      0.76        23
     Class 8       0.79      0.71      0.75        31
     Class 9       0.87      0.94      0.90        50
    Class 10       0.96      0.90      0.93        50

    accuracy                           0.82       407
   macro avg       0.80      0.77      0.78       407
weighted avg       0.82      0.82      0.81       407



In [15]:
num_folds = 10

# Initialize cross-validation
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize variables to store test accuracies
test_accuracies = []

# Iterate over each fold
for fold, (train_index, test_index) in enumerate(skf.split(Dq, df['encoded_topics'])):
    print(f'Fold {fold + 1}/{num_folds}')

    # Split data into train and test sets for this fold
    X_train, X_test = Dq[train_index], Dq[test_index]
    y_train, y_test = df['encoded_topics'].iloc[train_index], df['encoded_topics'].iloc[test_index]

    num_classes = len(np.unique(np.concatenate((y_train, y_test))))

    y_train_encoded = to_categorical(y_train, num_classes=num_classes)
    y_test_encoded = to_categorical(y_test, num_classes=num_classes)

    X_train_flattened = X_train.reshape((X_train.shape[0], -1))
    X_test_flattened = X_test.reshape((X_test.shape[0], -1))

    # Define the model architecture
    model = Sequential([
        Dense(512, activation='relu', input_shape=(X_train_flattened.shape[1],)),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Define early stopping criteria
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train the model with early stopping
    history = model.fit(X_train_flattened, y_train_encoded, epochs=100, batch_size=64, 
                        validation_split=0.2, callbacks=[early_stopping], verbose=0)

    # Evaluate the model on the test set
    test_loss, test_accuracy = model.evaluate(X_test_flattened, y_test_encoded)
    print(f'Test accuracy for fold {fold + 1}: {test_accuracy*100:.2f}%')

    # Store the test accuracy for this fold
    test_accuracies.append(test_accuracy)

# Calculate and print the average test accuracy across all folds
avg_test_accuracy = np.mean(test_accuracies)
print(f'Average test accuracy: {avg_test_accuracy*100:.2f}%')

Fold 1/10
Test accuracy for fold 1: 79.17%
Fold 2/10
Test accuracy for fold 2: 79.66%
Fold 3/10
Test accuracy for fold 3: 77.89%
Fold 4/10
Test accuracy for fold 4: 78.38%
Fold 5/10
Test accuracy for fold 5: 77.89%
Fold 6/10
Test accuracy for fold 6: 81.08%
Fold 7/10
Test accuracy for fold 7: 82.56%
Fold 8/10
Test accuracy for fold 8: 79.12%
Fold 9/10
Test accuracy for fold 9: 80.59%
Fold 10/10
Test accuracy for fold 10: 81.33%
Average test accuracy: 79.76%


In [16]:
from sklearn.metrics import classification_report
from numpy import argmax, unique
import numpy as np


predictions = model.predict(X_test_flattened)  # Ensure this matches your reshaped test data variable


predictions_int = argmax(predictions, axis=1)


y_test_int = argmax(y_test_encoded, axis=1)


all_classes = unique(np.concatenate((y_test_int, predictions_int)))


target_names = [f'Class {i+1}' for i in all_classes]  


report = classification_report(y_test_int, predictions_int, target_names=target_names)

print(report)


              precision    recall  f1-score   support

     Class 1       0.67      0.61      0.64        23
     Class 2       0.50      0.55      0.52        20
     Class 3       0.93      0.77      0.84        65
     Class 4       0.79      0.95      0.87        44
     Class 5       0.79      0.68      0.73        34
     Class 6       0.78      0.96      0.86        67
     Class 7       0.75      0.65      0.70        23
     Class 8       0.79      0.71      0.75        31
     Class 9       0.87      0.94      0.90        50
    Class 10       0.98      0.86      0.91        50

    accuracy                           0.81       407
   macro avg       0.78      0.77      0.77       407
weighted avg       0.82      0.81      0.81       407

