In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
file_path = 'test.xlsx'
data = pd.read_excel(file_path)

# Extract the relevant columns
X = data['Text']  # Features (text data)
y = data['Type']  # Labels (Statement, Question, Answer)

# Preprocess the text data using TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X_tfidf = tfidf.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.3, random_state=42
)

# Train a k-NN classifier with k=3
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict the type on the test set
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=1)  # or zero_division=0


# Output the results with proper alignment
print("Model Evaluation Results".center(50, "="))
print(f"\nAccuracy: {accuracy * 100:.2f}%\n")
print("Classification Report:")
print(report)
print("=" * 50)



Accuracy: 80.00%

Classification Report:
              precision    recall  f1-score   support

      Answer       1.00      1.00      1.00         1
    Question       0.71      1.00      0.83         5
   Statement       1.00      0.50      0.67         4

    accuracy                           0.80        10
   macro avg       0.90      0.83      0.83        10
weighted avg       0.86      0.80      0.78        10



In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset from Excel
file_path = 'test.xlsx'  # Ensure the path is correct
data = pd.read_excel(file_path)

# Extract features and labels
X = data['Text']  # Features (text data)
y = data['Type']  # Labels (Statement, Question, Answer)

# Convert the text data into a TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X_tfidf = tfidf.fit_transform(X).toarray()  # Convert to array for NumPy operations

# Separate the data by class
class1_data = X_tfidf[y == 'Statement']
class2_data = X_tfidf[y == 'Question']

# Calculate the mean (centroid) for each class
centroid1 = class1_data.mean(axis=0)
centroid2 = class2_data.mean(axis=0)

# Calculate the standard deviation (spread) for each class
spread1 = class1_data.std(axis=0)
spread2 = class2_data.std(axis=0)

# Calculate the distance between the centroids of the two classes
interclass_distance = np.linalg.norm(centroid1 - centroid2)

# Output the results
print("Centroid 1 (Statement):", centroid1)
print("Centroid 2 (Question):", centroid2)
print("\nSpread 1 (Statement):", spread1)
print("Spread 2 (Question):", spread2)
print("\nInterclass Distance between Statement and Question:", interclass_distance)


Centroid 1 (Statement): [0.01924543 0.01637735 0.         0.         0.         0.
 0.         0.02077157 0.02077157 0.         0.         0.02077157
 0.         0.         0.01924543 0.         0.06666667 0.
 0.         0.0327547  0.         0.         0.         0.01924543
 0.02077157 0.06666667 0.06666667 0.18982242 0.02152196 0.06666667
 0.04154313 0.         0.         0.         0.         0.03464573
 0.02152196 0.05399586 0.         0.         0.06666667 0.
 0.         0.06666667 0.13333333 0.         0.         0.
 0.         0.10207044 0.        ]
Centroid 2 (Question): [0.06923394 0.07465753 0.         0.         0.05242738 0.02680147
 0.04524976 0.         0.         0.10790465 0.05433211 0.
 0.03465908 0.06685026 0.04688178 0.0469479  0.         0.08773191
 0.03099295 0.13357377 0.         0.0469479  0.07189949 0.04688178
 0.         0.         0.         0.04384574 0.         0.
 0.         0.13591589 0.03099295 0.05433211 0.06119194 0.
 0.         0.1756316  0.05285988 0.