In [None]:
!pip install scikit-learn



In [None]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the two predefined splits of the corpus, train first and then test.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split_name) for split_name in ('train', 'test')
)


In [None]:
# Summarise the corpus: size of each split and the category labels.
for label, value in (
    ("Training set size:", len(newsgroups_train.data)),
    ("Test set size:", len(newsgroups_test.data)),
    ("Number of categories:", len(newsgroups_train.target_names)),
    ("Categories:", newsgroups_train.target_names),
):
    print(label, value)


Training set size: 11314
Test set size: 7532
Number of categories: 20
Categories: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [None]:
# Inspect the type of the object returned by fetch_20newsgroups.
type(newsgroups_train)

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used by this notebook. 'punkt_tab' is requested in
# addition to 'punkt' because recent NLTK releases load the tokenizer tables
# from 'punkt_tab' when word_tokenize is called; without it a fresh kernel
# fails with LookupError. quiet=True keeps the download log out of the output.
for resource in (
    'wordnet',
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(resource, quiet=True)

# English stop words held in a set so membership checks during preprocessing
# are constant-time.
stops_Wrods = set(stopwords.words('english'))
# Single shared lemmatizer instance reused for every document.
lemmitizer = WordNetLemmatizer()

def text_preprocessing(text):
    """Normalise a raw document into a space-separated string of lemmas.

    The text is lower-cased and tokenised once. Stop words and tokens that are
    not purely alphabetic are dropped, and each remaining token is lemmatised
    with the shared WordNet lemmatizer (called without a part-of-speech tag).

    Earlier revisions tokenised and lemmatised every document twice; a single
    pass avoids that duplicated work and avoids lemmatising tokens that are
    discarded anyway.

    Args:
        text: Raw document text.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when no
        token survives the filters).
    """
    tokens = word_tokenize(text.lower())
    lemmas = [
        lemmitizer.lemmatize(token)
        for token in tokens
        # Filter first so only tokens that are kept get lemmatised.
        if token.isalpha() and token not in stops_Wrods
    ]
    return ' '.join(lemmas)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
# Run the cleaning function over every document of the training and test splits.
newsgroups_train_preprocessed = list(map(text_preprocessing, newsgroups_train.data))
newsgroups_test_preprocessed = list(map(text_preprocessing, newsgroups_test.data))

In [None]:
# Create a TF-IDF vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer


# Labels are taken directly from the two dataset objects.
y_train, y_test = newsgroups_train.target, newsgroups_test.target

# Fit the vectorizer on the training text only, then apply the fitted
# vectorizer unchanged to the test text.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(newsgroups_train_preprocessed)
X_test = vectorizer.transform(newsgroups_test_preprocessed)

In [None]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report



In [None]:
from sklearn.metrics import accuracy_score

# Initialize the classifiers
naive_bayes_classifier = MultinomialNB()
svm_classifier = SVC(kernel='linear')
logistic_regression_classifier = LogisticRegression()
# Random forests are randomised; a fixed seed makes the reported accuracy
# reproducible across kernel restarts.
random_forest_classifier = RandomForestClassifier(random_state=42)

# Parallel lists: classifier_names[i] is the display name of classifiers[i].
classifiers = [naive_bayes_classifier, svm_classifier, logistic_regression_classifier, random_forest_classifier]
classifier_names = ['Naive Bayes', 'SVM', 'Logistic Regression', 'Random Forest']

# Fit each model on the TF-IDF training matrix and report its accuracy on the
# test split.
for clf, clf_name in zip(classifiers, classifier_names):
    clf.fit(X_train, y_train)  # Train the classifier
    y_pred = clf.predict(X_test)  # Predict on the test data
    accuracy = accuracy_score(y_test, y_pred)  # Calculate accuracy
    print(f"{clf_name} Accuracy: {accuracy:.4f}")

Naive Bayes Accuracy: 0.8087
SVM Accuracy: 0.8310
Logistic Regression Accuracy: 0.8245
Random Forest Accuracy: 0.7605


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

num_cluster = 20

# Cluster the TF-IDF training matrix and keep the cluster id assigned to each
# document (fit_predict fits the model and returns its labels_).
kmeans = KMeans(n_clusters=num_cluster, random_state=4)
kmeans_labels = kmeans.fit_predict(X_train)

# Score the clustering with the mean silhouette coefficient.
silhouette_avg = silhouette_score(X_train, kmeans_labels)
print(f"Silhouette score for KMeans clustering: {silhouette_avg:.4f}")



Silhouette score for KMeans clustering: 0.0079


In [None]:
# **Feature Extraction using CountVectorizer**

# CountVectorizer is used below but is not imported by any of the import cells
# above; import it here so the cell does not raise NameError on a fresh kernel.
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer (vocabulary capped at 10,000 terms)
count_vectorizer = CountVectorizer(max_features=10000)

# Fit on the training text, then apply the fitted vocabulary to the test text
X_train_count = count_vectorizer.fit_transform(newsgroups_train_preprocessed)
X_test_count = count_vectorizer.transform(newsgroups_test_preprocessed)

# **Model Evaluation using Support Vector Machine (SVM)**

# Initialize the SVM classifier.
# NOTE: this rebinds `svm_classifier`, replacing the instance trained on the
# TF-IDF features earlier in the notebook.
svm_classifier = SVC(kernel='linear')

# Train the classifier
svm_classifier.fit(X_train_count, y_train)

# Predict on the test data
y_pred_count_svm = svm_classifier.predict(X_test_count)

# Calculate accuracy
accuracy_count_svm = accuracy_score(y_test, y_pred_count_svm)

print(f"SVM Accuracy with CountVectorizer: {accuracy_count_svm:.4f}")
# XGBClassifier is imported next to its use in this cell.
from xgboost import XGBClassifier

# Train XGBoost with default settings on the same count features the SVM used,
# then predict the test split (fit returns the fitted estimator).
xgb_classifier = XGBClassifier()
y_pred_count_xgb = xgb_classifier.fit(X_train_count, y_train).predict(X_test_count)

# Share of test documents whose predicted category matches the true one.
accuracy_count_xgb = accuracy_score(y_test, y_pred_count_xgb)

print(f"XGBoost Accuracy with CountVectorizer: {accuracy_count_xgb:.4f}")


SVM Accuracy with CountVectorizer: 0.7321
SVM Accuracy with CountVectorizer: 0.7321
XGBoost Accuracy with CountVectorizer: 0.7623


In [None]:
from sklearn.metrics import silhouette_score

# Calculate and print the silhouette score for KMeans clustering of the
# CountVectorizer features. No SVM is involved in this step; the `_svm`
# suffix on the variable names is historical and kept for compatibility.
# NOTE: fit_predict refits the shared `kmeans` estimator, replacing the state
# it learned from X_train in the earlier clustering cell.
kmeans_labels_svm = kmeans.fit_predict(X_train_count)
silhouette_avg_svm = silhouette_score(X_train_count, kmeans_labels_svm)
print(f"Silhouette score for KMeans clustering (CountVectorizer features): {silhouette_avg_svm:.4f}")



Silhouette score for SVM clustering: 0.5145
