In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate


categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'misc.forsale']
stratified_split = StratifiedKFold(n_splits=5, shuffle=True)
data = fetch_20newsgroups(subset='train', categories=categories)
vectorizer = CountVectorizer()
data.data = vectorizer.fit_transform(data.data)
data.data.shape
clf = svm.SVC(kernel="linear",C=2)

print("\033[1mLinear SVM Evaluation (CountVect)\033[0m")

# for each training/testing fold
for train_index, test_index in stratified_split.split(data.data, data.target):
    # train (fit) model
    clf.fit(data.data[train_index], data.target[train_index])
    # predict test labels
    clf.predict(data.data[test_index])
    # score the model (using average accuracy for now)
    accuracy = clf.score(data.data[test_index], data.target[test_index])
    print("Accuracy: {:.3}".format(accuracy))
# predict test labels
hyps = clf.predict(data.data[test_index])
refs = data.target[test_index]
scores = cross_validate(clf, data.data, data.target, cv=stratified_split, scoring=['f1_weighted'])   
report = classification_report(refs, hyps, target_names=data.target_names)
    
print(report)
print(sum(scores['test_f1_weighted'])/len(scores['test_f1_weighted']))

[1mLinear SVM Evaluation (CountVect)[0m
Accuracy: 0.916
Accuracy: 0.918
Accuracy: 0.929
Accuracy: 0.914
Accuracy: 0.914
                    precision    recall  f1-score   support

       alt.atheism       0.88      0.92      0.90        96
     comp.graphics       0.90      0.97      0.93       116
      misc.forsale       0.94      0.93      0.94       117
         sci.space       0.96      0.92      0.94       118
talk.religion.misc       0.86      0.80      0.83        76

          accuracy                           0.91       523
         macro avg       0.91      0.91      0.91       523
      weighted avg       0.91      0.91      0.91       523

0.9204894949594895


In [2]:
vectorizer = CountVectorizer(binary=False)
data = fetch_20newsgroups(subset='train', categories=categories)
data.data = vectorizer.fit_transform(data.data)
data.data.shape
clf = svm.SVC(kernel="linear",C=2)

print("\033[1mLinear SVM Evaluation (CountVect-binary)\033[0m")


# for each training/testing fold
for train_index, test_index in stratified_split.split(data.data, data.target):
    # train (fit) model
    clf.fit(data.data[train_index], data.target[train_index])
    # predict test labels
    clf.predict(data.data[test_index])
    # score the model (using average accuracy for now)
    accuracy = clf.score(data.data[test_index], data.target[test_index])
    print("Accuracy: {:.3}".format(accuracy))
# predict test labels
hyps = clf.predict(data.data[test_index])
refs = data.target[test_index]
scores = cross_validate(clf, data.data, data.target, cv=stratified_split, scoring=['f1_weighted'])   
report = classification_report(refs, hyps, target_names=data.target_names)
    
print(report)
print(sum(scores['test_f1_weighted'])/len(scores['test_f1_weighted']))

[1mLinear SVM Evaluation (CountVect-binary)[0m
Accuracy: 0.912
Accuracy: 0.926
Accuracy: 0.924
Accuracy: 0.908
Accuracy: 0.918
                    precision    recall  f1-score   support

       alt.atheism       0.95      0.94      0.94        96
     comp.graphics       0.85      0.97      0.91       116
      misc.forsale       0.95      0.90      0.92       117
         sci.space       0.95      0.91      0.93       118
talk.religion.misc       0.92      0.86      0.88        76

          accuracy                           0.92       523
         macro avg       0.92      0.91      0.92       523
      weighted avg       0.92      0.92      0.92       523

0.9199204170363247


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
data = fetch_20newsgroups(subset='train', categories=categories)
data.data = vectorizer.fit_transform(data.data)
data.data.shape
clf = svm.SVC(kernel="linear",C=2)

print("\033[1mLinear SVM Evaluation (TFIDF)\033[0m")

# for each training/testing fold
for train_index, test_index in stratified_split.split(data.data, data.target):
    # train (fit) model
    clf.fit(data.data[train_index], data.target[train_index])
    # predict test labels
    clf.predict(data.data[test_index])
    # score the model (using average accuracy for now)
    accuracy = clf.score(data.data[test_index], data.target[test_index])
    print("Accuracy: {:.3}".format(accuracy))
# predict test labels
hyps = clf.predict(data.data[test_index])
refs = data.target[test_index]
scores = cross_validate(clf, data.data, data.target, cv=stratified_split, scoring=['f1_weighted'])   
report = classification_report(refs, hyps, target_names=data.target_names)
    
print(report)
print(sum(scores['test_f1_weighted'])/len(scores['test_f1_weighted']))

[1mLinear SVM Evaluation (TFIDF)[0m
Accuracy: 0.956
Accuracy: 0.971
Accuracy: 0.952
Accuracy: 0.952
Accuracy: 0.95
                    precision    recall  f1-score   support

       alt.atheism       0.95      0.97      0.96        96
     comp.graphics       0.94      0.96      0.95       116
      misc.forsale       0.92      0.97      0.94       117
         sci.space       0.98      0.96      0.97       118
talk.religion.misc       0.97      0.88      0.92        76

          accuracy                           0.95       523
         macro avg       0.95      0.95      0.95       523
      weighted avg       0.95      0.95      0.95       523

0.9536599707687479


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer(min_df=2)

data = fetch_20newsgroups(subset='train', categories=categories)
data.data = vectorizer.fit_transform(data.data)
data.data.shape
clf = svm.SVC(kernel="linear",C=2)


print("\033[1mLinear SVM Evaluation (TFIDF-minCutOff)\033[0m")

# for each training/testing fold
for train_index, test_index in stratified_split.split(data.data, data.target):
    # train (fit) model
    clf.fit(data.data[train_index], data.target[train_index])
    # predict test labels
    clf.predict(data.data[test_index])
    # score the model (using average accuracy for now)
    accuracy = clf.score(data.data[test_index], data.target[test_index])
    print("Accuracy: {:.3}".format(accuracy))
# predict test labels
hyps = clf.predict(data.data[test_index])
refs = data.target[test_index]
scores = cross_validate(clf, data.data, data.target, cv=stratified_split, scoring=['f1_weighted'])   
report = classification_report(refs, hyps, target_names=data.target_names)
    
print(report)
print(sum(scores['test_f1_weighted'])/len(scores['test_f1_weighted']))


[1mLinear SVM Evaluation (TFIDF-minCutOff)[0m
Accuracy: 0.96
Accuracy: 0.958
Accuracy: 0.966
Accuracy: 0.962
Accuracy: 0.96
                    precision    recall  f1-score   support

       alt.atheism       0.98      0.98      0.98        96
     comp.graphics       0.93      0.95      0.94       116
      misc.forsale       0.96      0.93      0.94       117
         sci.space       0.96      0.97      0.97       118
talk.religion.misc       0.99      0.97      0.98        76

          accuracy                           0.96       523
         macro avg       0.96      0.96      0.96       523
      weighted avg       0.96      0.96      0.96       523

0.9610266705367947


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer(max_df=5)

data = fetch_20newsgroups(subset='train', categories=categories)
data.data = vectorizer.fit_transform(data.data)
data.data.shape
clf = svm.SVC(kernel="linear",C=2)

print("\033[1mLinear SVM Evaluation (TFIDF-maxCutOff)\033[0m")


# for each training/testing fold
for train_index, test_index in stratified_split.split(data.data, data.target):
    # train (fit) model
    clf.fit(data.data[train_index], data.target[train_index])
    # predict test labels
    clf.predict(data.data[test_index])
    # score the model (using average accuracy for now)
    accuracy = clf.score(data.data[test_index], data.target[test_index])
    print("Accuracy: {:.3}".format(accuracy))
# predict test labels
hyps = clf.predict(data.data[test_index])
refs = data.target[test_index]
scores = cross_validate(clf, data.data, data.target, cv=stratified_split, scoring=['f1_weighted'])   
report = classification_report(refs, hyps, target_names=data.target_names)
    
print(report)
print(sum(scores['test_f1_weighted'])/len(scores['test_f1_weighted']))

[1mLinear SVM Evaluation (TFIDF-maxCutOff)[0m
Accuracy: 0.8
Accuracy: 0.8
Accuracy: 0.792
Accuracy: 0.782
Accuracy: 0.813
                    precision    recall  f1-score   support

       alt.atheism       0.86      0.82      0.84        96
     comp.graphics       0.87      0.80      0.83       116
      misc.forsale       0.66      0.91      0.76       117
         sci.space       0.88      0.87      0.88       118
talk.religion.misc       0.96      0.58      0.72        76

          accuracy                           0.81       523
         macro avg       0.84      0.80      0.81       523
      weighted avg       0.84      0.81      0.81       523

0.7939525435628617


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words=None)

print("\033[1mLinear SVM Evaluation (TFIDF-WithoutStopWords)\033[0m")


data = fetch_20newsgroups(subset='train', categories=categories)
data.data = vectorizer.fit_transform(data.data)
data.data.shape
clf = svm.SVC(kernel="linear",C=2)

# for each training/testing fold
for train_index, test_index in stratified_split.split(data.data, data.target):
    # train (fit) model
    clf.fit(data.data[train_index], data.target[train_index])
    # predict test labels
    clf.predict(data.data[test_index])
    # score the model (using average accuracy for now)
    accuracy = clf.score(data.data[test_index], data.target[test_index])
    print("Accuracy: {:.3}".format(accuracy))
# predict test labels
hyps = clf.predict(data.data[test_index])
refs = data.target[test_index]
scores = cross_validate(clf, data.data, data.target, cv=stratified_split, scoring=['f1_weighted'])   
report = classification_report(refs, hyps, target_names=data.target_names)
    
print(report)
print(sum(scores['test_f1_weighted'])/len(scores['test_f1_weighted']))

[1mLinear SVM Evaluation (TFIDF-WithoutStopWords)[0m
Accuracy: 0.954
Accuracy: 0.962
Accuracy: 0.969
Accuracy: 0.96
Accuracy: 0.941
                    precision    recall  f1-score   support

       alt.atheism       0.93      0.94      0.93        96
     comp.graphics       0.92      0.93      0.93       116
      misc.forsale       0.94      0.97      0.96       117
         sci.space       0.97      0.93      0.95       118
talk.religion.misc       0.93      0.92      0.93        76

          accuracy                           0.94       523
         macro avg       0.94      0.94      0.94       523
      weighted avg       0.94      0.94      0.94       523

0.9553415583097064


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer



vectorizer = TfidfVectorizer(lowercase=False)

data = fetch_20newsgroups(subset='train', categories=categories)
data.data = vectorizer.fit_transform(data.data)
data.data.shape
clf = svm.SVC(kernel="linear",C=2)

print("\033[1mLinear SVM Evaluation (TFIDF-NoLowerCase)\033[0m")


# for each training/testing fold
for train_index, test_index in stratified_split.split(data.data, data.target):
    # train (fit) model
    clf.fit(data.data[train_index], data.target[train_index])
    # predict test labels
    clf.predict(data.data[test_index])
    # score the model (using average accuracy for now)
    accuracy = clf.score(data.data[test_index], data.target[test_index])
    print("Accuracy: {:.3}".format(accuracy))
# predict test labels
hyps = clf.predict(data.data[test_index])
refs = data.target[test_index]
scores = cross_validate(clf, data.data, data.target, cv=stratified_split, scoring=['f1_weighted'])   
report = classification_report(refs, hyps, target_names=data.target_names)
    
print(report)
print(sum(scores['test_f1_weighted'])/len(scores['test_f1_weighted']))

[1mLinear SVM Evaluation (TFIDF-NoLowerCase)[0m
Accuracy: 0.943
Accuracy: 0.952
Accuracy: 0.95
Accuracy: 0.968
Accuracy: 0.958
                    precision    recall  f1-score   support

       alt.atheism       0.97      0.96      0.96        96
     comp.graphics       0.92      0.98      0.95       116
      misc.forsale       0.97      0.97      0.97       117
         sci.space       0.98      0.95      0.97       118
talk.religion.misc       0.96      0.91      0.93        76

          accuracy                           0.96       523
         macro avg       0.96      0.95      0.96       523
      weighted avg       0.96      0.96      0.96       523

0.9563317654522425
