In [3]:
import numpy as np
# Import text datasets from scikit learn
# Q1a). and Q1b).
from sklearn.datasets import fetch_20newsgroups
# The two newsgroups whose posts are being told apart
categories = ['alt.atheism','talk.religion.misc']
# Both splits are fetched with the same category filter and shuffle seed
fetch_options = dict(categories=categories, shuffle=True, random_state=30027)
data_train = fetch_20newsgroups(subset='train', **fetch_options)
data_test = fetch_20newsgroups(subset='test', **fetch_options)
# Raw documents (.data) and their class labels (.target) for each split
X_train, y_train = data_train.data, data_train.target
X_test, y_test = data_test.data, data_test.target

<class 'list'>


### Q1c
It is not possible to reliably determine the class just by inspecting the raw text, especially by eye.

In [13]:
# Q2a). & Q2b).
from sklearn.feature_extraction.text import CountVectorizer

vectoriser = CountVectorizer()
# The tuple is evaluated left to right, so the vectoriser is fitted on the
# training documents before it is used to encode the test documents.
X_train_cv, X_test_cv = (
    vectoriser.fit_transform(X_train),
    vectoriser.transform(X_test),
)


In [14]:
# Q2c).
# Report rows (documents) and columns (vocabulary size) for the training
# matrix first and the test matrix second.
for counts in (X_train_cv, X_test_cv):
    n_docs, n_words = np.shape(counts)
    print("Documents is: " + str(n_docs) + "\n" + "Number of unique words is: " + str(n_words))

Documents is: 857
Number of unique words is: 18089
Documents is: 570
Number of unique words is: 18089


### Q2d).
No, there are no documents in X_test whose values are all 0.  
A document can only have all-zero values when the vectoriser works from a vocabulary that was fixed in advance
(a prespecified vocabulary, or, as for X_test here, the one already learned from the training data). An all-zero
row then means that every word in the document is new, i.e. none of its words appear in that a-priori dictionary.



In [15]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 10 columns with the highest chi-squared score against the labels
x2 = SelectKBest(chi2, k=10)
# fit_transform() fits the selector and reduces the training matrix in one call
X_train_x2 = x2.fit_transform(X_train_cv, y_train)
# The test matrix is reduced with the selection fitted on the training data
X_test_x2 = x2.transform(X_test_cv)

In [16]:
# Q3a).
for label, selected in (("X_train", X_train_x2), ("X_test", X_test_x2)):
    print("For {}: {}".format(label, np.shape(selected)))

# The number of samples (i.e. documents) in each set is unchanged, but the
# number of features (i.e. the unique words) has been reduced to the top 10
# according to the Chi-squared scoring.

For X_train: (857, 10)
For X_test: (570, 10)


In [17]:
# Q3b).
# Look the vocabulary up once instead of rebuilding it on every loop iteration.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement get_feature_names_out() and fall back only on older versions.
if hasattr(vectoriser, "get_feature_names_out"):
    feature_names = vectoriser.get_feature_names_out()
else:
    feature_names = vectoriser.get_feature_names()

# get_support(indices=True) gives the column indices kept by the selector
for feat_num in x2.get_support(indices=True):
    print(feature_names[feat_num])

# Looks about right since we have selected documents relating to religion, but
# the list also seems to reflect the bias in Chi-squared: besides topical words
# it contains tokens (e.g. names) that presumably appear rarely across the
# sampled documents but almost always with the same class.

atheism
atheist
atheists
brian
caltech
christ
islamic
jesus
keith
ra


In [18]:
# Q3c).
from sklearn.feature_selection import mutual_info_classif

# Keep the 10 columns with the highest mutual information with the labels
mi = SelectKBest(mutual_info_classif, k=10)
mi.fit(X_train_cv,y_train)
X_train_mi = mi.transform(X_train_cv)
X_test_mi = mi.transform(X_test_cv)

# Look the vocabulary up once instead of rebuilding it on every loop iteration.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement get_feature_names_out() and fall back only on older versions.
if hasattr(vectoriser, "get_feature_names_out"):
    feature_names = vectoriser.get_feature_names_out()
else:
    feature_names = vectoriser.get_feature_names()

# Print out top 10 features calculated by MI
for feat_num in mi.get_support(indices=True):
    print(feature_names[feat_num])

# This one would seem more intuitive, as MI is biased towards features that
# appear commonly even though they may not be informative as to which class
# they actually refer to.

allan
atheism
atheists
caltech
cco
it
keith
of
schneider
the


In [51]:
# Q4a). Building a classifier of the text using K-NN, a decision tree and Naive Bayes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

# K neighbours
k = 5
knn = KNeighborsClassifier(n_neighbors=k, algorithm='brute')
knn.fit(X_train_cv, y_train)
print("{}-NN Accuracy = ".format(k) + str(knn.score(X_test_cv, y_test)))

# Decision Tree
# random_state is fixed (same seed as the data shuffle) because the tree
# permutes features while searching for splits, so an unseeded tree can give
# a different accuracy on every run.
dt = DecisionTreeClassifier(max_depth=None, random_state=30027)
dt.fit(X_train_cv, y_train)
print("DT Accuracy = " + str(dt.score(X_test_cv, y_test)))

# Multinomial Naive Bayes with add-one (alpha=1.0) smoothing
nb = MultinomialNB(alpha=1.0)
nb.fit(X_train_cv, y_train)
print("NB Accuracy = " + str(nb.score(X_test_cv, y_test)))

# Clearly Naive Bayes predicts this data the best, which might be due to the inherent fact that frequencies
# and probabilities are a good approach when trying to classify textual information. Decision trees have slightly
# less accuracy, but K-Nearest Neighbours performs very mediocrely regardless of the value of K

5-NN Accuracy = 0.6578947368421053
DT Accuracy = 0.8087719298245614
NB Accuracy = 0.8456140350877193


In [53]:
# Q4b). Using the top 10 features
# X_train_mi / X_test_mi must be the top-10 mutual-information selection from
# the Q3c cell. The Q4c cell reassigns both names with a much larger cutoff, so
# fail loudly rather than silently scoring the wrong feature set when cells
# are re-run out of order.
assert X_train_mi.shape[1] == 10 and X_test_mi.shape[1] == 10, \
    "X_train_mi/X_test_mi are not the top-10 selection - re-run the Q3c cell first"

k = 25
knn = KNeighborsClassifier(n_neighbors=k, algorithm='brute')
knn.fit(X_train_mi, y_train)
print("{}-NN Accuracy = ".format(k) + str(knn.score(X_test_mi, y_test)))

# Decision Tree
# random_state is fixed (same seed as the data shuffle) because the tree
# permutes features while searching for splits, so an unseeded tree can give
# a different accuracy on every run.
dt = DecisionTreeClassifier(max_depth=None, random_state=30027)
dt.fit(X_train_mi, y_train)
print("DT Accuracy = " + str(dt.score(X_test_mi, y_test)))

# Multinomial Naive Bayes with add-one (alpha=1.0) smoothing
nb = MultinomialNB(alpha=1.0)
nb.fit(X_train_mi, y_train)
print("NB Accuracy = " + str(nb.score(X_test_mi, y_test)))

# Here the accuracy drops dramatically, with the effectiveness of the 3 classifiers becoming more
# and more similar to each other

25-NN Accuracy = 0.5491228070175439
DT Accuracy = 0.5877192982456141
NB Accuracy = 0.6140350877192983


In [58]:
# Q4c). Adjusting K best cutoff and checking between MI and Chi-squared
def report_accuracies(X_tr, X_te, y_tr, y_te, k=25):
    """Fit k-NN, a decision tree and multinomial Naive Bayes and print each test accuracy.

    Parameters
    ----------
    X_tr, X_te : feature matrices for the training and test documents.
    y_tr, y_te : class labels matching X_tr and X_te.
    k : number of neighbours used by the k-NN classifier.

    Returns
    -------
    tuple
        The fitted ``(knn, dt, nb)`` estimators, in that order.
    """
    knn = KNeighborsClassifier(n_neighbors=k, algorithm='brute')
    knn.fit(X_tr, y_tr)
    print("{}-NN Accuracy = ".format(k) + str(knn.score(X_te, y_te)))

    # random_state is fixed (same seed as the data shuffle) because the tree
    # permutes features while searching for splits, so an unseeded tree can
    # give a different accuracy on every run.
    dt = DecisionTreeClassifier(max_depth=None, random_state=30027)
    dt.fit(X_tr, y_tr)
    print("DT Accuracy = " + str(dt.score(X_te, y_te)))

    nb = MultinomialNB(alpha=1.0)
    nb.fit(X_tr, y_tr)
    print("NB Accuracy = " + str(nb.score(X_te, y_te)))

    return knn, dt, nb


cutoff = 15000
k = 25

# NOTE: this cell reassigns mi, x2 and the X_*_mi / X_*_x2 matrices, replacing
# the top-10 selections made in the earlier cells with `cutoff`-feature ones.

# For Mutual Information
mi = SelectKBest(mutual_info_classif, k=cutoff)
mi.fit(X_train_cv,y_train)
X_train_mi = mi.transform(X_train_cv)
X_test_mi = mi.transform(X_test_cv)

print("Accuracies for Mutual Information with cutoff = {}".format(cutoff))
knn, dt, nb = report_accuracies(X_train_mi, X_test_mi, y_train, y_test, k=k)

print()

# For Chi-squared
x2 = SelectKBest(chi2, k=cutoff)
x2.fit(X_train_cv,y_train)
X_train_x2 = x2.transform(X_train_cv)
X_test_x2 = x2.transform(X_test_cv)

print("Accuracies for Chi-Squared with cutoff = {}".format(cutoff))
knn, dt, nb = report_accuracies(X_train_x2, X_test_x2, y_train, y_test, k=k)

# No, it is not possible to improve upon the accuracies of all the models. The best accuracy obtained from all
# models will be those that are trained using all of the available features (attributes). Observations show
# that as the cutoff of SelectKBest increases so does the accuracy of all 3 models, with some showing larger
# improvement gains than others.

# Also the choice between Chi-squared and Mutual Information seems to have a negligible influence on the
# accuracy of all 3 models. For KNN, chi-squared seems to give a slightly better accuracy whereas for the
# other 2 models, the choice of scoring function does not seem to alter accuracy all that much, with only
# slight differences

Accuracies for Mutual Information with cutoff = 15000
25-NN Accuracy = 0.656140350877193
DT Accuracy = 0.7982456140350878
NB Accuracy = 0.8456140350877193

Accuracies for Chi-Squared with cutoff = 15000
25-NN Accuracy = 0.6736842105263158
DT Accuracy = 0.7894736842105263
NB Accuracy = 0.843859649122807
