In [13]:
# import required libraries 
import nltk
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('punkt')

import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# create functions for feature engineering of the data for each of the models
def tokenize(text):
    return word_tokenize(text)

def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))
    return([token.lower() for token in text if token not in stop_words])

def stem_words(text):
    stemmer = WordNetLemmatizer()
    return ([stemmer.lemmatize(token) for token in text])

def stem_words_more(text):
    stemmer = PorterStemmer()
    return ([stemmer.stem(token) for token in text])

def remove_puncts(text):
    puncts = "~`!@#€$%^&*()_-+={[}]|\:;'<,>.?/"
    return ([char for char in text if char not in puncts])

def clean_data(input_list):
    return_list = []
    for li in input_list:
        return_list.append(stem_words(remove_puncts(remove_stop_words(tokenize(li)))))
    return return_list

def dummy(doc):
    return doc

## SVM

In [5]:
# import required libraries
from sklearn.svm import LinearSVC

# import data
news = fetch_20newsgroups(subset='all')

# set the x and y values
X,y = news.data, news.target

# create train, test and split for x and y data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=32)

# clean the data
x_train = clean_data(x_train[0:6000])
x_test = clean_data(x_test[0:400])

y_train = y_train[0:6000]
y_test= y_test[0:400]

# feature engineering to compute the importance of each word
tfidf = TfidfVectorizer(analyzer='word', tokenizer=dummy, preprocessor=dummy, token_pattern=None)

# add tfidf to x train and test data
x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)

# create our svm classification model and fit the training data
svm = LinearSVC(C=100, loss='hinge', max_iter=100000)
svm.fit(x_train, y_train)

# svm validation
print(svm.score(x_test,y_test))

y_pred = svm.predict(x_test)

print(classification_report(y_test, y_pred))


cm = pd.DataFrame(confusion_matrix(y_test, y_pred))
cm

0.9
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        25
           1       0.61      0.67      0.64        21
           2       0.86      0.95      0.90        20
           3       0.83      0.71      0.77        14
           4       0.86      0.86      0.86        21
           5       0.95      0.90      0.93        21
           6       0.79      0.92      0.85        12
           7       1.00      1.00      1.00        14
           8       1.00      0.96      0.98        24
           9       0.96      0.92      0.94        24
          10       0.93      1.00      0.96        26
          11       1.00      0.85      0.92        26
          12       0.71      1.00      0.83        12
          13       0.85      0.89      0.87        19
          14       0.95      0.90      0.93        21
          15       0.91      0.91      0.91        22
          16       0.92      0.92      0.92        24
          17       0.94

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,14,1,1,2,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0
2,0,1,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,10,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0
4,0,2,0,0,18,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,1,1,0,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,23,0,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,22,1,0,0,0,0,1,0,0,0,0


## Naive Bayes

In [6]:
# import required libraries
from sklearn.naive_bayes import MultinomialNB

# import data
news = fetch_20newsgroups(subset='all')

# set x and y values
X,y = news.data, news.target

# creat train, test and split for x and y data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=32)

# clean training and test data
x_train = clean_data(x_train[0:6000])
x_test = clean_data(x_test[0:200])

y_train = y_train[0:6000]
y_test= y_test[0:200]


# feature engineering to compute the importance of each word
tfidf = TfidfVectorizer(analyzer='word', tokenizer=dummy, preprocessor=dummy, token_pattern=None)

# apply tfidf to data
x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)

# create Naive Bayes model and fit the data
nb_clf =  MultinomialNB(alpha=0.005)
nb_clf.fit(x_train, y_train)

# Naive Bayes vaidation
print(nb_clf.score(x_test, y_test))
y_pred = nb_clf.predict(x_test)

print(classification_report(y_test, y_pred))
print(nb_clf.predict(x_test[0]))

cm = pd.DataFrame(confusion_matrix(y_test, y_pred))
cm


0.935
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       0.88      0.64      0.74        11
           2       0.71      0.91      0.80        11
           3       1.00      0.83      0.91         6
           4       1.00      0.91      0.95        11
           5       0.86      1.00      0.92        12
           6       0.88      1.00      0.93         7
           7       1.00      1.00      1.00         5
           8       0.93      1.00      0.96        13
           9       1.00      1.00      1.00        13
          10       1.00      1.00      1.00        14
          11       1.00      0.83      0.91        18
          12       0.80      1.00      0.89         8
          13       1.00      0.91      0.95        11
          14       1.00      1.00      1.00        12
          15       1.00      1.00      1.00        11
          16       1.00      1.00      1.00         7
          17       0.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,7,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,10,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0,0,0,0,0


## Neural Network

In [18]:
# import required libraries
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#import data
news = fetch_20newsgroups(subset='all')

# set our x and y values
X,y = news.data, news.target

# clean train, test and split for x and y data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=32)

# clean the data
x_train = clean_data(x_train[0:6000])
x_test = clean_data(x_test[0:200])

y_train = y_train[0:6000]
y_test= y_test[0:200]

# feature engineering to compute the importance of each word
tfidf = TfidfVectorizer(analyzer='word', tokenizer=dummy, preprocessor=dummy, token_pattern=None)

x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)

# crate neural netowk classifier and fit the training data
nn_class = MLPClassifier(max_iter=1000000)
nn_class.fit(x_train, y_train)

# neural network validation
print(nn_class.score(x_test,y_test))
y_pred = nn_class.predict(x_test)
print(classification_report(y_test, y_pred))

cm = pd.DataFrame(confusion_matrix(y_test, y_pred))
cm

0.925
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       0.90      0.82      0.86        11
           2       0.69      1.00      0.81        11
           3       0.83      0.83      0.83         6
           4       1.00      0.82      0.90        11
           5       1.00      0.92      0.96        12
           6       0.67      0.86      0.75         7
           7       1.00      1.00      1.00         5
           8       1.00      1.00      1.00        13
           9       1.00      1.00      1.00        13
          10       1.00      1.00      1.00        14
          11       1.00      0.83      0.91        18
          12       0.88      0.88      0.88         8
          13       0.91      0.91      0.91        11
          14       1.00      1.00      1.00        12
          15       1.00      0.91      0.95        11
          16       1.00      0.86      0.92         7
          17       0.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,9,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,9,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0,0,0,0,0
