In [1]:
#import below modules
from os import listdir
import numpy as np
import pandas as pd
import string
import codecs

In [2]:
#download and extract the files from given link
import urllib.request
urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/20_newsgroups.tar.gz", "a.tar.gz")
import tarfile
tar = tarfile.open("a.tar.gz")
tar.extractall()
tar.close()

In [3]:
#make a list of all the file path and their corresponding classes
f_paths=[]
i=-1
path="20_newsgroups"
folderlist=listdir(path)

In [4]:
if ".DS_Store" in folderlist:
    folderlist.remove('.DS_Store')
for folder in folderlist:
    i+=1
    filelist=listdir(path+'/'+folder)
    for file in filelist:
        f_paths.append((path+'/'+folder+'/'+file,i))
len(f_paths)

19997

In [5]:
#split the file paths into training and testing data
from sklearn import model_selection
x_train, x_test = model_selection.train_test_split(f_paths)
len(x_train), len(x_test)

(14997, 5000)

In [6]:
#make the list X_train and X_test that contains the file path for training and testing data respectively
X_train=[]
X_test=[]
Y_train=[]
Y_test=[]
for i in range(len(x_train)):
    X_train.append(x_train[i][0])
    Y_train.append(x_train[i][1])
for i in range(len(x_test)):
    X_test.append(x_test[i][0])
    Y_test.append(x_test[i][1])
    
#convert Y_train and Y_test into one dimensional np array
Y_train=(np.array([Y_train])).reshape(-1)
Y_test=(np.array([Y_test])).reshape(-1)

In [7]:
#size of Y_train and Y_test np arrays
Y_train.shape, Y_test.shape

((14997,), (5000,))

In [8]:
#import module to download stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop=set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
#adding all the lists including punctuations to stopwords
stop_words=list(stop)+list(set(string.punctuation))
len(stop_words)

211

In [10]:
#making vocabulary from the X_train file i.e Training Data
vocab={}
count=0
for filename in X_train:
    count+=1
    f = open(filename, 'r', errors='ignore')
    record=f.read()
    words=record.split()
    for word in words:
        if len(word)>2:
            if word.lower() not in stop_words:
                if word.lower() in vocab:
                    vocab[word.lower()]+=1
                else:
                    vocab[word.lower()]=1
    f.close()

In [11]:
#length of the vocabulary
len(vocab)

352801

In [12]:
import operator

In [13]:
#sort the vocabulary on the basis of frequency of the words
sorted_vocab = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)

In [14]:
#make a list feature_names containing top 2000 words with their frequency
feature_names=[]
for i in range (len(sorted_vocab)):
    if(sorted_vocab[2000][1]<=sorted_vocab[i][1]):
        feature_names.append(sorted_vocab[i][0])

In [15]:
#number of features
print(len(feature_names))

2010


In [16]:
#make dataframes df_train and df_test with columns having the feature names
df_train = pd.DataFrame(columns=feature_names)
df_test = pd.DataFrame(columns=feature_names)

In [17]:
count_train, count_test = 0,0

#transform each file in X_train into a row in the dataframe df_train 
#having columns as feature names and values as the frequency of that word
for filename in X_train:
    count_train+=1
    df_train.loc[len(df_train)] = np.zeros(len(feature_names))
    f = open(filename, 'r', errors='ignore')
    record=f.read()
    words=record.split()
    for word in words:
        if word.lower() in df_train.columns:
            df_train[word.lower()][len(df_train)-1]+=1
    f.close()
    
#transform each file in X_test into a row in the dataframe df_train 
#having columns as feature names and values as the frequency of that word    
for filename in X_test:
    count_test+=1
    df_test.loc[len(df_test)] = np.zeros(len(feature_names))
    f = open(filename, 'r', errors='ignore')
    record=f.read()
    words=record.split()
    for word in words:
        if word.lower() in df_test.columns:
            df_test[word.lower()][len(df_test)-1]+=1
    f.close()
    
#print the number files that has been transformed into training and testing data
print(count_train, count_test)

14997 5000


In [19]:
#put the values of df_train and df_test into X_train and X_test
X_train = df_train.values
X_test = df_test.values

# Use the Inbuilt Multinomial classifier from sklearn

In [21]:
#Use MultiNomial classifier from sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
clf=MultinomialNB()
clf.fit(X_train, Y_train)
Y_pred=clf.predict(X_test)
#print classification report
print(classification_report(Y_test, Y_pred))
#print testng score
print("Testing: ", clf.score(X_test, Y_test))

              precision    recall  f1-score   support

           0       0.70      0.74      0.72       268
           1       0.81      0.78      0.80       262
           2       0.88      0.83      0.86       235
           3       0.84      0.85      0.85       245
           4       0.90      0.91      0.90       254
           5       0.91      0.86      0.89       264
           6       0.63      0.92      0.74       225
           7       0.81      0.90      0.85       242
           8       0.83      0.95      0.88       251
           9       0.96      0.95      0.95       272
          10       0.98      0.93      0.95       256
          11       0.95      0.88      0.91       226
          12       0.78      0.87      0.82       242
          13       0.94      0.81      0.87       275
          14       0.93      0.90      0.91       270
          15       0.95      0.98      0.97       249
          16       0.75      0.81      0.78       256
          17       0.96    