## 20_NewsGroups Text Classification

### Loading all the Files

In [1]:
import os
from pprint import pprint

In [2]:
# A list containing all the Categories
cat = ['alt.atheism','comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware','comp.sys.mac.hardware',
       
      'comp.windows.x','misc.forsale','rec.autos','rec.motorcycles','rec.sport.baseball','rec.sport.hockey',
       
      'sci.crypt','sci.electronics','sci.med','sci.space','soc.religion.christian','talk.politics.guns',
       
      'talk.politics.mideast','talk.politics.misc','talk.religion.misc'] 

In [3]:
'''

Splitting all the files into training & testing
For Training 800 Documents from each of the 20 classes are considered.
For Testing the remaining 200 documents from each of the 20 classes are considered.

''' 
current_dir = os.path.join(os.getcwd(),'Dataset')
def get_files(loc):    
    files = []
    # Change the path variable below accordingly
    path = os.path.join(current_dir,loc)

    for r,d,f in os.walk(path):
        for file in f:
            files.append(os.path.join(r, file))
    return files

def get_all_files(cat):
    dict_files_train = {}
    dict_files_test = {}
    for i in cat:
        files = get_files(i)
        dict_files_train[i] = files[:800]
        dict_files_test[i] = files[800:]
        
    return dict_files_train,dict_files_test  

files_train,files_test = get_all_files(cat)

In [4]:
for i in files_train.keys():
    print(i," : ",len(files_train[i]))
    

alt.atheism  :  800
comp.graphics  :  800
comp.os.ms-windows.misc  :  800
comp.sys.ibm.pc.hardware  :  800
comp.sys.mac.hardware  :  800
comp.windows.x  :  800
misc.forsale  :  800
rec.autos  :  800
rec.motorcycles  :  800
rec.sport.baseball  :  800
rec.sport.hockey  :  800
sci.crypt  :  800
sci.electronics  :  800
sci.med  :  800
sci.space  :  800
soc.religion.christian  :  800
talk.politics.guns  :  800
talk.politics.mideast  :  800
talk.politics.misc  :  800
talk.religion.misc  :  800


In [5]:
for i in files_test.keys():
    print(i," : ",len(files_test[i]))

alt.atheism  :  200
comp.graphics  :  200
comp.os.ms-windows.misc  :  200
comp.sys.ibm.pc.hardware  :  200
comp.sys.mac.hardware  :  200
comp.windows.x  :  200
misc.forsale  :  200
rec.autos  :  200
rec.motorcycles  :  200
rec.sport.baseball  :  200
rec.sport.hockey  :  200
sci.crypt  :  200
sci.electronics  :  200
sci.med  :  200
sci.space  :  200
soc.religion.christian  :  197
talk.politics.guns  :  200
talk.politics.mideast  :  200
talk.politics.misc  :  200
talk.religion.misc  :  200


### Preparing Vocabulary

In [6]:
from tqdm import tqdm_notebook
import re
def get_words(f_path):
    '''
     This funtion returns a list of all the words present in the document.
     
     Input : path of file
     Output : list of words 
     
    '''
    f_path = f_path
    with open(f_path,'r',encoding='palmos') as f:
        text = f.read()
    # Replacing Useless Characters with blank spaces    
    text = re.sub('[\$"]','',text)
    
    text = text.replace("\n","")
    text = text.replace("0"," ")
    text = text.replace("1"," ")
    text = text.replace("2"," ")
    text = text.replace("3"," ")
    text = text.replace("4"," ")
    text = text.replace("5"," ")
    text = text.replace("6"," ")
    text = text.replace("7"," ")
    text = text.replace("8"," ")
    text = text.replace("9"," ")
    
    text = text.replace("#"," ")
    text = text.replace("!"," ")
    text = text.replace("`"," ")
    text = text.replace("~"," ")
    text = text.replace("-"," ")
    text = text.replace("/"," ")
    text = text.replace("|"," ")
    text = text.replace(","," ")
    text = text.replace("["," ")
    text = text.replace("]"," ")
    text = text.replace("_"," ")
    text = text.replace("="," ")
    text = text.replace("^"," ")
    text = text.replace("*"," ")
    text = text.replace("+"," ")
    text = text.replace("&"," ")
    text = text.replace(";"," ")
    
    
    text = text.replace("\t","")
    text = text.replace("("," ")
    text = text.replace(")"," ")
    text = text.replace("<"," ")
    text = text.replace(">"," ")
    text = text.replace("?"," ")
    text = text.replace("!"," ")
    text = text.replace(':' ," ")
    text = text.replace('.' ," ")
    
    # Splitting the text by space inorder to get individual word in an array
    words = text.split(" ")
    words = [x.upper() for x in words if x!='']
    
    #Removing Stop Words
    
    from nltk.corpus import stopwords
    
    sw = stopwords.words('english')
    sw = [i.upper() for i in sw]
    words = set(words) - set(sw)
    
    return list(words)

def get_count(words):
    '''
     This function Returns the count of each words present in the document.
     
     Input : list of words
     Output : a dictionary having key as word and value as count of the word
     
    '''
    word_dict = {}
    #Initialising Count of each word to be zero
    for i in set(words):
        word_dict[i] = 0
    for i in words:
        if i in word_dict.keys():
            word_dict[i]+=1
    return word_dict        

def main(files):
    '''
    
      This is the main function that calls all the above functions and builds the Vocabulary.
      
    '''
    all_words = []
    all_words_dict = {}
    
    for i in tqdm_notebook(cat):        
        for j in files[i]:
            words = get_words(j)
            all_words+= words
            
    all_words_dict = get_count(all_words)    
    print("Total Number of Words in Vocabulary: ", len(all_words))    
    return all_words_dict

In [7]:
vocabulary_train = main(files_train)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Total Number of Words in Vocabulary:  2467466


In [8]:
# Considering only those words having count greater than 200
vocab = {}
for i in vocabulary_train:
    if vocabulary_train[i] >= 200:
        vocab[i] = vocabulary_train[i]

In [9]:
vocabulary_sorted_train = [i for i in vocab if len(i)>1]

In [10]:
len(vocabulary_sorted_train)

1697

### Preparing Dataset

In [11]:
import pandas as pd

In [12]:
index_train = range(1,16000+1)
index_test = range(1,3997+1)
# Training Dataset
x_train = pd.DataFrame(index = index_train, columns = vocabulary_sorted_train)
y_train = pd.DataFrame(index = index_train, columns = ['Class'])

# Testing Dataset
x_test = pd.DataFrame(index = index_test, columns = vocabulary_sorted_train)
y_test = pd.DataFrame(index = index_test, columns = ['Class'])

x_train.fillna(0,inplace = True)
x_test.fillna(0,inplace = True)


In [13]:
# Training Dataset

from tqdm import tqdm_notebook
print("Training Data")
f1 = 0
counter1 = 0
for i in files_train:
    f1+=1
    print("Processing Class : ",f1," : ",i)
    classes = i
    file_no = 0
    for file in tqdm_notebook(files_train[classes]):
        file_no+=1
        counter1+=1
        y_train['Class'][counter1] = classes
        words_in_file = get_words(file)
        for word in words_in_file:
            if word in vocabulary_sorted_train:
                x_train[word][counter1]+=1

Training Data
Processing Class :  1  :  alt.atheism


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  2  :  comp.graphics


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  3  :  comp.os.ms-windows.misc


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  4  :  comp.sys.ibm.pc.hardware


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  5  :  comp.sys.mac.hardware


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  6  :  comp.windows.x


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  7  :  misc.forsale


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  8  :  rec.autos


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  9  :  rec.motorcycles


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  10  :  rec.sport.baseball


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  11  :  rec.sport.hockey


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  12  :  sci.crypt


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  13  :  sci.electronics


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  14  :  sci.med


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  15  :  sci.space


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  16  :  soc.religion.christian


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  17  :  talk.politics.guns


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  18  :  talk.politics.mideast


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  19  :  talk.politics.misc


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))


Processing Class :  20  :  talk.religion.misc


HBox(children=(IntProgress(value=0, max=800), HTML(value='')))




In [14]:
# Testing

print("Testing Data")
f2 = 0
counter2 = 0
for i in files_test:
    f2+=1
    print("Processing Class : ",f2," : ",i)
    classes = i
    file_no = 0
    for file in tqdm_notebook(files_test[classes]):
        file_no+=1
        counter2+=1
        y_test['Class'][counter2] = classes
        words_in_file = get_words(file)
        for word in words_in_file:
            if word in vocabulary_sorted_train:
                x_test[word][counter2]+=1


Testing Data
Processing Class :  1  :  alt.atheism


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  2  :  comp.graphics


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  3  :  comp.os.ms-windows.misc


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  4  :  comp.sys.ibm.pc.hardware


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  5  :  comp.sys.mac.hardware


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  6  :  comp.windows.x


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  7  :  misc.forsale


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  8  :  rec.autos


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  9  :  rec.motorcycles


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  10  :  rec.sport.baseball


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  11  :  rec.sport.hockey


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  12  :  sci.crypt


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  13  :  sci.electronics


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  14  :  sci.med


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  15  :  sci.space


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  16  :  soc.religion.christian


HBox(children=(IntProgress(value=0, max=197), HTML(value='')))


Processing Class :  17  :  talk.politics.guns


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  18  :  talk.politics.mideast


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  19  :  talk.politics.misc


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Processing Class :  20  :  talk.religion.misc


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [15]:
# Combining the above Dataframes to get final Training Dataset

dataset_train = pd.concat([x_train,y_train],axis = 1)
dataset_train.columns = list(range(1,len(vocabulary_sorted_train)+1)) + ['Class']

# Combining the above Dataframes to get final Testing Dataset
dataset_test = pd.concat([x_test,y_test],axis = 1)
dataset_test.columns = list(range(1,len(vocabulary_sorted_train)+1)) + ['Class']

In [16]:
# Shuffling the dataset

from sklearn.utils import shuffle
data_train = dataset_train
data_train= shuffle(data_train).reset_index(drop = True)

data_test = dataset_test
data_test = shuffle(data_test).reset_index(drop = True)

In [17]:
# Splitting into X_train,X_test,Y_train,Y_test

X_train = data_train.iloc[:,0:len(vocabulary_sorted_train)]
Y_train = data_train['Class']

X_test = data_test.iloc[:,0:len(vocabulary_sorted_train)]
Y_test = data_test['Class']

## Using Multinomial Naive Bayes Claassifier

In [18]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,Y_train)
y_pred = clf.predict(X_test)

In [20]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

In [28]:
acc = accuracy_score(Y_test,y_pred).round(5)
print('\033[1m' + 'Accuracy: ',acc*100, '%')

[1mAccuracy:  91.093 %


In [22]:
print(classification_report(Y_test,y_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.80      0.84      0.82       200
           comp.graphics       0.84      0.94      0.89       200
 comp.os.ms-windows.misc       0.95      0.96      0.96       200
comp.sys.ibm.pc.hardware       0.94      0.94      0.94       200
   comp.sys.mac.hardware       0.95      0.94      0.94       200
          comp.windows.x       0.95      0.94      0.95       200
            misc.forsale       0.87      0.97      0.92       200
               rec.autos       0.97      0.91      0.94       200
         rec.motorcycles       0.97      0.94      0.95       200
      rec.sport.baseball       0.97      0.96      0.97       200
        rec.sport.hockey       0.97      0.97      0.97       200
               sci.crypt       0.97      0.94      0.96       200
         sci.electronics       0.94      0.94      0.94       200
                 sci.med       0.96      0.94      0.95       200
         

In [25]:
print(confusion_matrix(Y_test,y_pred))

[[169   1   0   0   0   0   0   1   1   0   0   0   0   1   0   1   0   3
    2  21]
 [  0 188   4   1   1   4   0   0   0   0   0   0   0   1   1   0   0   0
    0   0]
 [  0   2 193   0   0   5   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   1   2 189   7   0   0   0   0   0   0   1   0   0   0   0   0   0
    0   0]
 [  0   2   0   8 188   0   1   0   0   0   0   0   0   1   0   0   0   0
    0   0]
 [  0   7   3   0   1 188   0   0   0   0   0   0   0   1   0   0   0   0
    0   0]
 [  0   1   0   1   0   0 195   1   0   0   0   0   0   1   1   0   0   0
    0   0]
 [  0   1   0   0   0   0   8 181   4   1   0   0   4   0   1   0   0   0
    0   0]
 [  0   1   0   0   0   0   8   2 187   0   0   0   0   0   2   0   0   0
    0   0]
 [  0   0   0   0   0   0   0   1   0 193   6   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   0   1   0   4 195   0   0   0   0   0   0   0
    0   0]
 [  0   2   0   0   0   0   2   0   0   0   0 189   3   1   1   0