## Importing modules
- - - - - - - 

In [708]:
import nltk
import os, sys
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from collections import Counter
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

In [709]:
num_of_features = 4000                  # global variable - the number of words in the vocabulary (feature set)

## Functions
___________________

### fit(X, Y):

  * **X** - X  training  (numpy array, shape = (16000, 4000))
  * **Y** - Y training (numpy array, shape = (16000, 1))
  * It forms a 2 level dictionary using **X, Y**
  * Level 1:
  >  + **current_class**
  >  + **total_docs** = 16000

  * Level 2:     
          
       for each *current_class*
     
       > + 4000(*num_of_features*) keys, each storing the frequency of that word in feature set
       > + key: **total_class_docs** = 800
       > + key: **total_words** = total words (of the vocabulary) in the *current_class*


   * Returns **result** : the dictionary hence built

In [710]:
def fit(X, Y):
    
    result = {}                                                     # initialise dictionary
    result["total_docs"] = 0                                        # total number of documents in training set = 16000
    
    # getting unique class values
    class_values = set()
    for i in range(1, Y.shape[0]):                                  # Y.shape = (16000, 1)
        class_values.add(Y[i][0])

    for current_class in class_values:
        result[current_class] = {}                                  # level 1: class name
       
        # generating T/F array, if the current_class matches or not
        current_class_rows = []
        for i in Y:
            current_class_rows.append(i[0] == current_class)

        X_current = X[current_class_rows]                           # rows in X, where Y == current_class
        Y_current = Y[current_class_rows]                           # rows in Y, where Y == current_class
        
        X_current = np.array(X_current)                             # convert to numpy arrays
        Y_current = np.array(Y_current)

        # Y_current.shape = (800, ) => 800 rows of each class
        
        result["total_docs"]  = result["total_docs"] + Y_current.shape[0]
        result[current_class]["total_class_docs"] = Y_current.shape[0]
        result[current_class]["total_words"] = 0
        
        for j in range(num_of_features):                                
            result[current_class][j] = X_current[:, j].sum()        # level 2: frequency
            result[current_class]["total_words"] += result[current_class][j]
        
    return result

 ### get_common_words(topX, text)
  >  * **topX** -  number of top words to select
  >  * **text** - 
 > + array of strings
 > + from which top words are to be selected
 >  * returns **common_words**: 
 > + array of tuples : (word, frequency)

In [711]:
def get_common_words(topX, text):
    
    freq = nltk.FreqDist(text)                                      # using nltk.FreqDist() to get [(word, freq)..]
    common_words =  freq.most_common(topX) 
    
    return common_words

### get_data(parent_directory):

  > * **parent_directory** -  parent directory containing the sub directiories and data files
  > * traverses the **files** by using the *path* specified
  > * creates a **text** array of strings having all the words in all the files
  > * the file **content** is word tokenized before adding to **text**

In [714]:
def get_data(parent_directory):
    
    dirs = os.listdir(parent_directory)                       #list of sub directories in parent directory
    text = []                                                 # empty list
    
    for current_dir_index in range(1, len(dirs)+1):           # for each sub-directory in the parent directory
        path = '20_newsgroups\\' + dirs[current_dir_index-1]  # finding the path
        files = os.listdir(path)                              # files inside the subdirectory
        
        for file in files[0: 800]:                            # using first 800 files in each subdirectory for training
            path_of_file_in_directory = path + '\\' + file
            ptr = open(path_of_file_in_directory)             # open() returns a pointer
            content = ptr.read()
            text += word_tokenize(content)                    # word-tokenized content
            
    return text

### get_features(most_common_words)
   > * finds the feature set from **most_common_words**
   > * **most_common_words** :  
    + array of tuples: (word, frequency)  
    + sorted in increasing order of frequency
    > * returns **features** : array of strings

In [715]:
def get_features(most_common_words):
    
    features = []                                                       # initialize empty list
    for word in most_common_words:                            
        features.append(word[0])                                        # append the words
        
    return features

### get_simple_pos(tag):
  > * converts the POS **tag** generated by *pos_tag* into the format used by **lemmatize()**
  > * if else statements used

In [716]:
def get_simple_pos(tag):
    
    if tag.startswith('J'):                                            # adjective
        return wordnet.ADJ
    
    elif tag.startswith('N'):                                          # noun
        return wordnet.NOUN 
    
    elif tag.startswith('V'):                                          # verb
        return wordnet.VERB
    
    elif tag.startswith('R'):                                          # adverb
        return wordnet.ADV
    
    else:
        return wordnet.NOUN

### get_stopwords():
  > * returns a list of **stopwords**
  > * uses the inbuilt *English* stopwords and punctuation marks

In [717]:
def get_stopwords():
    
    stops = stopwords.words('English')
    punctuations = string.punctuation                                  
    stops = stops + list(punctuations)                                 # concatenation of lists
    stops = stops + list("'s")
    
    return stops

### lemmatize_text(words):
  > * **lemmatize**s the words using **WordNetLemmatizer()**
  > * **words** : array of words
  > * returns **output_words** : array of lemmatized strings in lowercase

In [718]:
def lemmatize_text(words):
    
    lemmatizer = WordNetLemmatizer()
    output_words = []
    
    for w in words:                                                   # lemmatizes each word one by one
        if w.lower() not in stops:
            pos = pos_tag([w])                                        # find POS
            
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1])) 
            
            output_words.append(clean_word.lower())                   # append the lemmatized strings in lowercase
            
    return output_words

### makeXY(features, parent_directory, stops)
  * **features** - array of words to be used as *features*
  * **parent_directory** - parent directory containing the sub directiories and data files
  * **stops** - list of stopwords and punctuation
  * uses first **800** files in each subdirectory for training
  * returns data in **X, Y** format
  * **X** 
       > + List of lists (4000 elements in each sublist)
       > + each row represents a document
       > + each column entry  denotes the frequency of that *feature* in that *document*
  * **Y**
       > + List   (of 16000 elements)
       > + each row represents the category of the corresponding *X row*
  * uses **collections.Counter()** to find frequency of each word

In [719]:
def makeXY(features, parent_directory, stops):
    
    dirs = os.listdir(parent_directory)                             # list of subdirectories (=classes)
    X = []                                                          # initialise empty lists
    Y = []
    
    for current_dir_index in range(len(dirs)):                      # loop over all subdirectories (20)
        
        path = '20_newsgroups\\' + dirs[current_dir_index]          # path to get inside subdirectory
        files = os.listdir(path)                                    # list of files in subdirectory
        category = dirs[current_dir_index]                          # subdirectory name = category = class
        
        for file in files[0:800]:                                   # first 800 files for training
            text = []                                             
            cnt = Counter()
            
            path_of_file_in_directory = path + '\\' + file          # path to a file
            ptr = open(path_of_file_in_directory)                   # returns a pointer
            content = ptr.read()                                    # read data from file
            
            text = word_tokenize(content)                           # clean the data
            text = remove_stopwords(text, stops)
            
            for word in text:                                       # store frequency of each word in file
                cnt[word] += 1
            
            row = [0 for i in range(num_of_features)]               # row of 4000 zeroes

            for i in range(num_of_features):                        # for each feature in features 
                if features[i] in cnt:                              # if the feature in in file
                    row[i] = cnt[features[i]]                       # store its frequency at corresponding index

            X.append(row)
            Y.append(category)
        
    return X, Y                                                     # lists returned

### makeXY_test(features, parent_directory, stops):
  > * **features** - array of strings to be used as *features*
  > * **parent_directory** - parent directory containing the sub directiories and data files
  > * **stops** -> a list of *stopwords* and punctuation

In [721]:
def makeXY_test(features, parent_directory, stops):

    dirs = os.listdir(parent_directory)                              # list of subdirectories (=classes)
    X = []                                                           # initialise empty lists
    Y = []
    
    for current_dir_index in range(len(dirs)):                       # loop over all subdirectories (20)
        
        path = '20_newsgroups\\' + dirs[current_dir_index]           # path to get inside subdirectory      
        files = os.listdir(path)                                     # list of files in subdirectory
        category = dirs[current_dir_index]                           # subdirectory name = category = class
        
        for file in files[800:]:                                     # files after index 800 used for testing-200 files
            text = []
            cnt = Counter()
            
            path_of_file_in_directory = path + '\\' + file           # path to a file
            ptr = open(path_of_file_in_directory)                    # returns a pointer 
            content = ptr.read()                                     # read data from file
            
            text = word_tokenize(content)                            # clean the data
            text = remove_stopwords(text, stops)
            
            for word in text:                                        # store frequency of each word in file
                cnt[word] += 1
            
            row = [0 for i in range(num_of_features)]                # row of 4000 zeroes
            
            for i in range(num_of_features):                         # for each feature in features 
                if features[i] in cnt:                               # if the feature is in file
                    row[i] = cnt[features[i]]                        # store its frequency at corresponding index

            X.append(row)
            Y.append(category)
        
    return X, Y                                                      # lists returned

### plot_graph(common_words):
  > * **common_words** - array of tuples after finding the most_common_words
  > * used to find the optimal number of features to choose
  > * plots the graphs between word (x-axis) and its frequency (y-axis)
  
  + **Note** - not used in this project, the number of features best suited were 4000

In [722]:
def plot_graph(common_words):
    
    x = []
    y = []
    for i in common_words:
        x.append(i[0])
        y.append(i[1])
    plt.scatter(x, y)

### def probability(dictionary, x, current_class):
  > * **dictionary** - dictionary built by **fit(X, Y)** 
  > * **x**  
 + row in **X_test** whose probability is to be calculated
 + represents a testing document
  > * **current_class** 
 + 1 class out of 20 classes
 + for which the probability that **x** belongs to **current_class** is calculated
  > * *log*  is used to nullify the effect of very small probabilities getting multiplied resulting in even smaller probabilities
  > * returns **prob** : the calculated probability 
  > * **Naive Bayes**

In [723]:
def probability(dictionary, x, current_class):
    
    # documents of current_class / total documents in training set
    prob = np.log(dictionary[current_class]["total_class_docs"]) - np.log(dictionary["total_docs"])
    
    for feature_index in range(num_of_features):                     # iterate over all indices: range(4000)
        
        x_count = x[feature_index]                                   # count of feature[feature_index] in x
         
        if(x_count!=0):                                              # if the word is present in x
            
            # laplace correction also done
            # count of word in current_class /  total number of words in current_class
            num = dictionary[current_class][feature_index] + 1
            den = dictionary[current_class]["total_words"] + num_of_features
            
            current_p = np.log(num) - np.log(den)                    # take log
            current_p = current_p * x_count                          # multiply with frequency of word in x
            prob = prob + current_p                                  # adding log values [log(a*b) = log(a) + log(b)]

    return prob                                                 

### def predict_single_point(dictionary, x):
  > * **dictionary** - dictionary built by **fit(X, Y)**
  > * **x** - row in **X_test** whose probability is to be calculated;  represents a testing document
  > * out of 20 possible *classes*, finds the **best_class** predicted for **x**, with probability **best_p**
  > * returns **best_class** : best class prediction

In [724]:
def predict_single_point(dictionary, x):
    
    classes = dictionary.keys()                                       # all classes (20)
    best_p = -100000                                                  # initialisation
    best_class = ''
    first_run = True                                                  # for checking
     
    for current_class in classes:                                     # loop over all classes
        
        if current_class == "total_docs":
            continue
        p_current_class = probability(dictionary, x, current_class)   # find prob that x belongs to current_class
        
        if first_run or p_current_class > best_p:                     # update best_p, if higher prob found
            best_p = p_current_class
            best_class = current_class
            
        first_run = False
        
    return best_class                                                 # best predicted class according to probabilites

### def predict(dictionary, X_test):
  > * **dictionary** - dictionary built using **fit(X, Y)**
  > * **X_test** -
  + entire testing data
  + numPy array
  + *row* represents a document
  + *col* represents the feature frequency
  > * returns **Y_pred** : list of predicted classes

In [725]:
def predict(dictionary, X_test):
    
    Y_pred = []                                                      # initialise empty list
    for x in X_test:                                                 # loop over all rows (=documents) in X_test
        x_class = predict_single_point(dictionary, x)
        Y_pred.append(x_class)                                       # append the predicted class for each x
        
    return Y_pred

### remove_stopwords(text, stops):
  > * removes the *stopwords* from a given **text**
  > * **text** - data *list* from which *stopwords* are to be removed
  > * **stops** - list of all stopwords and punctuation marks
  > * returns **clean_text**

In [726]:
def remove_stopwords(text, stops):
    clean_text = [w for w in text if w not in stops]         # contains words which are not in stopwords
    return clean_text

-----------------------------

# Main code

---------------------------

 > reading and cleaning the entire data

In [727]:
parent_directory = '20_newsgroups'                           # contains all subdirectories and data files
text = get_data(parent_directory)                            # array of words(from all data files)

print(len(text))                                             # verification

7110266


In [728]:
stops = get_stopwords()                                      # list of stopwords and punctuation

In [729]:
clean_text = remove_stopwords(text, stops)                   # array of words with stopwords removed

print(len(clean_text))                                       # verification

3830370


In [730]:
text = clean_text                                            # copy the cleaned data back to "text"

In [731]:
clean_text = []                                              # initialize a list for storing lemmatized words 
clean_text = lemmatize_text(text)                            # returns the lemmatized words

In [732]:
len(clean_text)                                              # length reduces further

3555248

> extracting the top 4000 features

In [733]:
most_common_words = get_common_words(num_of_features, clean_text)              # 4000 most common words needed
# tuple returned - most_common_words is a tuple (word, freq)

In [734]:
features = get_features(most_common_words)                   # extraction of features (vocabulary)
print(features)



In [735]:
print(len(features))                                         # verification : num_of_features = 4000

4000


In [736]:
classes = os.listdir(parent_directory)                       # the 20 categories (output)
classes

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

> * getting the cleaned text to **X, Y** format, as used by **sklearn** classifiers
> * generating **X_train, X_test, Y_train, Y_test**

In [737]:
Xtrain, Ytrain = makeXY(features, parent_directory, stops)                     # List of lists for training data

In [738]:
Xtest, Ytest = makeXY_test(features, parent_directory, stops)                  # List of lists for testing data

In [739]:
# converting to NumPy arrays

X_train = np.array(Xtrain)                                   # X_train.shape = (16000, 4000)
Y_train = np.array(Ytrain)                                   # Y_train.shape = (16000, )

In [740]:
X_test = np.array(Xtest)                                     # X_test.shape = (3997, 4000)
Y_test = np.array(Ytest)                                     # Y_test.shape = (3997, )

In [741]:
# reshaping to get 1 column                       

Y_train = Y_train.reshape(-1, 1)                             # Y_train.shape = (16000, 1)
Y_test = Y_test.reshape(-1, 1)                               # Y_test.shape = (3997, 1)

In [742]:
# verification
                  
print(Y_train.shape)
print(Y_test.shape)  
print("___________")

print(X_train.shape)
print(X_test.shape)
print("___________")

print(X_train[1])
print(X_test[1])
print("___________")

print(X_train[1].shape)
print(X_test[1].shape)
print("___________")


(16000, 1)
(3997, 1)
___________
(16000, 4000)
(3997, 4000)
___________
[26 97 91 ...,  0  0  0]
[4 0 0 ..., 0 0 0]
___________
(4000,)
(4000,)
___________


 > using the inbuilt **MultinomialNB()** classifier from **sklearn.naive_bayes**

In [743]:
clf = MultinomialNB()                                        # creation of object

In [744]:
clf.fit(X_train, Y_train)                                    # fit training data

  y = column_or_1d(y, warn=True)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [748]:
Y_train_pred = clf.predict(X_train)                          # predict on training data        
Y_train_pred = Y_train_pred.reshape(-1, 1)

print(Y_train_pred.shape)                                    # verification
print(Y_train.shape)

print(clf.score(X_train, Y_train))                           # score
print(accuracy_score(Y_train, Y_train_pred))

(16000, 1)
(16000, 1)
0.8843125
0.8843125


In [749]:
Y_test_pred = clf.predict(X_test)                            # predict on testing data
Y_test_pred = Y_test_pred.reshape(-1, 1)   

print(Y_test_pred.shape)                                     # verification
print(Y_test.shape)

print(clf.score(X_test, Y_test))                             # score
print(accuracy_score(Y_test, Y_test_pred))

(3997, 1)
(3997, 1)
0.795346509882
0.795346509882


> using the code written on my own

In [750]:
# fitting training data

dictionary = fit(X_train, Y_train)                           # dictionary of frequencies

In [751]:
dictionary["total_docs"]                                     # verification
dictionary[classes[4]]["total_class_docs"]

800

In [752]:
Y_test_pred = predict(dictionary, X_test)                    # predict on testing data

In [753]:
print(classification_report(Y_test, Y_test_pred))            # performance
print(confusion_matrix(Y_test, Y_test_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.70      0.68      0.69       200
           comp.graphics       0.65      0.72      0.69       200
 comp.os.ms-windows.misc       0.84      0.69      0.75       200
comp.sys.ibm.pc.hardware       0.74      0.78      0.76       200
   comp.sys.mac.hardware       0.80      0.81      0.81       200
          comp.windows.x       0.61      0.68      0.64       200
            misc.forsale       0.74      0.89      0.81       200
               rec.autos       0.82      0.92      0.87       200
         rec.motorcycles       0.91      0.87      0.89       200
      rec.sport.baseball       0.88      0.88      0.88       200
        rec.sport.hockey       0.95      0.84      0.90       200
               sci.crypt       0.95      0.88      0.91       200
         sci.electronics       0.81      0.77      0.79       200
                 sci.med       0.98      0.85      0.91       200
         

In [754]:
print(accuracy_score(Y_test_pred, Y_test))                   # score

0.795346509882
