In [127]:
#importing neccesary files
import nltk
import io
import re
import zipfile
import math
from tqdm import tqdm
import pickle as pkl
import numpy as np
from random import shuffle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from collections import Counter
import string
nltk.download('stopwords')
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/frostrot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Loading dataset

In [36]:
archive = zipfile.ZipFile('../20_newsgroups.zip', 'r')
data = []
error_files = []
file_list = archive.namelist()

for filename in file_list:
    try:
        if str(filename).split('/')[1] in ['comp.graphics','sci.med','talk.politics.misc','rec.sport.hockey','sci.space'] and str(filename).split('/')[2]!='':
            with archive.open(filename,'r') as f:
                name = str(filename).split('/')[1]+"/"+str(filename).split('/')[2]
                textlist = []
                for line in io.TextIOWrapper(f,'latin-1'):
                    textlist.append(line)
                content = " ".join(textlist)
                data.append({'file':name,'content':content})
    except Exception as e:
        print(e)
        error_files.append(str(filename))

print(f"Data collected from {len(data)} files")
print(f"{len(error_files)} files had error")

Data collected from 5000 files
0 files had error


### Filtering

In [25]:
#Remove emojis from the text, if any present

def remove_emoji(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"  
        u"\U0001F680-\U0001F6FF" 
        u"\U0001F1E0-\U0001F1FF"  
        u"\U0001F1F2-\U0001F1F4"  
        u"\U0001F1E6-\U0001F1FF" 
        u"\U0001F600-\U0001F64F"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U0001F1F2"
        u"\U0001F1F4"
        u"\U0001F620"
        u"\u200d"
        u"\u2640-\u2642"
        "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    return text

In [20]:
def stemming(text):
    x = text.split(" ")
    ps = PorterStemmer()
    return " ".join([ps.stem(i) for i in x])

In [23]:
#Remove all the article connector present in the text

def remove_art_connector(text):
  article = ["CAN","IS","HIS","MORE","WHO","ABOUT","THEIR","OUR","HAS","WHO","GET","THEM","WHAT","OUT","FROM","HAVE","HERE","WE","ALL","THERE","TO","ALSO","AND","AS","BUT","YET","YOU","THE","WAS","FOR","ARE","THEY","THIS","THAT","WERE","WITH","YOUR","JUST","WILL","NOT"]
  ans=[]
  for word in text:
    if word.strip() not in article:
      ans.append(word)
  return ans		

In [24]:
#Remove Punctuations from the text

def remove_punc(tokens):
  table = string.punctuation
  ptokens = []
  for w in tokens:
    if w not in table:
      ptokens.append(w)
  ptokens = [s for s in ptokens if s]
  ptokens = [re.sub(r"[\n\t]+"," ",s) for s in ptokens]
  return ptokens

In [21]:
#Remove stopwords, and convert shorted words into there extended forms

def stopword(text):
  EXTENDED_FORMS = {"aren't": 'are not', "can't": 'cannot', "couldn't": 'could not', "didn't": 'did not', "doesn't": 'does not', "don't": 'do not', "hadn't": 'had not', "hasn't": 'has not', "haven't": 'have not', "he'd": 'he would', "he'll": 'he will', "he's": 'he is', "i'd": 'i would', "i'll": 'i will', "i'm": 'i am', "isn't": 'is not', "it's": 'it is', "it'll": 'it will', "i've": 'i have', "let's": 'let us', "mightn't": 'might not', "mustn't": 'must not',"n't": 'not', "shan't": 'shall not', "she'd": 'she would', "she'll": 'she will', "she's": 'she is', "shouldn't": 'should not', "that's": 'that is', "there's": 'there is', "they'd": 'they would', "they'll": 'they will', "they're": 'they are', "they've": 'they have', "we'd": 'we would', "we're": 'we are', "weren't": 'were not', "we've": 'we have', "what'll": 'what will', "what're": 'what are', "what's": 'what is', "what've": 'what have', "where's": 'where is', "who'd": 'who would', "who'll": 'who will', "who're": 'who are', "who's": 'who is', "who've": 'who have', "won't": 'will not', "wouldn't": 'would not', "you'd": 'you would', "you'll": 'you will', "you're": 'you are', "you've": 'you have', "'re": ' are', "wasn't": 'was not', "we'll": 'we will', "'cause": 'because', "could've": 'could have', "how'd": 'how did', "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is', "I'd": 'I would', "I'd've": 'I would have', "I'll": 'I will', "I'll've": 'I will have', "I'm": 'I am', "I've": 'I have', "i'd've": 'i would have', "i'll've": 'i will have', "it'd": 'it would', "it'd've": 'it would have', "it'll've": 'it will have', "ma'am": 'madam', "mayn't": 'may not', "might've": 'might have', "mightn't've": 'might not have', "must've": 'must have', "mustn't've": 'must not have', "needn't": 'need not', "needn't've": 'need not have', "o'clock": 'of the clock', "oughtn't": 'ought not', "oughtn't've": 'ought not have', "sha'n't": 'shall not', "shan't've": 'shall not have', "she'd've": 'she would have', "she'll've": 'she will have', "should've": 'should have', "shouldn't've": 'should not have', "so've": 'so have', "so's": 'so as', "this's": 'this is', "that'd": 'that would', "that'd've": 'that would have', "there'd": 'there would', "there'd've": 'there would have', "here's": 'here is', "they'd've": 'they would have', "they'll've": 'they will have', "to've": 'to have', "we'd've": 'we would have', "we'll've": 'we will have', "what'll've": 'what will have', "when's": 'when is', "when've": 'when have', "where'd": 'where did', "where've": 'where have', "who'll've": 'who will have', "why's": 'why is', "why've": 'why have', "will've": 'will have', "won't've": 'will not have', "would've": 'would have', "wouldn't've": 'would not have', "y'all": 'you all', "y'all'd": 'you all would', "y'all'd've": 'you all would have', "y'all're": 'you all are', "y'all've": 'you all have', "you'd've": 'you would have', "you'll've": 'you will have'}
  x= word_tokenize(text)
  for i in range(len(x)):
    if x[i] in EXTENDED_FORMS:
      x[i] = EXTENDED_FORMS[x[i]]
    if x[i] in stopwords.words('english'):
      x[i]=''
  x=remove_punc(x)
  x=remove_art_connector(x)
  return " ".join(x)

In [40]:
#Filter the parsed text, by, converting them into lowercase, removing any tags, extra spaces.
def filter(item):
  global vocab
  if type(item)==str:
    item=item.lower()
    item=re.sub('[#@]\w+\s*',"",item)
    item=re.sub(r'\\N','',item)
    item=remove_emoji(item)
    item=stopword(item)
    item=stemming(item)
    item = word_tokenize(item)
    item = Counter(item)
  return item

### TF-ICF and Selecting Top-K features

In [122]:
def posting_class(data):
    nested_posting_list = {}

    for files in data:
        for words in files['filtered_content']:
            if words in nested_posting_list:
                if files['file'].split('/')[0] in nested_posting_list[words]:
                    nested_posting_list[words][files['file'].split('/')[0]]+=1
                else:
                    nested_posting_list[words][files['file'].split('/')[0]]=1
            else:
                nested_posting_list[words]={}
                nested_posting_list[words][files['file'].split('/')[0]]=1
    
    return nested_posting_list

In [63]:
def CF(data,posting_list):
    cf = {}
    total_classes = 5

    for word in posting_list:
        cf[word] = math.log(total_classes/len(posting_list[word]))
    return cf

In [133]:
for files in tqdm(data):
    files['filtered_content'] = filter(files['content'])
n = len(data)

for split in [50,70,80]:
    shuffle(data)
    train_data,test_data = data[:int(split*n/100)],data[int(split*n/100):]
    print(len(train_data),len(test_data))
    
    pkl.dump(train_data,open(f'./pickle_files/train_data_{split}.pkl','wb'))
    pkl.dump(test_data,open(f'./pickle_files/test_data_{split}.pkl','wb'))
    

100%|██████████| 5000/5000 [05:52<00:00, 14.18it/s]


2500 2500
3500 1500
4000 1000


### Naive Bayes

In [134]:
class NaiveBayes:
    
    def __init__(self, features, labels):
        self.labelCount = {l: 0 for l in labels}
        self.features = {f: {} for f in range(features)}
        self.labelFeatures = {l : {f : {} for f in range(features)} for l in labels}
    
    def fit(self, X, y):
        for label in y:
            self.labelCount[int(label)] += 1
        
        for x, label in zip(X, y):
            for i, val in enumerate(x):
                self.features[i][val] = self.features[i].get(val, 0) + 1
                self.labelFeatures[label][i][val] = self.labelFeatures[label][i].get(val, 0) + 1
        
        self.totalSamples = len(X)
        
    def predict(self, X):
        y = [0] * len(X)
        
        for i, x in enumerate(X):
#             scores = [0] * len(self.labelCount)
            maxScore = 0
            for l in self.labelCount:
                score = 1
                for j, val in enumerate(x):
                    score *= ((self.labelFeatures[l][j].get(val, 0)/self.labelCount[l]) * (self.labelCount[l]/self.totalSamples) / (self.features[j][val]/self.totalSamples))
                if score > maxScore:
                    maxScore = score
                    y[i] = l
        return y

    def score(self,y,pred_y):
        accuracy = 0
        total = 0
        for i in range(len(y)):
            if y[i]==pred_y[i]:
                accuracy+=1
            total+=1
        return 100*accuracy/total

In [138]:
for split in [50,70,80]:
    print(f"For Split {split}:{100-split} ------------------------------")
    with open(f'./pickle_files/train_data_{split}.pkl','rb') as f:
        train_data = pkl.load(f)

    posting_list = posting_class(train_data)
    cf = CF(train_data,posting_list)

    for word in posting_list:
        for classes in posting_list[word]:
            posting_list[word][classes]*=cf[word]
    
    for word in posting_list:
        if 'talk.politics.misc' not in posting_list[word]:
            posting_list[word]['talk.politics.misc']=-1
        if 'sci.med' not in posting_list[word]:
            posting_list[word]['sci.med']=-1
        if 'sci.space' not in posting_list[word]:
            posting_list[word]['sci.space']=-1
        if 'comp.graphics' not in posting_list[word]:
            posting_list[word]['comp.graphics']=-1
        if 'rec.sport.hockey' not in posting_list[word]:
            posting_list[word]['rec.sport.hockey']=-1

    k=5
    top_k_politics = [x[0] for x in sorted(posting_list.items(),key=lambda x:x[1]['talk.politics.misc'],reverse=True)[:k]]
    top_k_scimed = [x[0] for x in sorted(posting_list.items(),key=lambda x:x[1]['sci.med'],reverse=True)[:k]]
    top_k_scispace = [x[0] for x in sorted(posting_list.items(),key=lambda x:x[1]['sci.space'],reverse=True)[:k]]
    top_k_compgraphics = [x[0] for x in sorted(posting_list.items(),key=lambda x:x[1]['comp.graphics'],reverse=True)[:k]]
    top_k_recsporthockey = [x[0] for x in sorted(posting_list.items(),key=lambda x:x[1]['rec.sport.hockey'],reverse=True)[:k]]

    print("Top K features for each class are ----------------")
    print(top_k_politics)
    print(top_k_scimed)
    print(top_k_scispace)
    print(top_k_compgraphics)
    print(top_k_recsporthockey)
    print("---------------------------------------------------")
    vocab = list(set(top_k_politics+top_k_scimed+top_k_scispace+top_k_compgraphics+top_k_recsporthockey))

    with open(f'./pickle_files/test_data_{split}.pkl','rb') as f:
        test_data = pkl.load(f)
    
    # print(len(train_data),len(test_data))
    train_X = np.zeros((len(train_data),5*k))
    test_X = np.zeros((len(test_data),k*5))

    train_y = np.zeros((len(train_data),))
    test_y = np.zeros((len(test_data),))
    # print(train_y.shape)
    class_map = {'talk.politics.misc':0,'sci.med':1,'sci.space':2,'comp.graphics':3,'rec.sport.hockey':4}

    for i,value in enumerate(train_data):
        for j,voc in enumerate(vocab):
            if voc in value['filtered_content']:
                train_X[i][j]=1
        train_y[i] = class_map[value['file'].split('/')[0]]

    for i,value in enumerate(test_data):
        for j,voc in enumerate(vocab):
            if voc in value['filtered_content']:
                test_X[i][j]=1
        test_y[i] = class_map[value['file'].split('/')[0]]

    train_y = train_y.astype(int)    
    test_y = test_y.astype(int)

    model = NaiveBayes(5*k,[0,1,2,3,4])
    model.fit(train_X,train_y)
    y_pred = model.predict(test_X)
    score = model.score(test_y,y_pred)
    print("The accuracy of the Model is- ", end=" ")
    print(score)

    confusion_matrix = np.zeros((k,k))
    for i in range(len(test_y)):
        confusion_matrix[test_y[i]][y_pred[i]]+=1

    print("Confusion Matrix is- ")
    for value in confusion_matrix:
        print(value)
    print("---------------------------------------------------")
    
    


    
    

For Split 50:50 ------------------------------
Top K features for each class are ----------------
['talk.politics.misc', 'ca.polit', 'cramer', 'soc.men', 'cramer.com']
['sci.m', 'geb.pitt.edu', 'rec.food.cook', 'n3jxp', 'chastiti']
['sci.spac', 'sci.astro', 'henry.toronto.edu', 'prb.digex.com', 'spacecraft']
['comp.graph', 'comp.graphics.anim', 'vga', 'polygon', 'tiff']
['rec.sport.hockey', 'nhl', 'hockey', 'playoff', 'bruin']
---------------------------------------------------
The accuracy of the Model is-  99.56
Confusion Matrix is- 
[503.   0.   1.   0.   0.]
[  2. 488.   0.   0.   0.]
[  1.   1. 479.   0.   0.]
[  2.   3.   1. 521.   0.]
[  0.   0.   0.   0. 498.]
---------------------------------------------------
For Split 70:30 ------------------------------
Top K features for each class are ----------------
['talk.politics.misc', 'ca.polit', 'talk.religion.misc', 'clayton', 'cramer']
['sci.m', 'geb.pitt.edu', 'n3jxp', 'chastiti', 'geb.dsl.pitt.edu']
['sci.spac', 'sci.astro', 's