In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

from sklearn.base import BaseEstimator, TransformerMixin

lemmatizer = WordNetLemmatizer()

stemmer = PorterStemmer()

stops = set(stopwords.words('english'))
stops.add('also')
stops.add("note")
stops.add("notes")
stops.add("instance")
stops.add("example")
stops.add("")


class TextPreProcessor(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def text_normalizer(self, text):
        normalized_text = text.lower()
        normalized_text = re.sub(r"_-\'\.", '', normalized_text)
        normalized_text = re.sub(r"\n,", ' ', normalized_text)
        normalized_text = re.sub(r"[^a-z ]", ' ', normalized_text)
        normalized_text = re.sub(r"[ ]+", ' ', normalized_text)

        return normalized_text

    def tokenize(self, text):  return word_tokenize(text)

    def stem_lemmatize_stopwordremoval(self, tokens):
        newTokens = set(tokens) - stops
#         newTokens = list(map(stemmer.stem, newTokens))
        newTokens = list(map(lemmatizer.lemmatize, newTokens))
        newTokens = [token for token in newTokens if len(token) >= 3]

        return newTokens

    def preProcessIt(self, text):
        normalized_text = self.text_normalizer(text)
        preporcessed_text = self.stem_lemmatize_stopwordremoval(self.tokenize(normalized_text))
        preporcessed_text = " ".join(preporcessed_text)

        return preporcessed_text


In [2]:
df = pd.read_csv(r"data\crime_news.csv")

In [3]:
preprocesser = TextPreProcessor()
df["preprocessed_text"] = df["news_article"].apply(preprocesser.preProcessIt)

In [4]:
df

Unnamed: 0,news_article,label,preprocessed_text
0,POLICE INVESTIGATE ARSON CLAIM IN FIRE THAT TO...,arson,week several gunung people told work white hos...
1,JAPAN: KINKAKUJI TEMPLE HOLDS CEREMONY RECALLI...,arson,kyodo day old july hold monk official charge t...
2,NASI KANDAR RESTAURANT OPERATOR LOSES MPV IN A...,arson,contacted ahmad due people told vehicle yard i...
3,UNEMPLOYED CHANGES PLEA OVER ARSON\nCourt-Arso...,arson,prokhong belonging masquerading renggan foot w...
4,JUVENILES PLEAD NOT GUILTY TO ARSON ATTEMPTS O...,arson,around hall school silaturrahim alleged mansor...
...,...,...,...
2246,Lorry driver arrested over hit-and-run\nSource...,traffic,attending due old run school body killed head ...
2247,Nurse charged with causing death by reckless d...,traffic,passenger however excellent account bearing en...
2248,`Speeding forklift summons not cancelled yet'\...,traffic,wong contacted cancelled police bearing end st...
2249,Driver: I was not speeding\nSource:New Straits...,traffic,end driving emerged stuck overturning several ...


In [5]:
tf = CountVectorizer(max_features=10000)
train_x = tf.fit_transform(df["preprocessed_text"])

In [6]:
train_x

<2251x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 227971 stored elements in Compressed Sparse Row format>

In [13]:
from scipy.sparse import find

print(find(train_x)[0][np.where(find(train_x)[1]==0)])

[132 205 244]


In [14]:

from collections import Counter
import numpy as np
import math
import copy
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.sparse import find
from bisect import bisect_left

class NaiveBayes(BaseEstimator, TransformerMixin): 

    def __init__(self):        
        self._feats_prob = {}
        self._alpha =1E-20

    def _array1D(self, size):       return np.zeros(size, dtype=float) 
    
    def get_features(self):
        return np.array([k for k in self._feats_prob.keys()])
    
    def fit(self, X, y):          
        y_ = np.array(copy.deepcopy(y))
        self._labels = y.unique() # unique labels as an array
        self._labelSize = len(self._labels)
        feats_all = find(X)[1] # representation of all features
        examples_all = find(X)[0]  # representation of all documents 
        feats = set(feats_all)
        examples = set(examples_all)
        exampleSize = len(examples)   
#         labeln = len(self._labels) #size of labels  
        labelExamples = [set()] * self._labelSize
 
        for i in range(self._labelSize):
            labelExamples[i] = [j for j in examples_all if y_[j]==self._labels[i]]
        probs = []    
        for feat in feats:
            prob = self._array1D(self._labelSize)
            for label in range(self._labelSize):
                occurances = len(set(examples_all[np.where(feats_all == feat)]).intersection(labelExamples[label] ))
                 
                prob[label] = (occurances+self._alpha)* len(labelExamples[i])/exampleSize
            probs.append(prob)


        self._feats_prob = dict(zip(feats, probs)) #probability of features assigned to each label
    
    def _pred_oneEx(self,X,x, feats_probs):
        # X: sparse matrix of big test data/new exmaples
        # x: current index
        max_prob = -0.001
        pred_label = -1
        probs = [np.prod([feats_probs.get(feat)[label] for feat in X[x].indices])
                for label in range(len(self._labels))]
        return self._labels[probs.index(max(probs))]
    
    def predict(self,X):
              
        return [self._pred_oneEx(X,x, self._feats_prob) for x in set(find(X)[0])]
    
    def evaluate_chromo(self, X,y, chromo):
        ones = [1]*len(self._labels)
        chromo_feats = self.get_features()[np.where(chromo==1)]
        chromo_feats_prob = copy.deepcopy(self._feats_prob)
        for feat in chromo_feats:
            chromo_feats_prob[feat] = ones
            
        lbls =  [self._pred_oneEx(X,x, chromo_feats_prob) for x in set(find(X)[0])]
        return f1_score(lbls, y, average="macro")
    
    def get_optimal_feat(self, chromo): 
        self._feats_prob = self._feats_prob[np.where(current_solution==1)]
        

In [15]:
naivebayes = NaiveBayes()

In [17]:
train_y = df['label']

In [18]:
naivebayes.fit(train_x, train_y)

In [19]:
lbls = naivebayes.predict(train_x)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [20]:
from sklearn.metrics import f1_score
f1_score(lbls, train_y, average="macro")

0.9526414943497565

In [21]:

import random
import numpy as np
import math

def optimize( x_valid,  y_valid, model, max_temp ,temp_decrement, interval):
    
        '''intialize a solution, the solution will be a binary vector in the form
            10011000011110000001110101111000010111000101...........0110
          the length of the solution will be equivelent to the length of our vocabulary
          1 : means the corresponding word in the vocabolary is selected
          0 : otherwise
        '''
        def initial_solution():
            return np.array([random.randint(0, 1)  for _ in range(len(model.get_features()))])

 
        def create_new_solution(current_solution):
            # SA algo
            new_solution = current_solution.copy()
            sol_length = len(new_solution) # array of 0,1 (chromosomes)
            number_of_neighbours = random.randint(int(sol_length / 5), int(sol_length / 3))
            neighbours = set([random.randint(0, sol_length - 1) for _ in range(number_of_neighbours)])
            for neighbour in neighbours: new_solution[neighbour] = 1 - new_solution[neighbour]
            return np.array(new_solution)
       
        
        def evaluate_new_solution(solution ):
          
            return model.evaluate_chromo(x_valid, y_valid, solution)
            

        current_solution = initial_solution()
        it  = 0
        T = max_temp
        f_initial = evaluate_new_solution( current_solution)
        while (f_initial<1.0 and T>1) :
            
            new_solution = create_new_solution(current_solution)
            
            f_gener = evaluate_new_solution( new_solution)
            if (f_initial > f_gener):
                p = math.exp((f_initial - f_gener) / (abs(T)+1))
                if p < np.random.random(): current_solution = new_solution.copy()
            elif (f_initial < f_gener):
                current_solution = new_solution.copy()
                f_initial = f_gener
                print('f1={} , temperature={}'.format(round(f_gener,5), round(T, 3)))
            if it % interval == 0:
                T = T*temp_decrement
            
            it += 1
        return  current_solution #Chromosomes of optimal solution only

In [22]:
optimal_sol=optimize( train_x,  train_y, naivebayes, 100 ,0.8, 5)

f1=0.95503 , temperature=100
f1=0.9559 , temperature=80.0
f1=0.95631 , temperature=80.0
f1=0.9566 , temperature=64.0
f1=0.95733 , temperature=26.214
f1=0.95883 , temperature=16.777
f1=0.96011 , temperature=13.422


array([0, 0, 0, ..., 0, 1, 0])

In [None]:
# TODO: Translate chromosomes into naive bayes features
naive_bayes.get_optimal_feat(optimal_sol)

In [None]:
# TODO: Split data into train (fit) validation (optimize) test (test) sets
# TODO: Tackling overfitting due to too many iterations 