In [1]:
import nltk
# Fetch the NLTK resources the preprocessing cells rely on: tokenizer models for
# nltk.word_tokenize, the English stop-word list, and the WordNet data (plus its
# 'omw-1.4' companion package) for WordNetLemmatizer.
# Each call is a no-op when the package is already up to date (see the log below).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# **1. Pre-Processing**
1. Tokenization
2. Stemming/lemmatization
3. BoW/TF-IDF

In [2]:
from nltk.stem import WordNetLemmatizer
import re
import numpy as np

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [3]:
def tokenize_lemmatizor(frame):
    """Tokenize, stop-word filter and lemmatize every document in ``frame``.

    Each document is split with ``nltk.word_tokenize``; tokens found in the
    NLTK English stop-word list are dropped and the remaining tokens are
    lemmatized with ``WordNetLemmatizer`` and re-joined with single spaces.

    The text is NOT lower-cased or stripped of punctuation here, so the
    stop-word comparison is an exact (case-sensitive) string match.

    Parameters
    ----------
    frame : sequence supporting ``len()`` and positional ``.iloc[i]`` access
        The documents, one string per position (assumed to be a pandas
        Series of str -- TODO confirm against callers).

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    # Built once up front: calling stopwords.words() and set() per token makes
    # the filter needlessly slow, and the result never changes inside the loop.
    stop_words = set(stopwords.words('english'))

    lemma_sentences = []
    for i in range(len(frame)):
        words = nltk.word_tokenize(frame.iloc[i])
        lemma_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        lemma_sentences.append(" ".join(lemma_words))

    return lemma_sentences


def re_lemmatizor(frame):
    """Clean, stop-word filter and lemmatize every document in ``frame``.

    Unlike ``tokenize_lemmatizor`` this variant normalises the text first:
    every character outside ``a-z``/``A-Z`` becomes a space, the text is
    lower-cased and then split on whitespace. Tokens found in the NLTK English
    stop-word list are dropped; the rest are lemmatized with
    ``WordNetLemmatizer`` and re-joined with single spaces.

    Parameters
    ----------
    frame : sequence supporting ``len()`` and positional ``.iloc[i]`` access
        The documents, one string per position (assumed to be a pandas
        Series of str -- TODO confirm against callers).

    Returns
    -------
    list of str
        One cleaned, space-joined string of lemmas per input document.
    """
    lemmatizer = WordNetLemmatizer()
    # Built once up front: calling stopwords.words() and set() per token makes
    # the filter needlessly slow, and the result never changes inside the loop.
    stop_words = set(stopwords.words('english'))

    corpus = []
    for i in range(len(frame)):
        # Regex + lower + split stands in for nltk.word_tokenize here.
        review = re.sub('[^a-zA-Z]', ' ', frame.iloc[i])
        review = review.lower()
        review = review.split()

        review = [lemmatizer.lemmatize(word) for word in review if word not in stop_words]
        corpus.append(" ".join(review))

    return corpus

### 3.1. Bag of Words (BoW)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer # bow
def tokens_to_bow(corpus_tokens, tokenizer=1):
    """Pre-process a corpus and encode it as a dense bag-of-words matrix.

    Parameters
    ----------
    corpus_tokens : documents accepted by the lemmatizer helpers
        Raw documents; forwarded unchanged to the chosen pre-processor.
    tokenizer : int, default 1
        ``1`` selects ``tokenize_lemmatizor``; any other value selects
        ``re_lemmatizor``.

    Returns
    -------
    X_bow : numpy.ndarray
        Dense term-count matrix from ``CountVectorizer(max_features=5000)``.
    features : numpy.ndarray
        Vocabulary as returned by ``get_feature_names_out()``.
    """
    preprocess = tokenize_lemmatizor if tokenizer == 1 else re_lemmatizor
    documents = preprocess(corpus_tokens)

    vectorizer = CountVectorizer(max_features=5000)
    X_bow = vectorizer.fit_transform(documents).toarray()
    return X_bow, vectorizer.get_feature_names_out()

### 3.2. TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer # tfidf
def tokens_to_tfidf(corpus_tokens, tokenizer=1):
    """Pre-process a corpus and encode it as a dense TF-IDF matrix.

    Parameters
    ----------
    corpus_tokens : documents accepted by the lemmatizer helpers
        Raw documents; forwarded unchanged to the chosen pre-processor.
    tokenizer : default 1
        Any truthy value selects ``tokenize_lemmatizor``; a falsy value
        selects ``re_lemmatizor``. (``tokens_to_bow`` tests ``== 1`` instead,
        so e.g. ``2`` picks different pre-processors in the two helpers.)

    Returns
    -------
    X_tfidf : numpy.ndarray
        Dense matrix from a default ``TfidfVectorizer``.
    tokens : list of str
        The pre-processed documents, NOT the fitted vocabulary.
        NOTE(review): ``tokens_to_bow`` returns feature names in this
        position; confirm the difference is intentional.
    """
    preprocess = tokenize_lemmatizor if tokenizer else re_lemmatizor
    tokens = preprocess(corpus_tokens)

    vectorizer = TfidfVectorizer()
    X_tfidf = vectorizer.fit_transform(tokens).toarray()
    return X_tfidf, tokens

## **Implementation**

In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
from random import randint, choices, randrange, random, sample, shuffle


from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.naive_bayes import MultinomialNB

from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score

In [7]:
from sklearn.model_selection import train_test_split
def split(df,label):
    """Split features and labels into a fixed 75 % / 25 % train/test partition.

    The split is reproducible: ``random_state`` is pinned to 42.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)`` in that order.
    """
    parts = train_test_split(df, label, test_size=0.25, random_state=42)
    features_train, features_test, labels_train, labels_test = parts
    return features_train, features_test, labels_train, labels_test

In [8]:
# Display name and estimator are declared side by side so the two parallel lists
# consumed by acc_score (`classifiers` and `models`) cannot drift out of step.
_named_models = [
    ('LinearSVM', svm.SVC(kernel='linear')),
    ('RadialSVM', svm.SVC(kernel='rbf')),
    ('Logistic', LogisticRegression(max_iter = 1000)),
    ('RandomForest', RandomForestClassifier(n_estimators=200, random_state=0)),
    ('DecisionTree', DecisionTreeClassifier(random_state=0)),
    ('KNeighbors', KNeighborsClassifier()),
    ('MultinomialNB', MultinomialNB()),
]

classifiers = [name for name, _ in _named_models]
models = [estimator for _, estimator in _named_models]

In [9]:
def acc_score(df,label):
    """Fit every estimator in ``models`` on one hold-out split and rank them.

    The data is partitioned with ``split`` (75/25, fixed seed). Each estimator
    in the module-level ``models`` list is fitted in place on the training
    part and scored by accuracy on the test part. Only ``fit`` is timed.

    Parameters
    ----------
    df : feature matrix accepted by ``train_test_split``
    label : labels aligned with ``df``

    Returns
    -------
    pandas.DataFrame
        Columns ``Classifier`` (names from the module-level ``classifiers``
        list), ``Accuracy`` and ``Exec_Time_secs``, sorted by accuracy in
        descending order with a fresh 0..n-1 index.
    """
    X_train, X_test, Y_train, Y_test = split(df, label)

    accuracies = []
    fit_seconds = []
    for model in models:
        # perf_counter is the clock intended for measuring short intervals;
        # time.time() can be too coarse to resolve very fast fits.
        started = time.perf_counter()
        model.fit(X_train, Y_train)
        fit_seconds.append(time.perf_counter() - started)

        predictions = model.predict(X_test)
        accuracies.append(accuracy_score(Y_test, predictions))

    score_table = pd.DataFrame({
        "Classifier": classifiers,
        "Accuracy": accuracies,
        "Exec_Time_secs": fit_seconds,
    })
    # Chained, non-mutating form of the sort + index reset.
    return score_table.sort_values(by="Accuracy", ascending=False).reset_index(drop=True)

In [10]:
def initial_population_term_selection_idf(idf, idf_threshold):
    """Select the terms whose IDF does not exceed ``idf_threshold``.

    Parameters
    ----------
    idf : iterable of (term, idf_value) pairs, or a mapping
        Anything ``dict()`` accepts, e.g. the list returned by
        ``inverse_document_frequency``.
    idf_threshold : float
        Inclusive upper bound; terms with ``idf_value <= idf_threshold`` are
        kept.

    Returns
    -------
    selected_indexes : list of int
        Position of each kept term in the module-level ``all_terms`` list
        (first occurrence).
    selected_terms : list
        The kept terms, in the iteration order of ``idf``.

    Raises
    ------
    ValueError
        If a kept term is not present in ``all_terms``.
    """
    # Map each term to its first position once, rather than scanning
    # all_terms with list.index() for every selected term.
    term_position = {}
    for position, term in enumerate(all_terms):
        term_position.setdefault(term, position)

    selected_indexes = []
    selected_terms = []
    # The loop variable has its own name so it no longer shadows the `idf` parameter.
    for word, idf_value in dict(idf).items():
        if idf_value <= idf_threshold:
            if word not in term_position:
                raise ValueError(f"{word!r} is not in all_terms")
            selected_terms.append(word)
            selected_indexes.append(term_position[word])
    return selected_indexes, selected_terms

len(initial_population_term_selection_idf(idf, 2.6)[1])
418
len(initial_population_term_selection_idf(idf, 2.7)[1])
651
len(initial_population_term_selection_idf(idf, 2.8)[1])
651
len(initial_population_term_selection_idf(idf, 2.9)[1])
651
len(initial_population_term_selection_idf(idf, 3)[1])
1553

In [11]:
def generate_chromo(selected_indexes, features_count, chromo_size):
    """Build one random binary chromosome.

    ``features_count`` distinct entries are drawn without replacement from
    ``selected_indexes`` (via ``random.sample``); the chromosome has a 1 at
    each drawn position and 0 elsewhere. Drawn values outside
    ``range(chromo_size)`` are silently ignored.

    Parameters
    ----------
    selected_indexes : sequence of int
        Candidate gene positions.
    features_count : int
        Number of positions to draw; ``random.sample`` raises ``ValueError``
        if it exceeds ``len(selected_indexes)``.
    chromo_size : int
        Length of the chromosome.

    Returns
    -------
    numpy.ndarray
        1-D array of 0/1 integers of length ``chromo_size``.
    """
    # A set gives O(1) membership tests; order of the draw is irrelevant for
    # the resulting mask, so no sort is needed.
    chosen = set(sample(selected_indexes, k=features_count))
    return np.array([1 if gene in chosen else 0 for gene in range(chromo_size)])

def generate_population(size, features_count, chromo_size, selected_indexes):
    """Create ``size`` independent random chromosomes with ``generate_chromo``.

    Returns
    -------
    list of numpy.ndarray
        ``size`` binary chromosomes, each of length ``chromo_size``.
    """
    population = []
    for _ in range(size):
        population.append(generate_chromo(selected_indexes, features_count, chromo_size))
    return population

In [12]:
def fitness_score(population):
    """Score every chromosome and return scores and chromosomes best-first.

    For each chromosome the module-level ``logmodel`` is re-fitted on the
    columns of the global ``X_train`` selected by the chromosome's non-zero
    genes, and evaluated by accuracy on the same columns of ``X_test``.

    Parameters
    ----------
    population : sequence of 1-D arrays
        Chromosomes; non-zero entries mark selected feature columns.

    Returns
    -------
    scores : list
        Accuracies in descending order.
    population : list of numpy.ndarray
        The chromosomes reordered to match ``scores``.
    """
    accuracies = []
    for chromosome in population:
        active = np.where(chromosome != 0)[0]
        logmodel.fit(X_train[:, active], Y_train)
        predicted = logmodel.predict(X_test[:, active])
        accuracies.append(accuracy_score(Y_test, predicted))

    accuracies = np.array(accuracies)
    stacked = np.array(population)
    # Ascending argsort read backwards gives the best-first ordering.
    best_first = np.argsort(accuracies)[::-1]
    return list(accuracies[best_first]), list(stacked[best_first, :])

In [13]:
def de_fitness(chromo_set):
    """Return the best accuracy and the chromosome that achieved it.

    Each chromosome is scored like in ``fitness_score``: the module-level
    ``logmodel`` is re-fitted on the columns of the global ``X_train`` picked
    by the chromosome's non-zero genes and its accuracy on ``X_test`` is
    measured.

    Parameters
    ----------
    chromo_set : iterable of 1-D arrays
        Candidate chromosomes.

    Returns
    -------
    max_score : float
        Highest accuracy seen (``0`` for an empty ``chromo_set``).
    best_chromo : numpy.ndarray or None
        The first chromosome reaching ``max_score``; ``None`` only when
        ``chromo_set`` is empty.
    """
    max_score = 0
    best_chromo = None

    for chromo in chromo_set:
        features = np.where(chromo != 0)[0]
        logmodel.fit(X_train[:, features], Y_train)
        predictions = logmodel.predict(X_test[:, features])
        score = accuracy_score(Y_test, predictions)
        # `best_chromo is None` makes sure a candidate is returned even when
        # every score is 0.0; otherwise callers such as de_crossover would
        # receive None and insert it into the population. Ties keep the
        # earlier candidate.
        if best_chromo is None or score > max_score:
            max_score = score
            best_chromo = chromo

    return max_score, best_chromo

In [14]:
def de_crossover(parent1, parent2, probability):
    """Gene-wise crossover followed by greedy selection against ``parent1``.

    For every gene one ``random()`` number is drawn; with chance
    ``probability`` the gene is copied from ``parent1``, otherwise from
    ``parent2``. The mixed vector is binarised (``> 0`` becomes 1) and then
    compared with ``parent1`` through ``de_fitness``; whichever scores higher
    is returned, and on a tie the child wins because it is evaluated first.

    NOTE(review): ``de_mutation`` passes the target as ``parent1`` and the
    trial vector as ``parent2``, so ``probability`` is the chance of KEEPING
    the target's gene -- confirm this is the intended crossover-rate meaning.

    Parameters
    ----------
    parent1, parent2 : numpy.ndarray
        1-D chromosomes of equal length; neither is modified.
    probability : float
        Per-gene probability of taking the gene from ``parent1``.

    Returns
    -------
    numpy.ndarray
        Either the binarised child or a copy of ``parent1``.
    """
    parent_1, parent_2 = parent1.copy(), parent2.copy()
    chromo_len = parent_1.shape[0]

    # Exactly one random() draw per gene, in gene order. This is the only
    # stochastic step of the function.
    child = np.array([
        parent_1[i] if random() < probability else parent_2[i]
        for i in range(chromo_len)
    ])

    # Vectorised binarisation instead of testing `i in features` per gene.
    # An earlier experiment that forced exactly 100 active genes (based on
    # `selected_indexes`) was disabled, so the child keeps however many genes
    # the crossover produced.
    new_child = (child > 0).astype(int)
    return de_fitness([new_child, parent_1])[1]

In [15]:
def de_mutation(pop_after_fit, co_probability, f_score, tfidf_threshold, n_parents):
    """Produce the next generation with a tf-idf-filtered DE mutation step.

    For every target chromosome three distinct donors are sampled from the
    chromosomes whose tf-idf score (from
    ``term_frequency_inverse_document_frequency``) is at most
    ``tfidf_threshold``, excluding the target itself. The mutant
    ``donor1 + f_score * (donor2 - donor3)`` is binarised at 0.5 and handed,
    together with the target, to ``de_crossover``, whose result becomes the
    new member.

    Parameters
    ----------
    pop_after_fit : sequence of numpy.ndarray
        Current population; it is only read, never modified.
    co_probability : float
        Forwarded to ``de_crossover`` as its ``probability`` argument.
    f_score : float
        Differential weight applied to ``donor2 - donor3``.
    tfidf_threshold : float
        Inclusive upper bound on a chromosome's tf-idf score for it to be
        eligible as a donor.
    n_parents : any
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    list of numpy.ndarray
        One chromosome per member of ``pop_after_fit`` (empty list for an
        empty population).

    Raises
    ------
    ValueError
        If fewer than three eligible donors exist for some target.
    """
    pop_size = len(pop_after_fit)
    if pop_size == 0:
        return []

    # The population does not change inside the loop, so the tf-idf ranking is
    # loop-invariant; computing it once avoids pop_size redundant passes.
    tf_idf_sent_score = term_frequency_inverse_document_frequency(pop_after_fit)
    eligible = [index for index, value in tf_idf_sent_score if value <= tfidf_threshold]

    pop_nextgen = []
    for target in range(pop_size):
        sent_indexes = [index for index in eligible if index != target]
        if len(sent_indexes) < 3:
            raise ValueError(
                "de_mutation needs at least 3 donor chromosomes with tf-idf <= "
                f"{tfidf_threshold} besides target {target}, found {len(sent_indexes)}"
            )

        # Three distinct donor chromosomes for the differential step.
        rv1, rv2, rv3 = sample(sent_indexes, k=3)

        target_vec = pop_after_fit[target].astype(np.float32)
        random_vec1 = pop_after_fit[rv1]
        random_vec2 = pop_after_fit[rv2]
        random_vec3 = pop_after_fit[rv3]

        # DE mutation, then threshold at 0.5 to get back a 0/1 chromosome.
        mutant = random_vec1 + f_score*(random_vec2-random_vec3)
        trail_vec = (mutant > 0.5).astype(np.int64)

        pop_nextgen.append(de_crossover(target_vec, trail_vec, co_probability))

    return pop_nextgen

In [16]:
def term_frequency(population):
    """Compute per-row and aggregated term weights for a population.

    Every non-zero entry of a row is divided by ``len(population)``, i.e. by
    the number of rows.
    NOTE(review): the divisor is the row count, not the number of terms in
    the row -- confirm this is the intended "term frequency".

    Parameters
    ----------
    population : sequence of 1-D arrays
        Rows (documents or chromosomes); column ``i`` corresponds to
        ``all_terms[i]`` (module-level list).

    Returns
    -------
    tf_sent : list of list
        For each row, the weights of its non-zero columns in column order.
    tf_terms : list of (term, weight)
        Weights summed per term over all rows, sorted by weight descending.
    """
    row_count = len(population)
    per_row = []
    accumulated = {}

    for chromosome in population:
        active = np.where(chromosome != 0)[0]
        weights = []
        for gene in active:
            weight = chromosome[gene] / row_count
            weights.append(weight)
            term = all_terms[gene]
            accumulated[term] = accumulated.get(term, 0) + weight
        per_row.append(weights)

    ranked_terms = sorted(accumulated.items(), key=lambda item: item[1], reverse=True)
    return per_row, ranked_terms

In [17]:
def inverse_document_frequency(population):
    """Compute ``log10(N / df)`` for every term in ``all_terms``.

    ``N`` is the number of rows in ``population`` and ``df`` the number of
    rows whose entry for the term's column is non-zero. Column ``i`` is taken
    to belong to ``all_terms[i]`` (module-level list); this matches the
    name-based lookup as long as the terms are unique.

    Parameters
    ----------
    population : 2-D array-like
        One row per document, one column per term.

    Returns
    -------
    list of (term, idf)
        Sorted by idf in descending order. A term that occurs in no row gets
        ``inf`` instead of triggering a division by zero.

    Raises
    ------
    ValueError
        If ``population`` has no rows.
    """
    no_documents = len(population)
    if no_documents == 0:
        raise ValueError("population must contain at least one document")

    terms = np.array(list(all_terms))
    # One vectorised pass gives the document frequency of every column,
    # instead of re-scanning every row for every term.
    doc_counts = np.count_nonzero(np.asarray(population), axis=0)

    idf = {}
    for i in range(len(terms)):
        k = int(doc_counts[i])
        # df == 0 means the term never occurs: treat it as infinitely rare.
        idf[terms[i]] = np.log10(no_documents / k) if k else np.float64(np.inf)

    return sorted(idf.items(), key=lambda x: x[1], reverse=True)

In [18]:
def term_frequency_inverse_document_frequency(population):
    """Score every row by the mean tf-idf of its non-zero columns.

    Term weights come from ``term_frequency(population)``; IDF values are read
    from the module-level ``idf`` (not recomputed for ``population``), looked
    up by the term names in the module-level ``all_terms``.

    A row without any non-zero entry causes a ``ZeroDivisionError``.

    Parameters
    ----------
    population : sequence of 1-D arrays

    Returns
    -------
    list of (row_index, score)
        Sorted by score in descending order.
    """
    tf_sent, _ = term_frequency(population)
    idf_lookup = dict(idf)

    row_scores = {}
    for row, chromosome in enumerate(population):
        active = np.where(chromosome != 0)[0]
        weighted = [
            tf_sent[row][position] * idf_lookup[all_terms[gene]]
            for position, gene in enumerate(active)
        ]
        row_scores[row] = sum(weighted) / len(active)

    return sorted(row_scores.items(), key=lambda item: item[1], reverse=True)

In [19]:
def evolution(size, features_count, chromo_size,
            n_parents,
            crossover_pb,
            f_score,
            n_gen,
            idf, idf_threshold,
            tfidf_threshold
            ):
    """Run the differential-evolution feature-selection loop.

    An initial population is drawn from the terms whose IDF is at most
    ``idf_threshold``. In every generation the current population is scored
    with ``fitness_score`` (the best chromosome and score are recorded and
    printed) and then replaced by the output of ``de_mutation``.

    Parameters
    ----------
    size : int
        Number of chromosomes in the population.
    features_count : int
        Number of active genes in each initial chromosome.
    chromo_size : int
        Length of every chromosome.
    n_parents : any
        Forwarded to ``de_mutation``, which does not use it.
    crossover_pb : float
        Forwarded to ``de_mutation`` as ``co_probability``.
    f_score : float
        Differential weight forwarded to ``de_mutation``.
    n_gen : int
        Number of generations.
    idf : iterable of (term, idf) pairs
        Used only for the initial term selection. The tf-idf filter inside
        ``de_mutation`` reads the module-level ``idf`` instead.
    idf_threshold : float
        Inclusive IDF bound for the initial term selection.
    tfidf_threshold : float
        Forwarded to ``de_mutation`` as the donor tf-idf bound.

    Returns
    -------
    best_chromo : list of numpy.ndarray
        Best chromosome of each generation, recorded before that generation's
        mutation step.
    best_score : list
        Accuracy of the corresponding entry in ``best_chromo``.
    """
    best_chromo= []
    best_score= []
    
    selected_indexes, selected_terms = initial_population_term_selection_idf(idf, idf_threshold)
    population_nextgen=generate_population(size, features_count, chromo_size, selected_indexes)
    # scores, pop_after_fit = fitness_score(population_nextgen)
    # population_nextgen = pop_after_fit.copy()

    for i in range(n_gen):
        # Score a shallow copy so the unsorted population is what gets mutated below.
        scores, pop_after_fit = fitness_score(population_nextgen.copy())
        best_chromo.append(pop_after_fit[0])
        best_score.append(scores[0])
        print('Best score in generation',i+1,':',scores[0], "feat_count:", np.where(pop_after_fit[0] != 0)[0].shape)
        
        # The population produced in the last iteration is never scored or recorded.
        population_nextgen = de_mutation(population_nextgen.copy(), crossover_pb, f_score, tfidf_threshold, n_parents)
        
        print("Population size:", len(population_nextgen))
        
    return best_chromo,best_score

# **Accuracy Comparison**

### Data Preprocessing

In [20]:
# Load the review dataset; the displayed frame has a text column `cmd` and a 0/1 `score`.
# NOTE(review): the variable is named `amazon` but the file read is yelp.csv --
# presumably a leftover from a notebook for another dataset; confirm.
amazon = pd.read_csv("../dataset/yelp.csv", encoding='latin-1')
# Work on a copy so the frame as loaded stays untouched.
frame = amazon.copy()
frame

Unnamed: 0,cmd,score
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [21]:
# tokenizer=0 selects re_lemmatizor (regex cleaning + lower-casing) inside tokens_to_bow.
X_bow, features = tokens_to_bow(frame.cmd, 0)
y_score = frame.score
# `all_terms` is read as a global by the term-selection and tf/idf helpers;
# position i names column i of X_bow.
all_terms = list(features)

In [22]:
# Hold-out split read as globals by fitness_score and de_fitness.
X_train, X_test, Y_train, Y_test = split(X_bow, y_score)

In [23]:
# Corpus-level statistics over the bag-of-words matrix. `idf` is read as a global by
# term_frequency_inverse_document_frequency, and by the exploratory
# initial_population_term_selection_idf(idf, ...) calls that appear earlier in the notebook,
# so this cell must have been executed before any of them.
idf = inverse_document_frequency(X_bow)
tf_sent, tf_terms = term_frequency(X_bow)

### Compare models without GA

In [None]:
# Baseline: every classifier trained on the full bag-of-words feature set.
# NOTE(review): the accuracies recorded below are multiples of 1/374, whereas the evolution log
# further down shows multiples of 1/250 (25 % of this 1000-row dataset) -- the stored table
# looks stale (possibly from another dataset); re-run to confirm.
all_models_score_table = acc_score(X_bow, y_score)
all_models_score_table

Unnamed: 0,Classifier,Accuracy,Exec_Time_secs
0,RandomForest,0.951872,4.238636
1,LinearSVM,0.949198,2.323768
2,DecisionTree,0.941176,0.926516
3,Logistic,0.933155,0.376991
4,RadialSVM,0.911765,2.52124
5,MultinomialNB,0.893048,0.034906
6,KNeighbors,0.73262,0.000997


### Choosing the best classifier and starting evolution

In [None]:
# Estimator used (as a global) by fitness_score and de_fitness. Despite the name
# `logmodel`, this is a random forest, not a logistic regression.
logmodel = RandomForestClassifier(n_estimators=200, random_state=0)

In [None]:
# One-off dry run of the building blocks used by evolution(): select low-IDF terms,
# draw 100 chromosomes with 100 active genes each, score them, and compute the
# population-level tf-idf ranking.
selected_indexes, selected_terms = initial_population_term_selection_idf(idf, 2.7)
# The chromosome length is taken from the data (as run_n_evolution does) instead of
# a hard-coded 1553, so the cell keeps working when the vocabulary size changes.
population_nextgen = generate_population(100, 100, X_bow.shape[1], selected_indexes)
scores, pop_after_fit = fitness_score(population_nextgen)
tf_idf_sent_score = term_frequency_inverse_document_frequency(pop_after_fit)

In [None]:
# st = time.time()
# chromo_set_1, score_set_1 = evolution(
#     size=100,
#     features_count=100,
#     chromo_size=X_bow.shape[1],
#     n_parents=80,
#     crossover_pb=0.8,
#     n_gen=100,
#     idf=idf, 
#     idf_threshold=2.7,
#     tfidf_threshold=0.0240
# )
# et = time.time()
# exce_time_1 = et-st

In [None]:
# import pickle
# with open('single_run_az_kbde.pkl', 'wb') as wf:
#     pickle.dump([chromo_set_1, score_set_1, exce_time_1], wf)

In [None]:
def run_n_evolution(n):
    """Run ``evolution`` ``n`` times with a fixed configuration and time each run.

    Every run uses a population of 100 chromosomes with 100 initial features,
    100 generations, crossover probability 0.4, differential weight 0.4, an
    IDF threshold of 2.7 and a tf-idf threshold of 0.0240. The chromosome
    length and the IDF table are taken from the globals ``X_bow`` and ``idf``.

    Parameters
    ----------
    n : int
        Number of independent runs.

    Returns
    -------
    list of tuple
        One ``(best_chromosomes, best_scores, elapsed_seconds)`` triple per
        run, where the first two items are the values returned by
        ``evolution``.
    """
    runs = []
    for _ in range(n):
        started = time.time()
        chromosomes, best_scores = evolution(
            size=100,
            features_count=100,
            chromo_size=X_bow.shape[1],
            n_parents=80,
            crossover_pb=0.4,
            f_score=0.4,
            n_gen=100,
            idf=idf,
            idf_threshold=2.7,
            tfidf_threshold=0.0240
        )
        finished = time.time()
        runs.append((chromosomes, best_scores, finished - started))
    return runs

In [None]:
# 30 independent runs of 100 generations each (see run_n_evolution).
# NOTE(review): no seed for the `random` module is set in the visible cells, so the
# sampled populations -- and therefore the results -- differ between executions.
results = run_n_evolution(30)

Best score in generation 1 : 0.688 feat_count: (100,)
Population size: 100
Best score in generation 2 : 0.716 feat_count: (102,)
Population size: 100
Best score in generation 3 : 0.72 feat_count: (99,)
Population size: 100
Best score in generation 4 : 0.72 feat_count: (99,)
Population size: 100
Best score in generation 5 : 0.752 feat_count: (103,)
Population size: 100
Best score in generation 6 : 0.752 feat_count: (103,)
Population size: 100
Best score in generation 7 : 0.752 feat_count: (103,)
Population size: 100
Best score in generation 8 : 0.764 feat_count: (106,)
Population size: 100
Best score in generation 9 : 0.784 feat_count: (104,)
Population size: 100
Best score in generation 10 : 0.784 feat_count: (104,)
Population size: 100
Best score in generation 11 : 0.788 feat_count: (107,)
Population size: 100
Best score in generation 12 : 0.788 feat_count: (107,)
Population size: 100
Best score in generation 13 : 0.788 feat_count: (107,)
Population size: 100
Best score in generation 

In [None]:
import pickle
# Persist the list of (best chromosomes, best scores, elapsed seconds) triples, one per run.
# Pickle files can execute code when loaded, so only reload this file from a trusted location.
with open('30_run_yelp_kbde_pt_final_04.pkl', 'wb') as wf:
    pickle.dump(results, wf)