In [2]:
import nltk
# Fetch the NLTK resources used below: 'punkt' backs nltk.word_tokenize,
# 'stopwords' backs stopwords.words('english'), and 'wordnet' / 'omw-1.4'
# back WordNetLemmatizer.
nltk.download('punkt')


nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DAAI.DESKTOP-
[nltk_data]     128VDS1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DAAI.DESKTOP-
[nltk_data]     128VDS1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DAAI.DESKTOP-
[nltk_data]     128VDS1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DAAI.DESKTOP-
[nltk_data]     128VDS1\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# **1. Pre-Processing**
1. Tokenization
2. Stemming/lemmatization
3. BoW/TF-IDF

In [3]:
from nltk.stem import WordNetLemmatizer
import re
import numpy as np
      
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [4]:
def tokenize_lemmatizor(frame):
    """Tokenize each document with NLTK, drop English stop words and lemmatize.

    Tokens are compared against the stop-word list exactly as tokenized; no
    lower-casing is applied here (unlike ``re_lemmatizor``).

    Parameters
    ----------
    frame : sequence supporting ``len()`` and ``.iloc`` (e.g. a pandas Series)
        Raw text documents, read positionally.

    Returns
    -------
    list of str
        One space-joined string of lemmatized tokens per input document.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the stop-word set once; it was previously rebuilt for every token.
    stop_words = set(stopwords.words('english'))

    lemma_sentences = []
    for i in range(len(frame)):
        words = nltk.word_tokenize(frame.iloc[i])
        lemma_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        lemma_sentences.append(" ".join(lemma_words))

    return lemma_sentences

In [5]:
def re_lemmatizor(frame):
    """Clean each document with a regex, drop English stop words and lemmatize.

    Every character outside ``a-z`` / ``A-Z`` is replaced by a space, the text
    is lower-cased and split on whitespace before stop-word filtering.

    Parameters
    ----------
    frame : sequence supporting ``len()`` and ``.iloc`` (e.g. a pandas Series)
        Raw text documents, read positionally.

    Returns
    -------
    list of str
        One space-joined string of lemmatized tokens per input document.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the stop-word set once; it was previously rebuilt for every token.
    stop_words = set(stopwords.words('english'))

    corpus = []
    for i in range(len(frame)):
        # Regex clean + lower + split replaces nltk.word_tokenize here.
        review = re.sub('[^a-zA-Z]', ' ', frame.iloc[i])
        review = review.lower().split()
        review = [lemmatizer.lemmatize(word) for word in review if word not in stop_words]
        corpus.append(" ".join(review))

    return corpus

### 3.1. BOW

In [6]:
def custom_tokens_bow(corpus_tokens):
    """Build a bag-of-words count matrix by hand from regex-lemmatized text.

    Parameters
    ----------
    corpus_tokens : sequence of raw documents accepted by ``re_lemmatizor``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The count matrix (one row per document, one column per vocabulary
        word) and the vocabulary array giving the word for each column.
    """
    tokens = re_lemmatizor(corpus_tokens)
    tokenized = [sentence.split() for sentence in tokens]

    # Sort the vocabulary so the column order is reproducible: iterating a
    # set of strings can yield a different order on every interpreter run.
    features = sorted({word for words in tokenized for word in words})

    bow = []
    for words in tokenized:
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        bow.append([counts.get(feature, 0) for feature in features])

    return np.array(bow), np.array(features)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer # bow
def tokens_to_bow(corpus_tokens, tokenizer=1):
    """Vectorize a corpus into bag-of-words counts with ``CountVectorizer``.

    ``tokenizer == 1`` preprocesses with ``tokenize_lemmatizor``; any other
    value uses ``re_lemmatizor``. The vocabulary is capped at 5000 features.

    Returns the dense count matrix and the vectorizer's feature names.
    """
    preprocess = tokenize_lemmatizor if tokenizer == 1 else re_lemmatizor
    vectorizer = CountVectorizer(max_features=5000)
    documents = preprocess(corpus_tokens)
    counts = vectorizer.fit_transform(documents).toarray()
    return counts, vectorizer.get_feature_names_out()

### 3.2. TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer # tfidf
def tokens_to_tfidf(corpus_tokens, tokenizer=1):
    """Vectorize a corpus into TF-IDF weights with ``TfidfVectorizer``.

    A truthy ``tokenizer`` preprocesses with ``tokenize_lemmatizor``; a falsy
    one uses ``re_lemmatizor``.

    Returns the dense TF-IDF matrix and the list of preprocessed documents.
    NOTE(review): the second value is the lemmatized sentences, not the
    vectorizer's feature names as in ``tokens_to_bow`` - confirm intended.
    """
    preprocess = tokenize_lemmatizor if tokenizer else re_lemmatizor
    documents = preprocess(corpus_tokens)
    weights = TfidfVectorizer().fit_transform(documents).toarray()
    return weights, documents

## **Implementation**

In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
from random import randint, choices, randrange, random, sample, shuffle

In [10]:
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.naive_bayes import MultinomialNB

from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score

In [11]:
from sklearn.model_selection import train_test_split
def split(df,label):
    """Split features and labels 75/25 into train and test with a fixed seed.

    Returns ``(X_train, X_test, Y_train, Y_test)``.
    """
    parts = train_test_split(df, label, test_size=0.25, random_state=42)
    X_train_part, X_test_part, Y_train_part, Y_test_part = parts
    return X_train_part, X_test_part, Y_train_part, Y_test_part

In [12]:
# Display names for the estimators in `models`. acc_score pairs the two lists
# purely by position, so the order here must match the order of `models`.
classifiers = ['LinearSVM', 'RadialSVM', 
               'Logistic',  'RandomForest', 
               'DecisionTree', 'KNeighbors',
               'MultinomialNB']
               

In [13]:
# Estimator instances evaluated by acc_score, in the same order as the names
# in `classifiers`. acc_score fits these objects directly (no cloning).
models = [svm.SVC(kernel='linear'),
          svm.SVC(kernel='rbf'),
          LogisticRegression(max_iter = 1000),
          RandomForestClassifier(n_estimators=200, random_state=0),
          DecisionTreeClassifier(random_state=0),
          KNeighborsClassifier(),
          MultinomialNB()]

In [14]:
def acc_score(df,label):
    """Fit every estimator in ``models`` and tabulate test accuracy and fit time.

    The data is split with ``split``; each model is fitted on the training
    part and scored on the test part. Only the ``fit`` call is timed.

    Returns a DataFrame with columns ``Classifier``, ``Accuracy`` and
    ``Exec_Time_secs``, sorted by accuracy (best first) with a fresh index.
    """
    Score = pd.DataFrame({"Classifier":classifiers})
    X_train, X_test, Y_train, Y_test = split(df, label)

    accuracies = []
    fit_seconds = []
    for model in models:
        started = time.time()
        model.fit(X_train, Y_train)
        finished = time.time()

        accuracies.append(accuracy_score(Y_test, model.predict(X_test)))
        fit_seconds.append(finished - started)

    Score["Accuracy"] = accuracies
    Score['Exec_Time_secs'] = fit_seconds
    Score = Score.sort_values(by="Accuracy", ascending=False)
    return Score.reset_index(drop=True)

In [15]:
def plot(score,x,y,c = "b"):
    """Draw a point plot of accuracy per generation.

    Parameters
    ----------
    score : sequence of accuracies, one per generation.
    x, y : lower and upper limits of the y axis.
    c : colour passed to ``sns.pointplot``.
    """
    # Number generations from the data rather than a fixed [1..5], so score
    # lists of any length can be plotted.
    generations = list(range(1, len(score) + 1))
    plt.figure(figsize=(6,4))
    ax = sns.pointplot(x=generations, y=score, color=c)
    ax.set(xlabel="Generation", ylabel="Accuracy")
    ax.set(ylim=(x,y))

In [16]:
def initial_population_term_selection_tf(tf_terms, tf_threshold):
    """Select the terms whose term frequency is at least ``tf_threshold``.

    Parameters
    ----------
    tf_terms : iterable of ``(term, tf)`` pairs (anything ``dict()`` accepts).
    tf_threshold : minimum term frequency for a term to be kept.

    Returns
    -------
    (list, list)
        Positions of the selected terms in the global ``all_terms`` and the
        selected terms themselves, in matching order.

    Raises
    ------
    ValueError
        If a selected term is not present in ``all_terms``.
    """
    # Map each term to its first position once instead of running the linear
    # all_terms.index() scan for every selected term.
    first_position = {}
    for position, term in enumerate(all_terms):
        first_position.setdefault(term, position)

    selected_indexes = []
    selected_terms = []
    for word, tf in dict(tf_terms).items():
        if tf >= tf_threshold:
            if word not in first_position:
                # Same exception type list.index() raised previously.
                raise ValueError(f"{word!r} is not in all_terms")
            selected_terms.append(word)
            selected_indexes.append(first_position[word])
    return selected_indexes, selected_terms

len(initial_population_term_selection_tf(tf_terms, 0.001)[1])
1566
len(initial_population_term_selection_tf(tf_terms, 0.002)[1])
655
len(initial_population_term_selection_tf(tf_terms, 0.003)[1])
427

In [17]:
def initial_population_term_selection_idf(idf, idf_threshold):
    """Select the terms whose IDF is at most ``idf_threshold``.

    Parameters
    ----------
    idf : iterable of ``(term, idf_value)`` pairs (anything ``dict()`` accepts).
    idf_threshold : maximum IDF for a term to be kept.

    Returns
    -------
    (list, list)
        Positions of the selected terms in the global ``all_terms`` and the
        selected terms themselves, in matching order.

    Raises
    ------
    ValueError
        If a selected term is not present in ``all_terms``.
    """
    # Map each term to its first position once instead of running the linear
    # all_terms.index() scan for every selected term.
    first_position = {}
    for position, term in enumerate(all_terms):
        first_position.setdefault(term, position)

    selected_indexes = []
    selected_terms = []
    # The loop variable no longer reuses the name of the `idf` parameter.
    for word, idf_value in dict(idf).items():
        if idf_value <= idf_threshold:
            if word not in first_position:
                # Same exception type list.index() raised previously.
                raise ValueError(f"{word!r} is not in all_terms")
            selected_terms.append(word)
            selected_indexes.append(first_position[word])
    return selected_indexes, selected_terms

len(initial_population_term_selection_idf(idf, 2.6)[1])
418
len(initial_population_term_selection_idf(idf, 2.7)[1])
651
len(initial_population_term_selection_idf(idf, 2.8)[1])
651
len(initial_population_term_selection_idf(idf, 2.9)[1])
651
len(initial_population_term_selection_idf(idf, 3)[1])
1553

In [18]:
def generate_chromo(selected_indexes, features_count, chromo_size):
    """Create one random binary chromosome.

    ``features_count`` distinct entries are drawn from ``selected_indexes``
    and the genes at those positions are set to 1; all other genes are 0.
    Drawn indexes outside ``[0, chromo_size)`` are ignored, so the result can
    have fewer than ``features_count`` active genes.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``chromo_size`` containing only 0 and 1.

    Raises
    ------
    ValueError
        From ``random.sample`` if ``features_count`` exceeds the number of
        available indexes.
    """
    chosen = sample(selected_indexes, k=features_count)
    # Set genes by index instead of testing list membership for every gene.
    in_range = [idx for idx in chosen if 0 <= idx < chromo_size]
    chromo = np.zeros(chromo_size, dtype=int)
    chromo[np.asarray(in_range, dtype=int)] = 1
    return chromo

In [19]:
def generate_population(size, features_count, chromo_size, selected_indexes):
    """Return a list of ``size`` chromosomes, each built by ``generate_chromo``."""
    population = []
    for _ in range(size):
        population.append(generate_chromo(selected_indexes, features_count, chromo_size))
    return population

In [20]:
def single_point_crossover(pop_after_sel, probability):
    """Pair chromosomes by TF-IDF rank and apply single-point crossover.

    Chromosomes are ranked by descending TF-IDF score and split into an upper
    and a lower half; the i-th members of each half form a pair. With the
    given ``probability`` a pair is crossed at a random cut point, otherwise
    the two parents are appended unchanged. With an odd population the
    leftover member of the lower half is appended as is.

    Parameters
    ----------
    pop_after_sel : list of chromosomes (sliceable sequences of equal length).
    probability : crossover probability per pair, compared against ``random()``.

    Returns
    -------
    list
        A new list holding the input chromosomes followed by the appended
        children / parents. The input list itself is not modified.
    """
    # Shallow copy: aliasing the input meant every append below also grew the
    # caller's list. The earlier `shuffle(list(pop_after_sel))` was dropped
    # because it only shuffled a temporary copy and had no effect.
    pop_nextgen = list(pop_after_sel)
    chrom_l = len(pop_nextgen[0])

    tf_idf_sent_score = dict(term_frequency_inverse_document_frequency(pop_after_sel))
    # Column 0 of the sorted (index, score) pairs: population indexes, stored
    # as floats by np.array, hence the int() casts below.
    pop_sorted_tfidf = np.array(sorted(tf_idf_sent_score.items(), key=lambda x: x[1], reverse=True))[:, 0]

    mid = len(pop_sorted_tfidf)//2
    pop_1 = pop_sorted_tfidf[:mid]
    pop_2 = pop_sorted_tfidf[mid:]

    for idx_1, idx_2 in zip(pop_1, pop_2):
        parent_1, parent_2 = pop_after_sel[int(idx_1)], pop_after_sel[int(idx_2)]
        if random() <= probability:
            k = randint(1, chrom_l - 1)  # crossover point
            pop_nextgen.append(np.concatenate([parent_1[:k], parent_2[k:]]))
            pop_nextgen.append(np.concatenate([parent_2[:k], parent_1[k:]]))
        else:
            pop_nextgen.append(parent_1)
            pop_nextgen.append(parent_2)

    # pop_2 is never shorter than pop_1; with an odd population it has one
    # unpaired member, which is carried over unchanged.
    for idx in pop_2[len(pop_1):]:
        pop_nextgen.append(pop_after_sel[int(idx)])

    return pop_nextgen

In [21]:
def _repair_duplicates(child_features, donor_features):
    """Replace repeated feature indexes in a child with unused donor features.

    One occurrence of each duplicated index is removed and the same number of
    indexes is drawn at random from the donor's features that the child does
    not already contain. The list is modified in place and returned.
    """
    seen = set()
    duplicated = set()
    for feature in child_features:
        if feature in seen:
            duplicated.add(feature)
        else:
            seen.add(feature)

    if duplicated:
        unused = [feature for feature in donor_features if feature not in seen]
        replacements = sample(unused, k=len(duplicated))
        for feature in duplicated:
            child_features.remove(feature)
        child_features.extend(replacements)
    return child_features


def single_point_crossover1(pop_after_sel, n_parents):
    """Feature-list crossover that keeps the fittest ``n_parents`` chromosomes.

    Chromosomes are ranked by ascending TF-IDF score and split into two
    halves; the i-th members of each half form a pair. For each pair the
    lists of active feature indexes are cut at a random point and swapped,
    duplicated indexes in each child are repaired, and both children are added
    to a copy of the population. The enlarged population is ranked with
    ``fitness_score`` and its first ``n_parents`` entries are returned.

    Parameters
    ----------
    pop_after_sel : list of numpy arrays (chromosomes of equal length).
    n_parents : number of chromosomes to keep after ranking.

    Raises
    ------
    ValueError
        From ``random.sample`` if a parent has too few unused features to
        replace the duplicates of its child.
    """
    # The earlier `shuffle(list(pop_after_sel))` was dropped: it shuffled a
    # temporary copy and had no effect on the population.
    pop_nextgen = pop_after_sel.copy()
    chromo_l = len(pop_nextgen[0])

    tf_idf_sent_score = dict(term_frequency_inverse_document_frequency(pop_after_sel))
    # Column 0 holds population indexes (as floats, hence int() below).
    pop_sorted_tfidf = np.array(sorted(tf_idf_sent_score.items(), key=lambda x: x[1]))[:, 0]

    mid = len(pop_sorted_tfidf)//2
    pop_1 = pop_sorted_tfidf[:mid]
    pop_2 = pop_sorted_tfidf[mid:]

    for pair in range(mid):
        parent_1 = pop_after_sel[int(pop_1[pair])]
        parent_2 = pop_after_sel[int(pop_2[pair])]
        p1_features = list(np.where(np.array(parent_1) != 0)[0])
        p2_features = list(np.where(np.array(parent_2) != 0)[0])

        k = randint(0, len(p1_features))  # crossover point in the feature lists
        c1_features = p1_features[:k] + p2_features[k:]
        c2_features = p2_features[:k] + p1_features[k:]

        # Repair both children independently. The previous `elif` skipped
        # child 2 whenever child 1 had duplicates, leaving it short of features.
        c1_features = _repair_duplicates(c1_features, p1_features)
        c2_features = _repair_duplicates(c2_features, p2_features)

        for child_features in (c1_features, c2_features):
            child = np.zeros(chromo_l, dtype=int)
            child[np.asarray(child_features, dtype=int)] = 1
            pop_nextgen.append(child)

    _, pop_nextgen = fitness_score(pop_nextgen)
    return pop_nextgen[:n_parents]

In [22]:
def bit_flip_mutation(pop_after_cross, probability, mutation_rate1, mutation_rate2, n_feat):
    """Randomly flip genes of each chromosome, visiting them by TF-IDF rank.

    Chromosomes are visited in descending TF-IDF order. Each one is mutated
    with the given ``probability``; when mutated, a number of random positions
    (drawn with replacement) is flipped between 0 and 1. The number of flips
    is ``int(mutation_rate1 * n_feat)`` while the countdown ``mid`` is
    non-negative and ``int(mutation_rate2 * n_feat)`` afterwards.

    Parameters
    ----------
    pop_after_cross : list of numpy arrays (chromosomes of equal length).
    probability : per-chromosome mutation probability.
    mutation_rate1, mutation_rate2 : fractions of the chromosome length.
    n_feat : ignored; the chromosome length is always used instead.

    Returns
    -------
    list
        Mutated copies of the chromosomes, in TF-IDF rank order. The input
        chromosomes are left untouched.
    """
    # The argument is overridden with the actual chromosome length.
    n_feat = pop_after_cross[0].shape[0]

    range1 = int(mutation_rate1*n_feat)
    range2 = int(mutation_rate2*n_feat)
    pop_next_gen = []

    tf_idf_sent_score = dict(term_frequency_inverse_document_frequency(pop_after_cross))
    pop_sorted_tfidf = np.array(sorted(tf_idf_sent_score.items(), key=lambda x: x[1], reverse=True))[:, 0]

    # NOTE(review): starting at len//2 + 1 with a `>= 0` test means the first
    # len//2 + 2 ranked chromosomes use range1 - confirm this is intended.
    mid = len(pop_sorted_tfidf)//2+1
    for n in pop_sorted_tfidf:
        mutation_range = range1 if mid >= 0 else range2

        # Copy before flipping so the caller's chromosomes are not mutated in
        # place (matches bit_flip_mutation1).
        chromo = pop_after_cross[int(n)].copy()
        if random() <= probability:
            for _ in range(mutation_range):
                pos = randint(0, n_feat-1)
                chromo[pos] = abs(chromo[pos] - 1)
        pop_next_gen.append(chromo)
        mid -= 1

    return pop_next_gen

In [23]:
def bit_flip_mutation1(pop_after_cross, mutation_rate1, mutation_rate2, features_count, n_parents):
    """Balanced bit-flip mutation followed by fitness ranking.

    For every chromosome, visited in ascending TF-IDF order, a mutated copy
    is made by switching off some active genes and switching on the same
    number of inactive genes. The mutants are added to a copy of the input
    population, the whole set is ranked with ``fitness_score``, and the first
    ``n_parents + 20`` chromosomes are kept.

    Returns ``(population, scores)`` where ``scores`` is the full ranked score
    list from ``fitness_score`` (it is not truncated).
    """
    flips_first = int(mutation_rate1*features_count)
    flips_rest = int(mutation_rate2*features_count)
    candidates = pop_after_cross.copy()

    ranking = dict(term_frequency_inverse_document_frequency(pop_after_cross))
    # Column 0 holds population indexes (as floats, hence int() below).
    order = np.array(sorted(ranking.items(), key=lambda x: x[1]))[:, 0]

    # Ranks 0 .. len//2 + 1 use the first flip count, later ranks the second.
    cutoff = len(order)//2+1
    for rank, member in enumerate(order):
        flips = flips_first if rank <= cutoff else flips_rest

        mutant = pop_after_cross[int(member)].copy()
        active = list(np.where(mutant != 0)[0])
        inactive = list(np.setdiff1d(np.array(range(mutant.shape[0])), active))

        # Active positions are sampled first, then inactive ones.
        positions = sample(active, k=flips) + sample(inactive, k=flips)
        for position in positions:
            mutant[position] = abs(mutant[position] - 1)

        candidates.append(mutant)

    ranked_scores, ranked_population = fitness_score(candidates)
    return ranked_population[:n_parents+20], ranked_scores

In [24]:
def varient_pressure_selection(pop_after_fit, tfidf_threshold):
    """Keep the chromosomes whose TF-IDF score is at most ``tfidf_threshold``.

    Survivors are returned in their original population order.
    """
    scored = term_frequency_inverse_document_frequency(pop_after_fit)
    kept = sorted(index for index, score in scored if score <= tfidf_threshold)
    return [pop_after_fit[index] for index in kept]

In [25]:
def fitness_score(population):
    """Score each chromosome by test accuracy and rank best-first.

    For every chromosome the global ``logmodel`` is refitted on the columns of
    the global ``X_train`` selected by the chromosome's non-zero genes and
    evaluated on the same columns of ``X_test``.

    Returns ``(scores, population)`` as two lists sorted by descending
    accuracy.
    """
    accuracies = []
    for chromosome in population:
        active = np.where(chromosome != 0)[0]
        logmodel.fit(X_train[:, active], Y_train)
        predicted = logmodel.predict(X_test[:, active])
        accuracies.append(accuracy_score(Y_test, predicted))

    accuracies = np.array(accuracies)
    members = np.array(population)
    # argsort is ascending; reverse it to put the best chromosome first.
    best_first = np.argsort(accuracies)[::-1]
    return list(accuracies[best_first]), list(members[best_first, :])

In [26]:
def term_frequency(population):
    """Compute per-row and per-term frequency weights.

    Every non-zero entry of a row contributes ``value / len(population)``.

    Returns
    -------
    (list, list)
        ``tf_sent``: for each row, the list of weights of its non-zero
        entries in column order. ``tf_terms``: ``(term, summed weight)``
        pairs over all rows, sorted by descending weight, with terms looked
        up in the global ``all_terms``.
    """
    n_rows = len(population)
    per_row = []
    totals = {}
    for chromosome in population:
        weights = []
        for position in np.where(chromosome != 0)[0]:
            weight = chromosome[position] / n_rows
            weights.append(weight)
            term = all_terms[position]
            totals[term] = totals.get(term, 0) + weight
        per_row.append(weights)

    ranked_terms = sorted(totals.items(), key=lambda item: item[1], reverse=True)
    return per_row, ranked_terms

In [27]:
def inverse_document_frequency(population):
    """Compute ``log10(N / df)`` for every term in the global ``all_terms``.

    ``N`` is the number of rows in ``population`` and ``df`` the number of
    rows with a non-zero entry in the term's column. Terms that appear in no
    row get an IDF of ``inf`` instead of raising ``ZeroDivisionError``.

    Parameters
    ----------
    population : 2-D array-like (rows = documents or chromosomes).
        Assumes column ``i`` corresponds to ``all_terms[i]`` and that the
        terms are unique - verify against the caller.

    Returns
    -------
    list of (term, idf) tuples sorted by descending IDF.

    Raises
    ------
    ValueError
        If ``population`` is not two-dimensional.
    """
    matrix = np.asarray(population)
    if matrix.ndim != 2:
        raise ValueError("population must be a 2-D array of equal-length rows")

    no_documents = matrix.shape[0]
    # One pass over the matrix replaces the per-term, per-row np.where scan.
    doc_freq = np.count_nonzero(matrix, axis=0)
    with np.errstate(divide='ignore'):
        idf_values = np.log10(no_documents / doc_freq)

    idf = dict(zip(all_terms, idf_values))
    return sorted(idf.items(), key=lambda x: x[1], reverse=True)

In [28]:
def term_frequency_inverse_document_frequency(population):
    """Score each chromosome by the mean TF-IDF of its active terms.

    Term frequencies come from ``term_frequency(population)``; IDF values are
    read from the module-level ``idf`` (not passed in as an argument).

    Returns a list of ``(population index, score)`` pairs sorted by
    descending score.
    """
    tf_sent, _ = term_frequency(population)
    idf_lookup = dict(idf)

    scores = {}
    for member in range(len(population)):
        active = np.where(population[member] != 0)[0]
        weighted = []
        for slot, position in enumerate(active):
            idf_term = idf_lookup[all_terms[position]]
            weighted.append(tf_sent[member][slot] * idf_term)
        scores[member] = sum(weighted)/len(active)

    return sorted(scores.items(), key=lambda item: item[1], reverse=True)

In [29]:
def evolution(size, features_count, chromo_size,
            n_parents,
            mutation_rate1,
            mutation_rate2,
            n_gen,
            idf, idf_threshold,
            tfidf_threshold):
    """Run the genetic algorithm for ``n_gen`` generations.

    Each generation applies, in order: TF-IDF threshold selection
    (``varient_pressure_selection``), crossover (``single_point_crossover1``)
    and mutation (``bit_flip_mutation1``). Progress is printed every
    generation.

    Parameters
    ----------
    size : number of chromosomes in the initial population.
    features_count : active genes per initial chromosome; also scales the
        number of flips in mutation.
    chromo_size : chromosome length.
    n_parents : population size kept after crossover.
    mutation_rate1, mutation_rate2 : mutation rates passed to
        ``bit_flip_mutation1``.
    n_gen : number of generations.
    idf : (term, idf) pairs used only to pick the initial candidate terms.
        NOTE: term_frequency_inverse_document_frequency reads the
        module-level ``idf``, not this argument.
    idf_threshold : maximum IDF for a term to seed the initial population.
    tfidf_threshold : maximum TF-IDF score for a chromosome to survive
        selection.

    Returns
    -------
    (best_chromo, best_score, parent_pop_size)
        Three lists with one entry per generation: the top-ranked chromosome
        and its score at the start of the generation, and the number of
        chromosomes that survived selection.
    """
    best_chromo= []
    best_score= []
    
    
    # Candidate gene positions for the initial population.
    selected_indexes, selected_terms = initial_population_term_selection_idf(idf, idf_threshold)
    # selected_indexes, selected_terms = initial_population_term_selection_tf(tf, tf_threshold)

    # Build and rank the initial population (best chromosome first).
    population_nextgen=generate_population(size, features_count, chromo_size, selected_indexes)
    scores, pop_after_fit = fitness_score(population_nextgen)
    population_nextgen = pop_after_fit.copy()
    
    parent_pop_size = []

    for i in range(n_gen):
        # population_nextgen / scores are already ranked, so index 0 is the
        # best chromosome entering this generation.
        pop_after_fit = population_nextgen.copy()
        best_chromo.append(pop_after_fit[0])
        best_score.append(scores[0])
        print('Best score in generation',i+1,':',scores[0], "feat_count:", np.where(pop_after_fit[0] != 0)[0].shape)
        print("after fitness:", len(pop_after_fit))

        pop_after_sel = varient_pressure_selection(pop_after_fit, tfidf_threshold)
        # sc_sel, pop_sel = fitness_score(pop_after_sel)
        # print('Best score in generation',i+1,':',sc_sel[0], "feat_count:", np.where(pop_sel[0] != 0)[0].shape)
        print("after selection:", len(pop_after_sel))
        parent_pop_size.append(len(pop_after_sel))

        pop_after_cross = single_point_crossover1(pop_after_sel, n_parents)
        # sc_co, pop_co = fitness_score(pop_after_cross)
        # print('Best score in generation',i+1,':',sc_co[0], "feat_count:", np.where(pop_co[0] != 0)[0].shape)
        print("after crossover:", len(pop_after_cross))

        # bit_flip_mutation1 returns the ranked population and its scores.
        population_nextgen, scores = bit_flip_mutation1(pop_after_cross, mutation_rate1, mutation_rate2, features_count, n_parents)
        # sc_mu, pop_mu = fitness_score(population_nextgen)
        # print('Best score in generation',i+1,':',sc_mu[0], "feat_count:", np.where(pop_mu[0] != 0)[0].shape)
        
        # # new next gen population will have the evolved population + the initial population after fitness_score
        # population_nextgen += pop_after_sel
        # _, population_new_nextgen = fitness_score(population_nextgen)
        # print('Best score in generation',i+1,':',_[0], "feat_count:", np.where(population_new_nextgen[0] != 0)[0].shape)
        # population_nextgen = population_selection(population_new_nextgen, n_parents)
        print(len(population_nextgen))
        
    return best_chromo,best_score, parent_pop_size

# **Accuracy Comparison**

## **For IMDB 50k dataset**

### Data Preprocessing

In [30]:
# NOTE: despite the variable name `amazon`, this reads the IMDB 50k review
# file. `frame` is a copy, so the loaded DataFrame is left untouched.
amazon = pd.read_csv("../../dataset/imdb_50k.csv", encoding="latin1")
frame = amazon.copy()
frame

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [31]:

# tokenizer=0 selects the regex-based re_lemmatizor preprocessing.
X_bow, features = tokens_to_bow(frame.review, 0)
y_score = frame.sentiment
# Module-level vocabulary read by term_frequency, inverse_document_frequency
# and the initial_population_term_selection_* functions.
all_terms = list(features)

In [None]:
# Module-level train/test split; fitness_score reads these four globals.
X_train, X_test, Y_train, Y_test = split(X_bow, y_score)

In [None]:
# Corpus-level statistics computed on the full BoW matrix. `idf` is a
# module-level global read by term_frequency_inverse_document_frequency.
idf = inverse_document_frequency(X_bow)
tf_sent, tf_terms = term_frequency(X_bow)

### Compare models without GA

In [40]:
# Baseline: accuracy and fit time of every classifier on the full feature set.
all_models_score_table = acc_score(X_bow, y_score)
all_models_score_table

: 

### Choosing the best classifier and starting evolution

In [None]:
# Estimator refitted by fitness_score for every chromosome. Despite the name
# `logmodel`, this is a random forest. Requires the sklearn import cell above
# to have been run first.
logmodel = RandomForestClassifier(n_estimators=200, random_state=0)

NameError: name 'RandomForestClassifier' is not defined

In [None]:
# One-off inspection of the TF-IDF scores of a ranked initial population.
# NOTE(review): the chromosome length is hard-coded to 1553 here, whereas the
# evolution runs below pass X_bow.shape[1] - confirm the two agree.
selected_indexes, selected_terms = initial_population_term_selection_idf(idf, 2.7)
population_nextgen=generate_population(100, 100, 1553, selected_indexes)
scores, pop_after_fit = fitness_score(population_nextgen)
tf_idf_sent_score = term_frequency_inverse_document_frequency(pop_after_fit)

In [None]:
tf_idf_sent_score

[(99, 0.02288820327878031),
 (87, 0.022785495694676288),
 (95, 0.02269034280262292),
 (93, 0.022639286966115425),
 (24, 0.02249734297870615),
 (85, 0.022431211504191695),
 (41, 0.022395420792924475),
 (39, 0.02234609602090707),
 (77, 0.02233011610809874),
 (92, 0.022183129018014728),
 (91, 0.02216904832639519),
 (52, 0.022109084555617206),
 (9, 0.022032644306391774),
 (86, 0.022020048223562752),
 (80, 0.021982559643039787),
 (11, 0.021967495730708927),
 (48, 0.021961323663143625),
 (70, 0.021938405754827965),
 (50, 0.02193584094538343),
 (47, 0.021935753058094816),
 (96, 0.021931561678785724),
 (72, 0.02192453799475888),
 (55, 0.021893595036805436),
 (81, 0.021891450529990173),
 (75, 0.021868598141901233),
 (97, 0.02183105278516179),
 (13, 0.0217048376192472),
 (43, 0.02168852898074257),
 (79, 0.021682342735609653),
 (94, 0.02167028461134173),
 (73, 0.021648387591554578),
 (49, 0.02164608486801548),
 (78, 0.02162809311634894),
 (76, 0.021617490930995874),
 (30, 0.021607874857592232),
 

### Trying different parameters

In [None]:
# st = time.time()
# chromo_set_1, score_set_1 = evolution(
#     size=100, 
#     features_count=100,
#     chromo_size=X_bow.shape[1],
#     n_parents=80,
#     mutation_rate1=0.05,
#     mutation_rate2=0.1,
#     n_gen=20,
#     idf = idf,
#     idf_threshold=2.7,
#     tfidf_threshold = 0.0241
# )
# et = time.time()

In [None]:
# import pickle
# with open('single_run_az_kbga_vp.pkl', 'wb') as wf:
#     pickle.dump([chromo_set_1, score_set_1, et-st], wf)

In [None]:
# si = initial_population_term_selection_idf(idf, 2.7)[0]
# pop = generate_population(100, 100, 1553, si)

In [None]:
# TODO: plot a correlation map between accuracy and TF-IDF score

In [None]:
def run_n_evolution(n):
    """Run ``evolution`` ``n`` times with fixed settings and time each run.

    Returns a list with one ``(best_chromosomes, best_scores,
    parent_pop_sizes, elapsed_seconds)`` tuple per run.
    """
    result_n_runs = []
    for _ in range(n):
        started = time.time()
        outcome = evolution(
            size=100,
            features_count=100,
            chromo_size=X_bow.shape[1],
            n_parents=80,
            mutation_rate1=0.05,
            mutation_rate2=0.1,
            n_gen=100,
            idf=idf,
            idf_threshold=2.7,
            tfidf_threshold=0.0240,
        )
        finished = time.time()
        chromo_set, score_set, pp_size = outcome
        result_n_runs.append((chromo_set, score_set, pp_size, finished - started))
    return result_n_runs

In [None]:
# 30 independent GA runs. Requires the cell that imports `time` to have been
# run first (see the NameError recorded below).
results = run_n_evolution(30)

NameError: name 'time' is not defined

In [None]:
# Persist the raw results of all runs to a pickle file in the working directory.
import pickle
with open('30_run_imdb50k_kbga_vp_ts.pkl', 'wb') as wf:
    pickle.dump(results, wf)