In [105]:
import pickle
import numpy as np
import time
from tabulate import tabulate
from KB_evolution_utils import logmodel, X_train, Y_train, X_test, Y_test, accuracy_score, all_models_score_table

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Total Vocab Count: 1566


In [2]:
def predicted_sentiment_ratio(y_test, predictions):
    positive_percent = np.count_nonzero(predictions==1)*100//len(predictions)
    negative_percent = np.count_nonzero(predictions==0)*100//len(predictions)
    # print(f"Y_test: pos/neg percentage ~ {np.count_nonzero(y_test==1)*100//len(y_test)}, {np.count_nonzero(y_test==0)*100//len(y_test)}")
    return positive_percent, negative_percent

In [109]:
def test_accuracy(selected_genes):
    st = time.time()
    logmodel.fit(X_train[:, selected_genes], Y_train)
    et = time.time()

    predictions = logmodel.predict(X_test[:, selected_genes])
    ratio = predicted_sentiment_ratio(Y_test, predictions)
    return accuracy_score(Y_test, predictions), et-st, ratio

In [140]:
def pick_top_n_genes_n_run(chromo_set, n=1):
    common_index = dict()
    chromo_len_in_each_gen = []
    for chromo in chromo_set:
        if n==1:
            chromo_len_in_each_gen.append(np.count_nonzero(chromo == 1))
            for i in range(len(chromo)):
                if chromo[i] == 1:
                    common_index[i] = common_index.setdefault(i, 0) + 1
        else:
            best_gen = np.array(chromo[0][0])
            chromo_len_in_each_gen.append(np.count_nonzero(best_gen != 0))
            for i in range(best_gen.shape[0]):
                if best_gen[i] != 0:
                    common_index[i] = common_index.setdefault(i, 0) + 1

    sorted_common_index = np.array(sorted(
        common_index.items(), key=lambda x: x[1], reverse=True))
    return sorted_common_index, np.array(chromo_len_in_each_gen)

# Collect Common indexes from res of n run evolution


# Final Results

In [156]:
# def tabulate_results(chromo_set, n):
#     genes = [30, 50, 60, 100, 150, 200, 250, -1]
#     accuracy = []
#     time_to_train = []
#     sentiment_ratio = []
#     for i in genes:
#         top_i_common_indexes, chromo_len = pick_top_n_genes_n_run(i, chromo_set, n)
#         top_n_genes = top_i_common_indexes[:, 0]
#         common_index_count = top_i_common_indexes[:, 1]
#         acc, ttt, sr = test_accuracy(top_n_genes)
#         accuracy.append(acc)
#         time_to_train.append(ttt)
#         sentiment_ratio.append(sr)

    
#     table_data = {'Genes': genes,
#                 'Accuracy': accuracy,
#                 'Time Taken to Train': time_to_train,
#                 'Sentiment Ratio(p/n)': sentiment_ratio
#                 }
#     print(top_n_genes)
#     print('Actual chromosome length in X_Test: ', X_test.shape[1])
#     print('RandomForest with full length -', all_models_score_table.iloc[0, 1:])
#     print('Total Common genes count ~(-1): ', top_i_common_indexes)
#     print(chromo_len)
#     print(tabulate(table_data, headers='keys', tablefmt="simple_grid"))
#     return table_data, top_n_genes, top_i_common_indexes


In [161]:
def tabulate_results(chromo_set, n):
    # genes = [30, 50, 60, 100, 150, 200, 250, -1]
    accuracy = []
    time_to_train = []
    sentiment_ratio = []
    top_n_genes = []

    top_i_common_indexes, chromo_len = pick_top_n_genes_n_run(chromo_set, n)
    common_indexes_count = top_i_common_indexes[:, 1]
    count_thresholds = set(common_indexes_count)

    for i in count_thresholds:
        for index, count in top_i_common_indexes:
            if count >= i:
                top_n_genes.append(index)

        acc, ttt, sr = test_accuracy(top_n_genes)
        accuracy.append(acc)
        time_to_train.append(ttt)
        sentiment_ratio.append(sr)

    
    table_data = {'Count Threshold': count_thresholds,
                'Accuracy': accuracy,
                'Time Taken to Train': time_to_train,
                'Sentiment Ratio(p/n)': sentiment_ratio
                }
    print('Actual chromosome length in X_Test: ', X_test.shape[1])
    print('RandomForest with full length -', all_models_score_table.iloc[0, 1:])
    print('Total Common genes count ~(-1): ', top_i_common_indexes)
    print(chromo_len)
    print(tabulate(table_data, headers='keys', tablefmt="simple_grid"))
    return table_data, top_n_genes, top_i_common_indexes


In [162]:
with open('pickles/kbga/amazon/n_run_az_kb_co.pkl', 'rb') as rf:
    n_kb_co = pickle.load(rf)
    n_run_kb_co_table = tabulate_results(n_kb_co, 30)
    n_run_kb_co_table

Actual chromosome length in X_Test:  1566
RandomForest with full length - Accuracy             0.796
Exec_Time_secs    1.323454
Name: 0, dtype: object
Total Common genes count ~(-1):  [[1064    8]
 [ 407    7]
 [1141    7]
 ...
 [ 900    1]
 [1163    1]
 [1208    1]]
[100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100
 100 100 100 100 100 100 100 100 100 226 100 100]
┌───────────────────┬────────────┬───────────────────────┬────────────────────────┐
│   Count Threshold │   Accuracy │   Time Taken to Train │ Sentiment Ratio(p/n)   │
├───────────────────┼────────────┼───────────────────────┼────────────────────────┤
│                 1 │      0.812 │               1.26099 │ (47, 52)               │
├───────────────────┼────────────┼───────────────────────┼────────────────────────┤
│                 2 │      0.804 │               1.71281 │ (48, 52)               │
├───────────────────┼────────────┼───────────────────────┼────────────────────────┤
│                 3 │

In [155]:
with open('pickles/kbga/amazon/n_run_az_kbga.pkl', 'rb') as rf:
    n_kbga = pickle.load(rf)
    n_run_kbga_table = tabulate_results(n_kbga, 30)
    n_run_kbga_table

Actual chromosome length in X_Test:  1566
RandomForest with full length - Accuracy             0.796
Exec_Time_secs    1.323454
Name: 0, dtype: object
Total Common genes count ~(-1):  [[ 58   9]
 [116   9]
 [868   9]
 ...
 [239   1]
 [263   1]
 [890   1]]
[100 100 100 100 327 100 100 100 100 100 100 100 100 100 100 100 100 100
 100 100 100 100 100 100 100 190 100 100 100 100]
┌───────────────────┬────────────┬───────────────────────┬────────────────────────┐
│   Count Threshold │   Accuracy │   Time Taken to Train │ Sentiment Ratio(p/n)   │
├───────────────────┼────────────┼───────────────────────┼────────────────────────┤
│                 1 │      0.676 │               1.00728 │ (54, 45)               │
├───────────────────┼────────────┼───────────────────────┼────────────────────────┤
│                 2 │      0.664 │               1.53589 │ (54, 46)               │
├───────────────────┼────────────┼───────────────────────┼────────────────────────┤
│                 3 │      0.676 