In [1]:
import os
import numpy as np
import pandas as pd
import csv
from csv import reader
from csv import writer
import custom_models as cm
from sklearn import tree
from scipy.stats import uniform, norm
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.feature_selection import SelectFromModel, SelectKBest, VarianceThreshold, chi2, f_classif, mutual_info_classif
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from XLB import *
import xlb_hyperparamsearch as xlbh
from apyori import apriori
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the (features, labels) pairs for the training and validation splits.
x_train, y_train = extract_data("FinalTrainingSet.csv")
x_val, y_val = extract_data("Validation Set.csv")

# Min-max scale both splits using statistics fitted on the training split only.
scaler = MinMaxScaler().fit(x_train)
x_train, x_val = scaler.transform(x_train), scaler.transform(x_val)

# Two-stage feature selection, fitted on the training split and then replayed
# on the validation split: a VarianceThreshold (default settings) followed by
# the `num_features` best columns according to the chi-squared score.
num_features = 8
feat_sel = VarianceThreshold()
feat_sel_2 = SelectKBest(chi2, k=num_features)
x_train = feat_sel_2.fit_transform(feat_sel.fit_transform(x_train), y_train)
x_val = feat_sel_2.transform(feat_sel.transform(x_val))

rand_seed = 3454132

# Two resampled copies of the training split, both seeded with `rand_seed`.
# `oversampler` is rebound, so after this cell it refers to the RandomOverSampler.
oversampler = SMOTE(random_state=rand_seed, sampling_strategy="not majority")
x_smote, y_smote = oversampler.fit_resample(x_train, y_train)

oversampler = RandomOverSampler(random_state=rand_seed, sampling_strategy="not majority")
x_os, y_os = oversampler.fit_resample(x_train, y_train)

In [3]:
# Build `features`: the header names of the columns that survived feature
# selection, followed by the 'Theme_numbered' label column name.

# Read the header row and every data row. The context manager closes the file
# even if parsing raises, and the csv reader object gets its own name so the
# `reader` function imported from `csv` in the first cell is not shadowed.
with open("FinalTrainingSet.csv") as f:
    csv_rows = csv.reader(f)
    features = next(csv_rows)
    row = list(csv_rows)
row_count = len(row)

csv_temp = pd.read_csv("FinalTrainingSet.csv")
Theme_numbered = np.asarray(csv_temp['Theme(Numbered)'].tolist())

# Drop the headers that are not model inputs so only feature names remain.
# NOTE(review): assumes the remaining headers line up one-to-one with the
# columns returned by extract_data -- TODO confirm.
for non_feature in ("Row Labels", "Theme", "Theme(Numbered)"):
    features.remove(non_feature)

# Positions kept by SelectKBest. These are relative to the OUTPUT of the
# VarianceThreshold step, because that is the matrix SelectKBest was fitted on.
selected_feats = feat_sel_2.get_support(True)

# Map those positions back to positions in the original column order before
# indexing the header list; otherwise the names are wrong whenever the
# variance filter removed a column. When it removed nothing this is a no-op.
selected_cols = feat_sel.get_support(indices=True)[selected_feats]
kept_positions = set(selected_cols.tolist())
features = [name for pos, name in enumerate(features) if pos in kept_positions]

features.append('Theme_numbered')

# Attach the numbered theme as an extra 'Theme_numbered' column, joined on the row index.
new_column = pd.DataFrame({'Theme_numbered': Theme_numbered})
csv_temp = csv_temp.merge(new_column, left_index=True, right_index=True)

column = csv_temp.Theme_numbered
 
# print(x_train.shape)

In [None]:
# For every emotion theme: run a one-vs-rest hyperparameter search for the
# rule-based model on the SMOTE-resampled training data, write the resulting
# ruleset to "rules<Emotion>.txt", and print metrics on the validation split.
emotions = ["IsCalm", "IsCheerful", "IsBravery", "IsFearful", "IsLove", "IsSadness"]
# Support value for each theme, in the same order as `emotions`.
label_supp = [0.3117,  0.1372, 0.1397,  0.2469, 0.0673, 0.0973]

# Loop-invariant settings, hoisted out of the loop.
verbose = False  # NOTE(review): not used below; the search call hardcodes verbose=True
num_folds = 5

# Theme ids in the label arrays are 1-based, matching the order of `emotions`.
for theme, (emotion, supp) in enumerate(zip(emotions, label_supp), start=1):
    model = cm.APyoriAdapter(params={})
    targ_supp = [1 - supp, supp]
    parameters = {
        "num_features" : num_features,
        "thresh_mean" : 0.5,
        "thresh_std" : 0.3,
        "min_support_lo" : 0.005,
        "min_support_hi" : 0.01,
        # Previously tried: min_confidence_lo = 0.036, min_confidence_hi = 0.539
        "min_confidence_lo" : 0.00,
        "min_confidence_hi" : 0.00,
        # The last entry of `features` is the label column name, not an input.
        "col_names" : features[:-1],
        "label_names" : ["IsNot{}".format(emotion[2:]), emotion],
        "label_support" : targ_supp,
        "min_rules_lo" : 4,
        "min_rules_hi" : 8
    }

    # Binary one-vs-rest target, flattened to 1-D. The length is taken from
    # the data rather than hard-coded, so a different resampled size still works.
    y_targ = np.asarray(y_smote == theme).reshape(-1)
    hyperparams, result, model = xlbh.hyperparameter_search(
        num_folds=num_folds,model=model,parameters=parameters,X=x_smote,
        y=y_targ,verbose=True, num_iter=200,interval=10,random_state=69420
    )

    # Only the ruleset dump needs the file handle, so it is closed before evaluation.
    ruleset = model.ruleset
    with open("rules{}.txt".format(emotion),"w") as fOut:
        fOut.write("{}\n".format("\n\n".join(str(rule) for rule in ruleset)))

    print("Average Interestingness: {:.2f}".format(model.evaluate()))
    probas = model.predict_proba(x_val)
    labels = model.predict(x_val)
    y_val_targ = (y_val == theme)
    acc = model.score(x_val,y_val_targ)
    print("Stats for {}".format(emotion))
    print("Accuracy: {:.2f}".format(acc))
    # f1_score expects (y_true, y_pred): ground truth first, predictions second.
    print("F1-score: {}".format(f1_score(
        list(map(int,y_val_targ)),labels
    )))

Result = 0.9011176446401608
Result = 0.9024301316253851
Result = 0.9024301316253851
Result = 0.9031503239382133
Result = 0.9015840283030736
Result = 0.9035728240054514
Result = 0.9024301316253851
Result = 0.9020076315581466
Result = 0.9035728240054514
Iteration 10 / 200
Best Result: 0.90
Result = 0.9020076315581466


In [None]:
# Print each entry of `hyperparams` on its own line as "name -> value".
for param_name, param_value in hyperparams.items():
    print("{0} -> {1}".format(param_name, param_value))