
# Imports


In [1]:

# for data manipulation
import pandas as pd
#pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
import numpy as np
import string
import re
#import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# for feature extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# for feature selection
from sklearn.feature_selection import mutual_info_classif
#!pip install deap
from deap import base
from deap import creator
from deap import tools

# for classification
from sklearn.linear_model import LogisticRegression
from sklearn import svm
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.tree import DecisionTreeClassifier

# for cross-validation
from sklearn.model_selection import KFold

# for accuracy calculation
from sklearn import metrics

# for measuring execution times
import time

import random



# Dataset


In [2]:

# Read Stanford Sentiment Treebank

path = 'stanfordSentimentTreebank/'

sentiment_labels = pd.read_csv(path + 'sentiment_labels.txt', sep = '|')
sentence_ids = pd.read_csv(path + 'datasetSentences.txt', sep = '\t')
dictionary = pd.read_csv(path + 'dictionary.txt', sep = '|', names = ['phrase', 'phrase ids'])
train_test_split = pd.read_csv(path + 'datasetSplit.txt')
sentence_phrase_merge = pd.merge(sentence_ids, dictionary, left_on = 'sentence', right_on = 'phrase')
sentence_phrase_split = pd.merge(sentence_phrase_merge, train_test_split, on = 'sentence_index')
df = pd.merge(sentence_phrase_split, sentiment_labels, on = 'phrase ids').sample(frac = 1)

df.head()


Unnamed: 0,sentence_index,sentence,phrase,phrase ids,splitset_label,sentiment values
1868,1947,Paul Bettany is cool .,Paul Bettany is cool .,225416,1,0.73611
136,138,"Run , do n't walk , to see this barbed and bracing comedy on the big screen .","Run , do n't walk , to see this barbed and bracing comedy on the big screen .",14404,2,0.90278
6240,6531,"Borstal Boy represents the worst kind of filmmaking , the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy .","Borstal Boy represents the worst kind of filmmaking , the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy .",104817,1,0.041667
4005,4188,"It is a strength of a documentary to disregard available bias , especially as temptingly easy as it would have been with this premise .","It is a strength of a documentary to disregard available bias , especially as temptingly easy as it would have been with this premise .",66732,1,0.66667
6954,7277,Thumbs down .,Thumbs down .,226746,1,0.097222


In [3]:

# Assign sentiment class according to scores
df['sentiment'] = 0
df.loc[df['sentiment values'] < 0.5, 'sentiment'] = -1 # negative
df.loc[df['sentiment values'] > 0.5, 'sentiment'] = 1 # positive

df.head()


Unnamed: 0,sentence_index,sentence,phrase,phrase ids,splitset_label,sentiment values,sentiment
1868,1947,Paul Bettany is cool .,Paul Bettany is cool .,225416,1,0.73611,1
136,138,"Run , do n't walk , to see this barbed and bracing comedy on the big screen .","Run , do n't walk , to see this barbed and bracing comedy on the big screen .",14404,2,0.90278,1
6240,6531,"Borstal Boy represents the worst kind of filmmaking , the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy .","Borstal Boy represents the worst kind of filmmaking , the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy .",104817,1,0.041667,-1
4005,4188,"It is a strength of a documentary to disregard available bias , especially as temptingly easy as it would have been with this premise .","It is a strength of a documentary to disregard available bias , especially as temptingly easy as it would have been with this premise .",66732,1,0.66667,1
6954,7277,Thumbs down .,Thumbs down .,226746,1,0.097222,-1


In [4]:

# Size of each sentiment class
df['sentiment'].value_counts()


 1    5642
-1    5385
 0     259
Name: sentiment, dtype: int64

In [5]:

# Remove neutral tagged sentences
mask = (df['sentiment'] == 1) | (df['sentiment'] == -1)
df = df[mask]

df.head()


Unnamed: 0,sentence_index,sentence,phrase,phrase ids,splitset_label,sentiment values,sentiment
1868,1947,Paul Bettany is cool .,Paul Bettany is cool .,225416,1,0.73611,1
136,138,"Run , do n't walk , to see this barbed and bracing comedy on the big screen .","Run , do n't walk , to see this barbed and bracing comedy on the big screen .",14404,2,0.90278,1
6240,6531,"Borstal Boy represents the worst kind of filmmaking , the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy .","Borstal Boy represents the worst kind of filmmaking , the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy .",104817,1,0.041667,-1
4005,4188,"It is a strength of a documentary to disregard available bias , especially as temptingly easy as it would have been with this premise .","It is a strength of a documentary to disregard available bias , especially as temptingly easy as it would have been with this premise .",66732,1,0.66667,1
6954,7277,Thumbs down .,Thumbs down .,226746,1,0.097222,-1


In [6]:

# Size of each sentiment class
df['sentiment'].value_counts()


 1    5642
-1    5385
Name: sentiment, dtype: int64

In [7]:

# Split the data into specified train, test and dev sets

train_set = df[df["splitset_label"] == 1]
train_set.head()


Unnamed: 0,sentence_index,sentence,phrase,phrase ids,splitset_label,sentiment values,sentiment
1868,1947,Paul Bettany is cool .,Paul Bettany is cool .,225416,1,0.73611,1
6240,6531,"Borstal Boy represents the worst kind of filmmaking , the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy .","Borstal Boy represents the worst kind of filmmaking , the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy .",104817,1,0.041667,-1
4005,4188,"It is a strength of a documentary to disregard available bias , especially as temptingly easy as it would have been with this premise .","It is a strength of a documentary to disregard available bias , especially as temptingly easy as it would have been with this premise .",66732,1,0.66667,1
6954,7277,Thumbs down .,Thumbs down .,226746,1,0.097222,-1
3985,4165,A wild comedy that could only spring from the demented mind of the writer of Being John Malkovich .,A wild comedy that could only spring from the demented mind of the writer of Being John Malkovich .,63568,1,0.73611,1


In [8]:

test_set = df[df["splitset_label"] == 2]
test_set.head()


Unnamed: 0,sentence_index,sentence,phrase,phrase ids,splitset_label,sentiment values,sentiment
136,138,"Run , do n't walk , to see this barbed and bracing comedy on the big screen .","Run , do n't walk , to see this barbed and bracing comedy on the big screen .",14404,2,0.90278,1
375,385,"Renner 's performance as Dahmer is unforgettable , deeply absorbing .","Renner 's performance as Dahmer is unforgettable , deeply absorbing .",19220,2,0.91667,1
8579,8979,Its underlying mythology is a hodgepodge of inconsistencies that pose the question : Since when did dumb entertainment have to be this dumb ?,Its underlying mythology is a hodgepodge of inconsistencies that pose the question : Since when did dumb entertainment have to be this dumb ?,224421,2,0.069444,-1
622,646,"Workmanlike , maybe , but still a film with all the elements that made the other three great , scary times at the movies .","Workmanlike , maybe , but still a film with all the elements that made the other three great , scary times at the movies .",27268,2,0.80556,1
263,267,"Windtalkers is shapelessly gratifying , the kind of movie that invites you to pick apart its faults even as you have to admit that somehow it hit you where you live .","Windtalkers is shapelessly gratifying , the kind of movie that invites you to pick apart its faults even as you have to admit that somehow it hit you where you live .",19437,2,0.69444,1


In [9]:

dev_set = df[df["splitset_label"] == 3]
dev_set.head()


Unnamed: 0,sentence_index,sentence,phrase,phrase ids,splitset_label,sentiment values,sentiment
7353,7706,... think of it as American Pie On Valium .,... think of it as American Pie On Valium .,142819,3,0.56944,1
1308,1362,It is great summer fun to watch Arnold and his buddy Gerald bounce off a quirky cast of characters .,It is great summer fun to watch Arnold and his buddy Gerald bounce off a quirky cast of characters .,45815,3,0.79167,1
1141,1184,"Not far beneath the surface , this reconfigured tale asks disturbing questions about those things we expect from military epics .","Not far beneath the surface , this reconfigured tale asks disturbing questions about those things we expect from military epics .",26225,3,0.61111,1
1574,1640,"It 's so good that its relentless , polished wit can withstand not only inept school productions , but even Oliver Parker 's movie adaptation .","It 's so good that its relentless , polished wit can withstand not only inept school productions , but even Oliver Parker 's movie adaptation .",224238,3,0.79167,1
7172,7507,"It 's a cookie-cutter movie , a cut-and-paste job .","It 's a cookie-cutter movie , a cut-and-paste job .",146401,3,0.33333,-1


In [10]:

train_X = train_set['sentence'].tolist()
train_y = train_set['sentiment'].tolist()

test_X = test_set['sentence'].tolist()
test_y = test_set['sentiment'].tolist()

print('Training Features Shape:', len(train_X), ',', len(train_X[0]))
print('Training Labels Shape:', len(train_y))
print('Testing Features Shape:', len(test_X), ',', len(test_X[0]))
print('Testing Labels Shape:', len(test_y))


Training Features Shape: 7920 , 22
Training Labels Shape: 7920
Testing Features Shape: 2082 , 77
Testing Labels Shape: 2082


#### Statistics before preprocessing

In [11]:
# statistics before preprocessing

# Keep sentences separately
sentences = pd.DataFrame(df['sentence'].copy())

sentences.head()


Unnamed: 0,sentence
1868,Paul Bettany is cool .
136,"Run , do n't walk , to see this barbed and bracing comedy on the big screen ."
6240,"Borstal Boy represents the worst kind of filmmaking , the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy ."
4005,"It is a strength of a documentary to disregard available bias , especially as temptingly easy as it would have been with this premise ."
6954,Thumbs down .


In [12]:
# statistics before preprocessing

# Extract features with Bag of Words
bp_count_vec = CountVectorizer(ngram_range = (1,1))

bp_count_data = bp_count_vec.fit_transform(sentences['sentence'])

bp_df_bow = pd.DataFrame(bp_count_data.toarray(), columns = bp_count_vec.get_feature_names())

bp_df_bow.head()


Unnamed: 0,00,000,007,10,100,101,102,103,105,10th,...,ziyi,zoe,zombie,zombies,zone,zoning,zoolander,zoom,zwick,zzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# statistics before preprocessing

print('Number of unique words in BoW representation: ', len(bp_df_bow.columns))


Number of unique words in BoW representation:  16987


In [14]:
# statistics before preprocessing

print('Simple statistics of the words in the sentences:')

bp_df_bow_word_counts = bp_df_bow.sum(axis = 1)

bp_df_bow_word_counts.describe()


Simple statistics of the words in the sentences:


count    11027.000000
mean        16.131133
std          8.168296
min          1.000000
25%         10.000000
50%         15.000000
75%         22.000000
max         50.000000
dtype: float64


# Preprocess


In [15]:

# Convert all words to lowercase
df['sentence'] = df['sentence'].str.lower()

df.head()


Unnamed: 0,sentence_index,sentence,phrase,phrase ids,splitset_label,sentiment values,sentiment
1868,1947,paul bettany is cool .,Paul Bettany is cool .,225416,1,0.73611,1
136,138,"run , do n't walk , to see this barbed and bracing comedy on the big screen .","Run , do n't walk , to see this barbed and bracing comedy on the big screen .",14404,2,0.90278,1
6240,6531,"borstal boy represents the worst kind of filmmaking , the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy .","Borstal Boy represents the worst kind of filmmaking , the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy .",104817,1,0.041667,-1
4005,4188,"it is a strength of a documentary to disregard available bias , especially as temptingly easy as it would have been with this premise .","It is a strength of a documentary to disregard available bias , especially as temptingly easy as it would have been with this premise .",66732,1,0.66667,1
6954,7277,thumbs down .,Thumbs down .,226746,1,0.097222,-1


In [16]:

# Remove punctuation
df['sentence'] = df['sentence'].str.translate(str.maketrans('', '', string.punctuation))

df.head()


Unnamed: 0,sentence_index,sentence,phrase,phrase ids,splitset_label,sentiment values,sentiment
1868,1947,paul bettany is cool,Paul Bettany is cool .,225416,1,0.73611,1
136,138,run do nt walk to see this barbed and bracing comedy on the big screen,"Run , do n't walk , to see this barbed and bracing comedy on the big screen .",14404,2,0.90278,1
6240,6531,borstal boy represents the worst kind of filmmaking the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy,"Borstal Boy represents the worst kind of filmmaking , the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy .",104817,1,0.041667,-1
4005,4188,it is a strength of a documentary to disregard available bias especially as temptingly easy as it would have been with this premise,"It is a strength of a documentary to disregard available bias , especially as temptingly easy as it would have been with this premise .",66732,1,0.66667,1
6954,7277,thumbs down,Thumbs down .,226746,1,0.097222,-1


In [17]:

# Tokenize the sentences
df['sentence'] = df['sentence'].apply(lambda x: word_tokenize(x))

df.head()


Unnamed: 0,sentence_index,sentence,phrase,phrase ids,splitset_label,sentiment values,sentiment
1868,1947,"[paul, bettany, is, cool]",Paul Bettany is cool .,225416,1,0.73611,1
136,138,"[run, do, nt, walk, to, see, this, barbed, and, bracing, comedy, on, the, big, screen]","Run , do n't walk , to see this barbed and bracing comedy on the big screen .",14404,2,0.90278,1
6240,6531,"[borstal, boy, represents, the, worst, kind, of, filmmaking, the, kind, that, pretends, to, be, passionate, and, truthful, but, is, really, frustratingly, timid, and, soggy]","Borstal Boy represents the worst kind of filmmaking , the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy .",104817,1,0.041667,-1
4005,4188,"[it, is, a, strength, of, a, documentary, to, disregard, available, bias, especially, as, temptingly, easy, as, it, would, have, been, with, this, premise]","It is a strength of a documentary to disregard available bias , especially as temptingly easy as it would have been with this premise .",66732,1,0.66667,1
6954,7277,"[thumbs, down]",Thumbs down .,226746,1,0.097222,-1


In [18]:

# Filter out stop words
stop_words = set(stopwords.words('english'))
df['sentence'] = df['sentence'].apply(lambda x: [w for w in x if not w in stop_words])

df.head()


Unnamed: 0,sentence_index,sentence,phrase,phrase ids,splitset_label,sentiment values,sentiment
1868,1947,"[paul, bettany, cool]",Paul Bettany is cool .,225416,1,0.73611,1
136,138,"[run, nt, walk, see, barbed, bracing, comedy, big, screen]","Run , do n't walk , to see this barbed and bracing comedy on the big screen .",14404,2,0.90278,1
6240,6531,"[borstal, boy, represents, worst, kind, filmmaking, kind, pretends, passionate, truthful, really, frustratingly, timid, soggy]","Borstal Boy represents the worst kind of filmmaking , the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy .",104817,1,0.041667,-1
4005,4188,"[strength, documentary, disregard, available, bias, especially, temptingly, easy, would, premise]","It is a strength of a documentary to disregard available bias , especially as temptingly easy as it would have been with this premise .",66732,1,0.66667,1
6954,7277,[thumbs],Thumbs down .,226746,1,0.097222,-1


In [19]:

# Concatenate tokens
df['sentence'] = df['sentence'].apply(lambda x: ' '.join(x))

df.head()


Unnamed: 0,sentence_index,sentence,phrase,phrase ids,splitset_label,sentiment values,sentiment
1868,1947,paul bettany cool,Paul Bettany is cool .,225416,1,0.73611,1
136,138,run nt walk see barbed bracing comedy big screen,"Run , do n't walk , to see this barbed and bracing comedy on the big screen .",14404,2,0.90278,1
6240,6531,borstal boy represents worst kind filmmaking kind pretends passionate truthful really frustratingly timid soggy,"Borstal Boy represents the worst kind of filmmaking , the kind that pretends to be passionate and truthful but is really frustratingly timid and soggy .",104817,1,0.041667,-1
4005,4188,strength documentary disregard available bias especially temptingly easy would premise,"It is a strength of a documentary to disregard available bias , especially as temptingly easy as it would have been with this premise .",66732,1,0.66667,1
6954,7277,thumbs,Thumbs down .,226746,1,0.097222,-1


In [20]:

# Keep sentences separately
sentences = pd.DataFrame(df['sentence'].copy())

sentences.head()


Unnamed: 0,sentence
1868,paul bettany cool
136,run nt walk see barbed bracing comedy big screen
6240,borstal boy represents worst kind filmmaking kind pretends passionate truthful really frustratingly timid soggy
4005,strength documentary disregard available bias especially temptingly easy would premise
6954,thumbs


In [21]:

len(sentences)


11027

In [22]:

# Keep sentiment classes separately
labels = pd.DataFrame(df['sentiment'].copy())

labels.head()


Unnamed: 0,sentiment
1868,1
136,1
6240,-1
4005,1
6954,-1


In [23]:

# Keep split set (train, test, dev) labels separately
split_labels = pd.DataFrame(df['splitset_label'].copy())

split_labels.head()


Unnamed: 0,splitset_label
1868,1
136,2
6240,1
4005,1
6954,1



# Feature Extraction


In [24]:

# Extract features with Bag of Words
count_vec = CountVectorizer(ngram_range = (1,1))

count_data = count_vec.fit_transform(sentences['sentence'])

df_bow = pd.DataFrame(count_data.toarray(), columns = count_vec.get_feature_names())

df_bow.head()


Unnamed: 0,007,10,100,10000,100minute,100year,101,102minute,103minute,105,...,zoe,zombie,zombieland,zombies,zone,zoning,zoolander,zoom,zwick,zzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Statistics after preprocessing

In [25]:

# statistics after preprocessing
print('Number of unique words in BoW representation: ', len(df_bow.columns))


Number of unique words in BoW representation:  18296


In [26]:

# statistics after preprocessing
print('Simple statistics of the words in the sentences:')

df_bow_word_counts = df_bow.sum(axis = 1)

df_bow_word_counts.describe()


Simple statistics of the words in the sentences:


count    11027.000000
mean         9.342795
std          4.668867
min          0.000000
25%          6.000000
50%          9.000000
75%         12.000000
max         28.000000
dtype: float64

In [27]:

print('Different tokens between before and after preprocessing:')

print(set(df_bow.columns)-set(bp_df_bow.columns))


Different tokens between before and after preprocessing:
{'illequipped', 'sitcomworthy', 'independentcommunity', 'soberminded', 'hardpressed', 'reallive', 'lifeatarm', 'ennuihobbled', 'romanticcomedy', 'hollywoodpredictable', 'nearmasterpiece', 'derringdo', 'over25s', 'wideranging', 'roundrobin', 'modernoffice', 'tearstained', 'daytimedrama', '15year', 'portentheavy', 'wellcontructed', 'somethingborrowed', 'oscarsize', 'prepschool', 'blooddrenched', 'lowlife', 'cutandpaste', 'wellwritten', '10course', 'selfloathing', 'eyeopening', 'majorityoriented', 'drawnout', 'englishlanguage', 'longrunning', 'timehonored', 'whitetrash', 'snazzylooking', 'postwar', 'throatsinging', 'africanamerican', 'notveryfunny', 'pussyass', 'starmaking', 'antisemitism', 'sobadit', 'periodperfect', '49yearold', 'gothvampire', ...}



#### Run the following two cells to extract features with GloVe instead of Bag-Of-Words

In [None]:

# Load GloVe embeddings into a dictionary

# To download GloVe: https://nlp.stanford.edu/projects/glove/

dimension = 50 # 100 200 300

glove_embeddings = {}

f = open('GloVe/glove.6B.' + str(dimension) + 'd.txt', 'r', encoding = 'utf8')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype = 'float32')
    glove_embeddings[word] = coefs
    
f.close()

print('Found %s word vectors in GloVe.' % len(glove_embeddings))


In [None]:

# third quartile of the word counts in the sentences
word_count = 12

# Extract features with GloVe

feature_count = dimension * word_count

df_glove = []

empty_count = 0

words_not_found = set()
unique_words = set()

for sentence in sentences['sentence']:
    tokens = sentence.lower().split()
    vecs = []
    
    found = False
    
    for index, word in enumerate(tokens):
        
        unique_words.add(word)
        
        if index == word_count:
            break
        
        try:
            # throws KeyError if word not found
            vec = glove_embeddings[word]
            found = True
            
        except KeyError:
            
            vec = [0 for i in range(dimension)]
            
            words_not_found.add(word)
            
        
        vecs.extend(vec)
    
    if len(tokens) < word_count:
        padding = [0 for i in range(len(tokens)*dimension, word_count*dimension)]
        vecs.extend(padding)
    
    if not found:
        empty_count += 1
    
    df_glove.append(vecs)

print(len(df_glove), len(df_glove[1]))

print("Number of sentences with no words found: %s / %s" % (empty_count, len(sentences)))
print("Number of words that are not found: %s / %s" % (len(words_not_found), len(unique_words)))



# Classification


In [28]:

# Apply classification using Logistic Regression

def logisticRegression(train_X, test_X, train_y, test_y):

    # Create a Logistic Regression classifier
    clf = LogisticRegression(random_state = 0, solver = 'lbfgs', multi_class = 'ovr', max_iter = 1000)

    # Train the model using the training set
    clf.fit(train_X, train_y)

    # Predict the classes for the test set
    pred_y = clf.predict(test_X)

    # Calculate accuracy of the model
    accuracy = metrics.accuracy_score(test_y, pred_y)

    return accuracy


In [29]:

# Apply classification using Support Vector Machines

def supportVectorMachines(train_X, test_X, train_y, test_y):

    # Create a Support Vector Machines classifier
    clf = svm.LinearSVC(random_state = 0, C = 0.1)

    # Train the model using the training set
    clf.fit(train_X, train_y)

    # Predict the classes for the test set
    pred_y = clf.predict(test_X)

    # Calculate accuracy of the model
    accuracy = metrics.accuracy_score(test_y, pred_y)

    return accuracy


# Experiments

In [30]:

# Prepare instances and classes for classification

classes = np.array(labels['sentiment'].tolist())

instances = np.array(df_bow.values.tolist())

# comment out the previous line and use the following line if feature extraction technique is GloVe
#instances = np.array(df_glove)

feature_names = [i for i in range(instances.shape[1])]

train_test_dev_indexes = np.array(split_labels['splitset_label'].tolist())

print(instances[0:5])
print(classes[0:5])
print(feature_names[0:5])
print(train_test_dev_indexes[0:5])


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[ 1  1 -1  1 -1]
[0, 1, 2, 3, 4]
[1 2 1 1 1]


In [31]:

print('Number of features: ', instances.shape[1])


Number of features:  18296


In [32]:

# Separate the train and test data

train_X = []
train_y = []

test_X = []
test_y = []

for tt, ins, cla in zip(train_test_dev_indexes, instances, classes):
    if tt == 1:
        train_X.append(ins)
        train_y.append(cla)
    elif tt == 2:
        test_X.append(ins)
        test_y.append(cla)

train_X = np.array(train_X)
train_y = np.array(train_y)
test_X = np.array(test_X)
test_y = np.array(test_y)

print('Training Features Shape:', len(train_X), ',', len(train_X[0]))
print('Training Labels Shape:', len(train_y))
print('Testing Features Shape:', len(test_X), ',', len(test_X[0]))
print('Testing Labels Shape:', len(test_y))


Training Features Shape: 7920 , 18296
Training Labels Shape: 7920
Testing Features Shape: 2082 , 18296
Testing Labels Shape: 2082



#### Baseline results


In [33]:

# Measure execution time
start_time = time.time()

# Apply Logistic Regression
accuracy = logisticRegression(train_X, test_X, train_y, test_y)

end_time = time.time()

print('Accuracy of Logistic Regression: ', round(accuracy, 4))
print('Execution time of Logistic Regression: %s seconds' % round(end_time - start_time, 1))


Accuracy of Logistic Regression:  0.7608
Execution time of Logistic Regression: 7.9 seconds


In [34]:

# Measure execution time
start_time = time.time()

# Apply Support Vector Machines
accuracy = supportVectorMachines(train_X, test_X, train_y, test_y)

end_time = time.time()

print('Accuracy of Support Vector Machines: ', round(accuracy, 4))
print('Execution time of Support Vector Machines: %s seconds' % round(end_time - start_time, 1))


Accuracy of Support Vector Machines:  0.7546
Execution time of Support Vector Machines: 1.1 seconds



# Feature Selection



### Information Gain


In [35]:

ig_result = dict(zip(feature_names, mutual_info_classif(instances, classes, discrete_features=True)))

ig_result_sorted = dict(sorted(ig_result.items(), reverse = True, key = lambda item: item[1]))

print(ig_result_sorted)


{1322: 0.005982830031997247, 10903: 0.004268581404249272, 1620: 0.0026826504802843518, 1961: 0.0024620563137700722, 11647: 0.0024010776550344093, 4892: 0.002227717367015601, 7383: 0.0022017218387869898, 14894: 0.002140861579154404, 10447: 0.0020933465800208467, 11646: 0.0020322910122616026, 9311: 0.0020210884385322373, 5275: 0.0018734985813703073, 10040: 0.0018644678924421812, 6527: 0.0018401284453354967, 12133: 0.0018148203344258802, 6129: 0.0017608958917198839, 9516: 0.0017154961781554526, 1499: 0.0017031035270066415, 17102: 0.0016928592596844507, 10431: 0.0016660410791939848, 5853: 0.0016588806459843812, 12264: 0.0016259637890139577, 6025: 0.0016121358755491182, 5931: 0.0016111362772898586, 12452: 0.0015826731721093595, 18122: 0.0015788180118037252, 18127: 0.0015730876326616491, 8986: 0.0015068725569925634, 1500: 0.0015051526447373716, 5311: 0.0014955848246479848, 18058: 0.0014907655168604973, 16513: 0.0014802731543871748, 2638: 0.0014772756439185913, 5801: 0.0014609948166707729, 12

In [36]:

# Find quartiles in terms of informatin gain results

quartile = np.percentile(list(ig_result.values()), [25, 50, 75]) # Q1, Q2, Q3

print('Quartiles in terms of information gain results', quartile)


Quartiles in terms of information gain results [6.07736900e-05 6.50019962e-05 8.12122564e-05]


In [37]:

# Find the features whose informatin gain values are lower than the specified quartile value

selected_features = []

for i, v in enumerate(ig_result.values()):
    if v >= quartile[1]:
        selected_features.append(i)


In [38]:

# Remove the features whose informatin gain values are lower than the specified quartile value

instances = instances[:, selected_features]
train_X = train_X[:, selected_features]
test_X = test_X[:, selected_features]


In [39]:

print('Training Features Shape:', len(train_X), ',', len(train_X[0]))
print('Training Labels Shape:', len(train_y))
print('Testing Features Shape:', len(test_X), ',', len(test_X[0]))
print('Testing Labels Shape:', len(test_y))


Training Features Shape: 7920 , 9434
Training Labels Shape: 7920
Testing Features Shape: 2082 , 9434
Testing Labels Shape: 2082



#### Classification results after feature selection with Information Gain Filtering


In [40]:

# Measure execution time
start_time = time.time()

# Apply Logistic Regression
accuracy = logisticRegression(train_X, test_X, train_y, test_y)

end_time = time.time()

print('Accuracy of Logistic Regression after information gain: ', round(accuracy, 4))
print('Execution time of Logistic Regression after information gain: %s seconds' % round(end_time - start_time, 1))


Accuracy of Logistic Regression after information gain:  0.8031
Execution time of Logistic Regression after information gain: 5.5 seconds


In [41]:

# Measure execution time
start_time = time.time()

# Apply Support Vector Machines
accuracy = supportVectorMachines(train_X, test_X, train_y, test_y)

end_time = time.time()

print('Accuracy of Support Vector Machines after information gain: ', round(accuracy, 4))
print('Execution time of Support Vector Machines after information gain: %s seconds' % round(end_time - start_time, 1))


Accuracy of Support Vector Machines after information gain:  0.8031
Execution time of Support Vector Machines after information gain: 0.9 seconds



### NSGA-II


In [42]:

# Filter out the unselected features from data according to the chromosome (individual)

def createDatasetFromIndividual(individual):
    
    selected_features = [i for i, j in zip(range(len(individual)), individual) if j == 1]
    
    temp_train_X = train_X[:, selected_features]
    temp_test_X = test_X[:, selected_features]
    temp_train_y = train_y[:]
    temp_test_y = test_y[:]
    
    return temp_train_X, temp_test_X, temp_train_y, temp_test_y


In [43]:

# Generate a chromosome: 1 indicates a selected feature

def generateIndividual():
    ind = [random.randint(0, 1) for i in range(instances.shape[1])]
    
    if sum(ind) == 0:
        return generateIndividual()
    
    return np.array(ind, dtype = int)


In [44]:

# Half uniform crossover

def HUXcrossover(ind1, ind2):
    
    for i in range(len(ind1)):
        if ind1[i] != ind2[i]:
            ind1[i] = random.randint(0, 1)
            ind2[i] = random.randint(0, 1)

    return ind1, ind2


In [45]:

# Evaluate an individual with the specified classifier
# 1. prepare dataset according to the selected features
# 2. calculate the objective values of the chromosome (number of features and accuracy)

def evaluate(individual, classifier):
    
    global all_evaluations
    
    individual_str = "".join([str(i) for i in individual])
    
    if individual_str in all_evaluations:
        number_of_features, accuracy = all_evaluations[individual_str][0], all_evaluations[individual_str][1]
        return number_of_features, accuracy
    
    number_of_features = sum(individual)
    
    if number_of_features == 0:
        return 100000, -100
    
    cur_train_X, cur_test_X, cur_train_y, cur_test_y = createDatasetFromIndividual(individual)
    
    accuracy = 0
    
    if classifier == 'LR':
        accuracy = logisticRegression(cur_train_X, cur_test_X, cur_train_y, cur_test_y)
    elif classifier == 'SVM':
        accuracy = supportVectorMachines(cur_train_X, cur_test_X, cur_train_y, cur_test_y)
    
    all_evaluations[individual_str] = (number_of_features, accuracy)
    
    #print(number_of_features, accuracy)
    
    return number_of_features, accuracy


In [46]:

# Non-dominated Sorting Genetic Algorithm

def NSGA(classifier, number_of_generations, write_path):
    
    global pop
    
    CXPB, MUTPB, NGEN = 1.0, 1.0, number_of_generations+1
    
    #print(pop)
    
    # Evaluate the entire population
    fitnesses = map(toolbox.evaluate, pop, [classifier for _ in range(len(pop))])
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit
        #print(ind, fit)
    
    print('Initial population is ready.')
    
    for g in range(NGEN):
        
        print(g, end = ", ")

        # Non-dominated sort the population
        offspring = toolbox.select(pop, len(pop))
        
        if g % 10 == 0:
            writer = open(write_path + 'SST_' + str(POP_SIZE) + "p_" + str(g) + "g_" + classifier + '.txt', 'w')

            writer.write("unique # of evaluations = " + str(len(all_evaluations)) + "\n\n")
            writer.write("Non-dominated solutions:\n")
            for ind in offspring:
                writer.write("".join([str(i) for i in ind]) + "\t" + "%d" % ind.fitness.values[0] + "\t" + "%.3f" % ind.fitness.values[1] + "\n")
            writer.close()
            
            if g == NGEN-1:
                break
            
        # Clone the selected individuals
        offspring = list(map(toolbox.clone, offspring))
        
        # Apply crossover and mutation on the offspring
        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            if random.random() < CXPB:
                toolbox.mate(child1, child2)
                del child1.fitness.values
                del child2.fitness.values

        for mutant in offspring:
            if random.random() < MUTPB:
                toolbox.mutate(mutant)
                del mutant.fitness.values

        # Evaluate the generated individuals
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = map(toolbox.evaluate, invalid_ind, [classifier for _ in range(len(invalid_ind))])
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit

        # Add the new individuals to the population, and
        # select the better half (elitism)
        pop.extend(offspring)
        pop = toolbox.select(pop, POP_SIZE)
    
    nondominated_solutions = dict()
    
    # find non-dominated individuals
    for ind in pop:
        
        isDominated = False
        nof = ind.fitness.values[0]
        acc = ind.fitness.values[1]
        
        for ind2 in pop:
            nof2 = ind2.fitness.values[0]
            acc2 = ind2.fitness.values[1]
            
            if (nof2 <= nof and acc2 > acc) or (nof2 < nof and acc2 >= acc):
                isDominated = True
                break
            
        if not isDominated:
            nondominated_solutions[nof] = acc
    
    return nondominated_solutions


In [47]:

# store all candidate solutions in a hash map as 'chromosome: (number_of_features, accuracy)'
all_evaluations = dict()

POP_SIZE = 100
GEN_SIZE = 200

# NSGA initial setup
creator.create("FitnessMulti", base.Fitness, weights = (-1.0, 1.0))
creator.create("Individual", list, fitness=creator.FitnessMulti)

toolbox = base.Toolbox()
toolbox.register("individual", tools.initIterate, creator.Individual, generateIndividual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("mate", HUXcrossover) 
toolbox.register("mutate", tools.mutFlipBit, indpb = 0.02)
toolbox.register("select", tools.selNSGA2)
toolbox.register("evaluate", evaluate)

# generate the initial population
pop = toolbox.population(n = POP_SIZE)

write_path = 'C:/Users/Ayca/Desktop/results/SST/'

classifiers = ['LR', 'SVM']
for cf in classifiers:

    start_time = time.time()
    solutions = NSGA(cf, GEN_SIZE, write_path)
    end_time = time.time()
    
    writer = open(write_path + 'SST_time_' + str(POP_SIZE) + "p_" + str(GEN_SIZE) + "g_" + cf + '.txt', 'w')
    writer.write("Execution time:\n")
    writer.write("%.3f" % (end_time - start_time) + "\n\n")
    writer.write("Non-dominated solutions:\n")
    for sol_k, sol_v in solutions.items():
        writer.write("%d" % sol_k + "\t" + "%.3f" % sol_v + "\n")
    writer.close()
    
    print('Execution time of %s: %s seconds' % (cf, round(end_time - start_time, 1)))
    print('Non-dominated solutions:')
    print(solutions)
    print('------------------------')
    
    all_evaluations = dict()


Initial population is ready.
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200
Execution time of LR: 30564.8 seconds
Non-dominated solutions:
{3967.0: 0.843