In [1]:
import pandas as pd
import numpy as np
import scipy.stats as spstats
import random
from scipy import interpolate

# for data visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.graph_objs as go
  
import pydotplus
import graphviz
import pydotplus
#%matplotlib inline

from sklearn import datasets
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier 
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from deap import creator, base, tools, algorithms
from sklearn.tree import export_graphviz
#from sklearn.externals.six import StringIO 
from six import StringIO 
from IPython.display import Image 

from sklearn.datasets import make_classification
from sklearn import linear_model
from feature_selection_ga import FeatureSelectionGA, FitnessFunction
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold


import sys
from scoop import futures

from deap import base, creator, tools

In [2]:
class FeatureSelectionGA:
    """
    FeaturesSelectionGA
    This class uses Genetic Algorithm to find out the best features for an input model
    using Distributed Evolutionary Algorithms in Python(DEAP) package. Default toolbox is
    used for GA but it can be changed accordingly.
    """

    def __init__(self, model, x, y, verbose=0, ff_obj=None):
        """
        Parameters
        -----------
        model : scikit-learn supported model,
            x :  {array-like}, shape = [n_samples, n_features]
                 Training vectors, where n_samples is the number of samples
                 and n_features is the number of features.
            y  : {array-like}, shape = [n_samples]
                 Target Values
        cv_split: int
                 Number of splits for cross_validation to calculate fitness.
        verbose: 0 or 1
        """
        self.model = model
        self.n_features = x.shape[1]
        self.toolbox = None
        self.creator = self._create()
        # self.cv_split = cv_split
        self.x = x
        self.y = y
        self.verbose = verbose
        if self.verbose == 1:
            print(
                "Model {} will select best features among {} features.".format(
                    model, x.shape[1]
                )
            )
            print("Shape od train_x: {} and target: {}".format(x.shape, y.shape))
        self.final_fitness = []
        self.fitness_in_generation = {}
        self.best_ind = None
        if ff_obj == None:
            self.fitness_function = FitnessFunction(n_splits=5)
        else:
            self.fitness_function = ff_obj

    def evaluate(self, individual):
        fit_obj = self.fitness_function
        np_ind = np.asarray(individual)
        if np.sum(np_ind) == 0:
            fitness = 0.0
        else:
            feature_idx = np.where(np_ind == 1)[0]
            fitness = fit_obj.calculate_fitness(
                self.model, self.x[:, feature_idx], self.y
            )

        if self.verbose == 1:
            print("Individual: {}  Fitness_score: {} ".format(individual, fitness))

        return (fitness,)

    def _create(self):
        creator.create("FeatureSelect", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FeatureSelect)
        return creator

    def create_toolbox(self):
        """
        Custom creation of toolbox.
        Parameters
        -----------
            self
        Returns
        --------
            Initialized toolbox
        """

        self._init_toolbox()
        return toolbox

    def register_toolbox(self, toolbox):
        """
        Register custom created toolbox. Evalute function will be registerd
        in this method.
        Parameters
        -----------
            Registered toolbox with crossover,mutate,select tools except evaluate
        Returns
        --------
            self
        """
        toolbox.register("evaluate", self.evaluate)
        self.toolbox = toolbox

    def _init_toolbox(self):
        toolbox = base.Toolbox()
        toolbox.register("attr_bool", random.randint, 0, 1)
        # Structure initializers
        toolbox.register(
            "individual",
            tools.initRepeat,
            creator.Individual,
            toolbox.attr_bool,
            self.n_features,
        )
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        return toolbox

    def _default_toolbox(self):
        toolbox = self._init_toolbox()
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.1)
        toolbox.register("select", tools.selTournament, tournsize=3)
        toolbox.register("evaluate", self.evaluate)
        return toolbox

    def get_final_scores(self, pop, fits):
        self.final_fitness = list(zip(pop, fits))

    def generate(self, n_pop, cxpb=0.5, mutxpb=0.2, ngen=5, set_toolbox=False):

        """
        Generate evolved population
        Parameters
        -----------
            n_pop : {int}
                    population size
            cxpb  : {float}
                    crossover probablity
            mutxpb: {float}
                    mutation probablity
            n_gen : {int}
                    number of generations
            set_toolbox : {boolean}
                          If True then you have to create custom toolbox before calling
                          method. If False use default toolbox.
        Returns
        --------
            Fittest population
        """

        if self.verbose == 1:
            print(
                "Population: {}, crossover_probablity: {}, mutation_probablity: {}, total generations: {}".format(
                    n_pop, cxpb, mutxpb, ngen
                )
            )

        if not set_toolbox:
            self.toolbox = self._default_toolbox()
        else:
            raise Exception(
                "Please create a toolbox.Use create_toolbox to create and register_toolbox to register. Else set set_toolbox = False to use defualt toolbox"
            )
        pop = self.toolbox.population(n_pop)
        CXPB, MUTPB, NGEN = cxpb, mutxpb, ngen

        # Evaluate the entire population
        print("EVOLVING.......")
        fitnesses = list(map(self.toolbox.evaluate, pop))

        for ind, fit in zip(pop, fitnesses):
            ind.fitness.values = fit

        for g in range(NGEN):
            print("-- GENERATION {} --".format(g + 1))
            offspring = self.toolbox.select(pop, len(pop))
            self.fitness_in_generation[str(g + 1)] = max(
                [ind.fitness.values[0] for ind in pop]
            )
            # Clone the selected individuals
            offspring = list(map(self.toolbox.clone, offspring))

            # Apply crossover and mutation on the offspring
            for child1, child2 in zip(offspring[::2], offspring[1::2]):
                if random.random() < CXPB:
                    self.toolbox.mate(child1, child2)
                    del child1.fitness.values
                    del child2.fitness.values

            for mutant in offspring:
                if random.random() < MUTPB:
                    self.toolbox.mutate(mutant)
                    del mutant.fitness.values

            # Evaluate the individuals with an invalid fitness
            weak_ind = [ind for ind in offspring if not ind.fitness.valid]
            fitnesses = list(map(self.toolbox.evaluate, weak_ind))
            for ind, fit in zip(weak_ind, fitnesses):
                ind.fitness.values = fit
            print("Evaluated %i individuals" % len(weak_ind))

            # The population is entirely replaced by the offspring
            pop[:] = offspring

            # Gather all the fitnesses in one list and print the stats
        fits = [ind.fitness.values[0] for ind in pop]

        length = len(pop)
        mean = sum(fits) / length
        sum2 = sum(x * x for x in fits)
        std = abs(sum2 / length - mean ** 2) ** 0.5
        if self.verbose == 1:
            print("  Min %s" % min(fits))
            print("  Max %s" % max(fits))
            print("  Avg %s" % mean)
            print("  Std %s" % std)

        print("-- Only the fittest survives --")

        self.best_ind = tools.selBest(pop, 1)[0]
        print(
            "Best individual is %s, %s" % (self.best_ind, self.best_ind.fitness.values)
        )
       
        #print("The set features : %s"%bestFeatures)
        self.get_final_scores(pop, fits)

        return self.best_ind,pop


In [3]:
class FitnessFunction:
    def __init__(self, n_splits=5, *args, **kwargs):
        """
        Parameters
        -----------
        n_splits :int,
            Number of splits for cv
        verbose: 0 or 1
        """
        self.n_splits = n_splits

    def calculate_fitness(self, model, x, y):
        cv_set = np.repeat(-1.0, x.shape[0])
        skf = StratifiedKFold(n_splits=self.n_splits)
        for train_index, test_index in skf.split(x, y):
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]
            if x_train.shape[0] != y_train.shape[0]:
                raise Exception()
            model.fit(x_train, y_train)
            predicted_y = model.predict(x_test)
            cv_set[test_index] = predicted_y
        # return f1_score(y,cv_set)
        return accuracy_score(y, cv_set)
#defining various steps required for the genetic algorithm
def initilization_of_population(size,n_feat):
    population = []
    for i in range(size):
        chromosome = np.ones(n_feat,dtype=bool)
        chromosome[:int(0.3*n_feat)]=False
        np.random.shuffle(chromosome)
        population.append(chromosome)
    return population

def fitness_score(population):
    scores = []
    for chromosome in population:
        logmodel.fit(X_train.iloc[:,chromosome],y_train)
        predictions = logmodel.predict(X_test.iloc[:,chromosome])
        scores.append(accuracy_score(y_test,predictions))
    scores, population = np.array(scores), np.array(population) 
    inds = np.argsort(scores)
    return list(scores[inds][::-1]), list(population[inds,:][::-1])

def selection(pop_after_fit,n_parents):
    population_nextgen = []
    for i in range(n_parents):
        population_nextgen.append(pop_after_fit[i])
    return population_nextgen

def crossover(pop_after_sel):
    population_nextgen=pop_after_sel
    for i in range(len(pop_after_sel)):
        child=pop_after_sel[i]
        child[3:7]=pop_after_sel[(i+1)%len(pop_after_sel)][3:7]
        population_nextgen.append(child)
    return population_nextgen

def mutation(pop_after_cross,mutation_rate):
    population_nextgen = []
    for i in range(0,len(pop_after_cross)):
        chromosome = pop_after_cross[i]
        for j in range(len(chromosome)):
            if random.random() < mutation_rate:
                chromosome[j]= not chromosome[j]
        population_nextgen.append(chromosome)
    #print(population_nextgen)
    return population_nextgen

def generations(size,n_feat,n_parents,mutation_rate,n_gen,X_train,
                                   X_test, y_train, y_test):
    best_chromo= []
    best_score= []
    population_nextgen=initilization_of_population(size,n_feat)
    for i in range(n_gen):
        scores, pop_after_fit = fitness_score(population_nextgen)
        print(scores[:2])
        pop_after_sel = selection(pop_after_fit,n_parents)
        pop_after_cross = crossover(pop_after_sel)
        population_nextgen = mutation(pop_after_cross,mutation_rate)
        best_chromo.append(pop_after_fit[0])
        best_score.append(scores[0])
    return best_chromo,best_score

In [18]:
if __name__ == '__main__':

    #reading the dataset
    data = pd.read_csv('./online_shoppers_intention.csv')

    # checking the shape of the data
    print(data.shape)


    # checking the head of the data : first 5 columns
    print(data.head())

    #for decision Tree# describing the data, displays the stats of all the attributes.
    print(data.describe())

    #DISCRETIZATION

    #Perform bucketing using the pd.cut() function on the marks column and display the top 10 columns.
    #The cut() function takes parameters such as x, bins, and labels. 
    #Here, we have used only three parameters. Add the following code to implement this:
    # df['bucket']=pd.cut(df['marks'],5,labels=['Poor','Below_average','Average','Above_Average','Excellent'])'''

    #TO check : df.head(10)

    #1.Administrative

    #data['Administrative_bucket']=pd.cut(data['Administrative'],6,labels=[5,10,15,20,25,30])
    data['Administrative_buckets'] = pd.cut(data['Administrative'], bins=[0,10,15,20,25,30],include_lowest=True)
    print(data)
      
    # We can check the frequency of each bin
    print(data['Administrative_buckets'].unique())

    #-----------------------------------------------------------------------------------------------------------------------------------
    #2.Administrative Duration

    data['AdministrativeDuration_buckets'] = pd.cut(data['Administrative_Duration'], bins=[0,500,1000,1500,2000,2500],include_lowest=True)
    print(data)
      
    # We can check the frequency of each bin
    print(data['AdministrativeDuration_buckets'].unique())

    #-----------------------------------------------------------------------------------------------------------------------------------

    #3.Informational

    data['Informational_buckets'] = pd.cut(data['Informational'], bins=[0,10,15,20,25],include_lowest=True)
    print(data)
      
    # We can check the frequency of each bin
    print(data['Informational_buckets'].unique())

    #-----------------------------------------------------------------------------------------------------------------------------------
    #4.Informational Duration

    data['InformationalDuration_buckets'] = pd.cut(data['Informational_Duration'], bins=[-1,500,1000,1500,2000,2500,3000],include_lowest=True)
    print(data)
      
    # We can check the frequency of each bin
    print(data['InformationalDuration_buckets'].unique())

    #-----------------------------------------------------------------------------------------------------------------------------------
    #5.Product Related

    data['ProductRelated_buckets'] = pd.cut(data['ProductRelated'], bins=[0,50,100,150,200,250,300,350,400,450,500,550,600,650,700,750],include_lowest=True)
    print(data)
      
    # We can check the frequency of each bin
    print(data['ProductRelated_buckets'].unique())

    #-----------------------------------------------------------------------------------------------------------------------------------
    #6.Product Related Duration

    data['ProductRelatedDuration_buckets'] = pd.cut(data['ProductRelated_Duration'], bins=[0,5000,10000,15000,20000,25000,30000,35000,40000,45000,50000,55000,60000,65000],include_lowest=True)
    print(data)
      
    # We can check the frequency of each bin
    print(data['ProductRelatedDuration_buckets'].unique())

    #-----------------------------------------------------------------------------------------------------------------------------------
    #7.BounceRate

    data['BounceRates_buckets'] = pd.cut(data['BounceRates'], bins=[0,0.25,0.5,0.75,1.0,1.25,1.5,1.75,2.0],include_lowest=True)
    print(data)
      
    # We can check the frequency of each bin
    print(data['BounceRates_buckets'].unique())

    #-----------------------------------------------------------------------------------------------------------------------------------
    #8.ExitRate

    data['ExitRates_buckets'] = pd.cut(data['ExitRates'], bins=[0,0.25,0.5,0.75,1.0,1.25,1.5,1.75,2.0],include_lowest=True)
    print(data)
      
    # We can check the frequency of each bin
    print(data['ExitRates_buckets'].unique())

    #-----------------------------------------------------------------------------------------------------------------------------------
    #9.PageValue

    data['PageValues_buckets'] = pd.cut(data['PageValues'], bins=[0,50,100,150,200,250,300,350,400],include_lowest=True)
    print(data)
      
    # We can check the frequency of each bin
    print(data['PageValues_buckets'].unique())

    #-----------------------------------------------------------------------------------------------------------------------------------
    #10.SpecialDay

    data['SpecialDay_buckets'] = pd.cut(data['SpecialDay'], bins=[0,0.025,0.05,0.075,0.1,0.125,0.15,0.175,0.2],include_lowest=True)
    print(data)
      
    # We can check the frequency of each bin
    print(data['SpecialDay_buckets'].unique())

    #-----------------------------------------------------------------------------------------------------------------------------------

    #CREATING A NEW DATASET WITH DISCRETIZED ATTRIBUTES BY REMOVING THE ADDITIONAL NUMERICAL ATTRIBUTES.
    
    print(data.shape)
    data.drop('Administrative', inplace=True, axis=1)
    data.drop('Administrative_Duration', inplace=True, axis=1)
    data.drop('Informational', inplace=True, axis=1)
    data.drop('Informational_Duration', inplace=True, axis=1)
    data.drop('ProductRelated', inplace=True, axis=1)
    data.drop('ProductRelated_Duration', inplace=True, axis=1)
    data.drop('BounceRates', inplace=True, axis=1)
    data.drop('ExitRates', inplace=True, axis=1)
    data.drop('PageValues', inplace=True, axis=1)
    data.drop('SpecialDay', inplace=True, axis=1)

    data.to_csv("./Discretized_online_shoppers_intention.csv",index=False)

    #END OF DISCRETIZATION.'''

    df = pd.read_csv('./Discretized_online_shoppers_intention.csv', sep=',')

    #Label Encoding
    labelencoder = LabelEncoder()

    df["Administrative_Val"]=labelencoder.fit_transform(df["Administrative_buckets"])
    df.drop('Administrative_buckets', inplace=True, axis=1)
    
    df["Administrative_Duration_Val"]=labelencoder.fit_transform(df["AdministrativeDuration_buckets"])
    df.drop('AdministrativeDuration_buckets', inplace=True, axis=1)

    df["Informational_Val"]=labelencoder.fit_transform(df["Informational_buckets"])
    df.drop('Informational_buckets', inplace=True, axis=1)

    df["Informational_Duration_Val"]=labelencoder.fit_transform(df["InformationalDuration_buckets"])
    df.drop('InformationalDuration_buckets', inplace=True, axis=1)

    df["ProductRelated_Val"]=labelencoder.fit_transform(df["ProductRelated_buckets"])
    df.drop('ProductRelated_buckets', inplace=True, axis=1)

    df["ProductRelated_Duration_Val"]=labelencoder.fit_transform(df["ProductRelatedDuration_buckets"])
    df.drop('ProductRelatedDuration_buckets', inplace=True, axis=1)

    df["BounceRates_Val"]=labelencoder.fit_transform(df["BounceRates_buckets"])
    df.drop('BounceRates_buckets', inplace=True, axis=1)

    df["ExitRates_Val"]=labelencoder.fit_transform(df["ExitRates_buckets"])
    df.drop('ExitRates_buckets', inplace=True, axis=1)

    df["PageValues_Val"]=labelencoder.fit_transform(df["PageValues_buckets"])
    df.drop('PageValues_buckets', inplace=True, axis=1)

    df["Month_Val"]=labelencoder.fit_transform(df["Month"])
    df.drop('Month', inplace=True, axis=1)

    df["VisitorType_Val"]=labelencoder.fit_transform(df["VisitorType"])
    df.drop('VisitorType', inplace=True, axis=1)

    df["SpecialDay_Val"]=labelencoder.fit_transform(df["SpecialDay_buckets"])
    df.drop('SpecialDay_buckets', inplace=True, axis=1)

    df["Weekend_Val"]=labelencoder.fit_transform(df["Weekend"])
    df.drop('Weekend', inplace=True, axis=1)

    df["Revenue_Val"]=labelencoder.fit_transform(df["Revenue"])
    df.drop('Revenue', inplace=True, axis=1)

    print(data.head())

    #data = pd.read_csv("./Discretized_online_shoppers_intention.csv", sep=',')
    #FEATURE SELECTION USING GA
    # Read in data from CSV
    # Data set from https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

    feature_cols = ['Administrative_Val','Administrative_Duration_Val','Informational_Val','Informational_Duration_Val','ProductRelated_Val','ProductRelated_Duration_Val','BounceRates_Val','ExitRates_Val','PageValues_Val','SpecialDay_Val','OperatingSystems','Browser','Region','TrafficType','VisitorType_Val','Weekend_Val','Month_Val']   
    X = df[feature_cols] # Features
    y = df.Revenue_Val

    print(X)

    # Split dataset into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

    #GA
    X, y = make_classification(n_samples=12238, n_features=18, n_classes=2, n_informative=6, n_redundant=0, n_repeated=0,random_state=1)

    model = linear_model.LogisticRegression(solver='lbfgs', multi_class='auto')
    fsga = FeatureSelectionGA(model,X,y, ff_obj = FitnessFunction())
    bestFeatures, pop = fsga.generate(20)
    
    #print(pop)
    print(fsga)
    selectedFeatures=[0]*17
    selected_Features=[]
    
    f=0
    while(f<5):
        n=random.randint(0,16) 
        if selectedFeatures[n]==0:
            selectedFeatures[n]=1
            selected_Features.append(feature_cols[n])
            f+=1

      
    print(selected_Features)

    X = df[selected_Features] # Features
    y = df.Revenue_Val 

    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

    # Create Decision Tree classifer object
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=5)

    # Train Decision Tree Classifer
    clf = clf.fit(X_train,y_train)
    
    
    #Predict the response for test dataset
    y_pred = clf.predict(X_test)

    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

    text_representation = tree.export_text(clf, feature_names=selected_Features)
    print(text_representation)

    print(bestFeatures)

    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data,filled=True, rounded=True,special_characters=True,feature_names = selected_Features,class_names=['0','1'])
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
    graph.write_png('DecisionTree.png')
    Image(graph.create_png())

(12330, 18)
   Administrative  Administrative_Duration  Informational  \
0             0.0                      0.0            0.0   
1             0.0                      0.0            0.0   
2             0.0                     -1.0            0.0   
3             0.0                      0.0            0.0   
4             0.0                      0.0            0.0   

   Informational_Duration  ProductRelated  ProductRelated_Duration  \
0                     0.0             1.0                 0.000000   
1                     0.0             2.0                64.000000   
2                    -1.0             1.0                -1.000000   
3                     0.0             2.0                 2.666667   
4                     0.0            10.0               627.500000   

   BounceRates  ExitRates  PageValues  SpecialDay Month  OperatingSystems  \
0         0.20       0.20         0.0         0.0   Feb                 1   
1         0.00       0.10         0.0         

InvocationException: GraphViz's executables not found