In [1]:
import random
import operator

import pandas as pd
import numpy as np
import random as rnd
%matplotlib inline
import matplotlib.pyplot as plt

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from deap import gp
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix




In [2]:
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
# machine learning auxiliaries
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import make_pipeline

In [3]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [4]:
train_data.columns[train_data.isna().any()].tolist()

['Age', 'Cabin', 'Embarked']

In [5]:
train_data.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)
train_data.set_index(keys=['PassengerId'], drop=True, inplace=True)

#train_data.head()

test_data.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)
test_data.set_index(keys=['PassengerId'], drop=True, inplace=True)

train_nan_map = {'Age': train_data['Age'].mean(), 'Fare': train_data['Fare'].mean(), 'Embarked': train_data['Embarked'].mode()[0]}
test_nan_map = {'Age': test_data['Age'].mean(), 'Fare': test_data['Fare'].mean(), 'Embarked': test_data['Embarked'].mode()[0]}

train_data.fillna(value=train_nan_map, inplace=True)
test_data.fillna(value=test_nan_map, inplace=True)

columns_map = {'Embarked': {'C': 0, 'Q': 1, 'S': 2}, 'Sex': {'male': 0, 'female': 1}}
train_data.replace(columns_map, inplace=True)
test_data.replace(columns_map, inplace=True)

def family(df):
    df["Family"] = df["SibSp"] + df["Parch"] + 1
    df["Family"] = df["Family"] * df["Family"]
    return df

def wealth_age_fare_ratio(df):
    df["WealthAgeFareRatio"] = df["Pclass"] * df["Fare"] / df["Age"]
    #df["WealthAgeFareRatio"] = df["WealthAgeFareRatio"] * df["WealthAgeFareRatio"]
    df.loc[ df["WealthAgeFareRatio"] <= 0, "WealthAgeFareRatio"] = 0
    df.loc[(df["WealthAgeFareRatio"] > 0) & (df["WealthAgeFareRatio"] <= 7.90), "WealthAgeFareRatio"] = 2
    df.loc[(df["WealthAgeFareRatio"] > 7.90) & (df["WealthAgeFareRatio"] <= 10.84), "WealthAgeFareRatio"] = 4
    df.loc[(df["WealthAgeFareRatio"] > 10.84) & (df["WealthAgeFareRatio"] <= 19.8), "WealthAgeFareRatio"] = 16
    df.loc[ df["WealthAgeFareRatio"] > 19.8, "WealthAgeFareRatio"] = 256
    return df

def age_range(df):
    df.loc[ df["Age"] <= 0, "Age"] = 6
    df.loc[(df["Age"] > 0) & (df["Age"] <= 13), "Age"] = 5
    df.loc[(df["Age"] > 13) & (df["Age"] <= 18), "Age"] = 4
    df.loc[(df["Age"] > 18) & (df["Age"] <= 40), "Age"] = 3
    df.loc[(df["Age"] > 40) & (df["Age"] <= 65), "Age"] = 2
    df.loc[ df["Age"] > 65, "Age"] = 1
    return df

def class_range(df):
    df.loc[ df["Pclass"] <= 1, "Pclass"] = 1000
    df.loc[(df["Pclass"] > 1) & (df["Pclass"] <= 2), "Pclass"] = 100
    df.loc[(df["Pclass"] > 2) & (df["Pclass"] <= 3), "Pclass"] = 10
    return df

train_data = age_range(train_data)
test_data = age_range(test_data)

train_data = wealth_age_fare_ratio(train_data)
test_data = wealth_age_fare_ratio(test_data)

train_data = class_range(train_data)
test_data = class_range(test_data)

train_data = family(train_data)
test_data = family(test_data)

#train_data.drop(columns=['Fare'], inplace=True)
#test_data.drop(columns=['Fare'], inplace=True)
#train_data.to_csv('testtrain.csv', header=True, sep=',')
#test_data.to_csv('testtest.csv', header=True, sep=',')

X_train = train_data.loc[:, train_data.columns != 'Survived']
y_train = train_data.loc[:, 'Survived']

y_train.to_csv('y_train.csv', header=True, sep=',')

#X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.33, random_state=10)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.99, random_state=10)

print(X_train.head())
print(y_train.head())
print(test_data.head())


             Pclass  Sex  Age  SibSp  Parch     Fare  Embarked  \
PassengerId                                                      
734             100    0  3.0      0      0  13.0000         2   
157              10    1  4.0      0      0   7.7333         1   
124             100    1  3.0      0      0  13.0000         2   
370            1000    1  3.0      0      0  69.3000         0   
321              10    0  3.0      0      0   7.2500         2   

             WealthAgeFareRatio  Family  
PassengerId                              
734                         4.0       1  
157                         2.0       1  
124                         4.0       1  
370                       256.0       1  
321                         2.0       1  
PassengerId
734    0
157    1
124    1
370    1
321    0
Name: Survived, dtype: int64
             Pclass  Sex  Age  SibSp  Parch     Fare  Embarked  \
PassengerId                                                      
892              10    0 

In [6]:
train_data.columns[train_data.isna().any()].tolist()
train_data[train_data['Embarked'].isna() == True]
#train_data['Embarked']
#test_data.columns[test_data.isna().any()].tolist()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,WealthAgeFareRatio,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [7]:
creator.create("FitnessMin", base.Fitness, weights=(1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin)

In [8]:
pset = gp.PrimitiveSet("MAIN", arity=1)
pset.addPrimitive(np.add, arity=2)
pset.addPrimitive(np.subtract, arity=2)
pset.addPrimitive(np.multiply, arity=2)
pset.addPrimitive(np.negative, arity=1)
pset.addPrimitive(np.positive, arity=1)
pset.addPrimitive(np.sin, arity=1)
pset.addPrimitive(np.cos, arity=1)
pset.addPrimitive(np.tan, arity=1)
pset.renameArguments(ARG0='x')

In [9]:
toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=2)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("compile", gp.compile, pset=pset)

In [10]:
def evalSymbReg(individual, points, pset):
    func = gp.compile(expr=individual, pset=pset)
    sqerrors = (func(points)-(points**4 + points**3 + points**2 + points))**2
    return (np.sqrt(np.sum(sqerrors) / len(points)),)

In [11]:
toolbox.register("evaluate", evalSymbReg, points=np.linspace(-1, 1, 1000), pset=pset)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("mate", gp.cxOnePoint)
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)
toolbox.register("insert" , gp.mutInsert, pset=pset)
toolbox.decorate("mate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))
toolbox.decorate("mutate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))

In [12]:
def getModel(individual):
    k = individual[0]
    if k == 'svcLinear':
        clf = LinearSVC(C=individual[0])
    elif k == 'rbf':
        clf = SVC(kernel=k, C=individual[0],gamma=individual[1])
    else:
        #linear
        clf = SVC(kernel=k, C=individual[0])
    return clf

In [13]:
def getXy(individual):
    scols = list(cols)
    for i in range(len(individual[0:])):
        if individual[0+i]<1: scols.remove(cols[i])
    #print("Selected cols: ",scols)
    tcols = np.append(['Survived'],scols)
    df = training.loc[:,tcols].dropna()
    X = df.loc[:,scols]
    scaler = preprocessing.StandardScaler().fit(X)    
    #scaler= preprocessing.MinMaxScaler().fit(X)
    X = scaler.transform(X)
    y = np.ravel(df.loc[:,['Survived']])
    return [X,y,scols,scaler]

In [14]:
gen = range(40)
avg_list = []
max_list = []
min_list = []

pop = toolbox.population(n=883)

# Evaluate the entire population
fitnesses = list(map(toolbox.evaluate, pop))
for ind, fit in zip(pop, fitnesses):
    ind.fitness.values = fit

# Begin the evolution
for g in gen:
    print("-- Generation %i --" % g)

    # Select the next generation individuals
    offspring = toolbox.select(pop, len(pop))
    # Clone the selected individuals
    offspring = list(map(toolbox.clone, offspring))

    # Apply crossover and mutation on the offspring
    for child1, child2 in zip(offspring[::2], offspring[1::2]):
        if random.random() < 0.5:
            toolbox.mate(child1, child2)
            del child1.fitness.values
            del child2.fitness.values

    for mutant in offspring:
        if random.random() < 0.2:
            toolbox.mutate(mutant)
            del mutant.fitness.values

    # Evaluate the individuals with an invalid fitness
    invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
    fitnesses = map(toolbox.evaluate, invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit

    # Replace population
    pop[:] = offspring

    # Gather all the fitnesses in one list and print the stats
    fits = [ind.fitness.values[0] for ind in pop]

    length = len(pop)
    mean = sum(fits) / length
    sum2 = sum(x*x for x in fits)
    std = abs(sum2 / length - mean**2)**0.5
    g_max = max(fits)
    g_min = min(fits)
        
    avg_list.append(mean)
    max_list.append(g_max)
    min_list.append(g_min)

    print("  Min %s" % g_min)
    print("  Max %s" % g_max)
    print("  Avg %s" % mean)
    print("  Std %s" % std)

print("-- End of (successful) evolution --")

best_ind = tools.selBest(pop, 1)[0]
print("Best individual is %s, %s" % (best_ind, best_ind.fitness.values))

-- Generation 0 --
  Min 0.4337245151489531
  Max 61.451809436860046
  Avg 1.79771218553244
  Std 4.567363248560968
-- Generation 1 --
  Min 0.3868993273134971
  Max 3276.1038036535792
  Avg 6.854575755619995
  Std 110.45698572235884
-- Generation 2 --
  Min 0.3868993273134971
  Max 222.89402459215046
  Avg 6.168052498712462
  Std 20.66653490320089
-- Generation 3 --
  Min 0.5056866194019045
  Max 479.88115661427827
  Avg 13.33329482976682
  Std 36.848220104015134
-- Generation 4 --
  Min 0.5056866194019045
  Max 479.6068849356344
  Avg 24.500896632722206
  Std 47.492585138899635
-- Generation 5 --
  Min 0.3868993273134971
  Max 371.2859344536495
  Avg 43.644197848414436
  Std 59.07946458146618
-- Generation 6 --
  Min 0.4337245151489531
  Max 3276.1491490694825
  Avg 75.93074583787343
  Std 169.665862711103
-- Generation 7 --
  Min 0.7775982335295882
  Max 3276.1491490694825
  Avg 112.59990235772976
  Std 195.51678852123004
-- Generation 8 --
  Min 0.7779404679928336
  Max 407267756.5

In [15]:
selCols = []
#filtering only numeric attributes
for col in test_data.columns:
    if(test_data[col].dtype == 'int64' or test_data[col].dtype == 'float64' or test_data[col].dtype == 'uint8'):
        selCols.append(col)  
cols = selCols
test_ind = best_ind.fitness.values[0]
model = getModel(best_ind)

Xy = getXy(test_ind)
colsSVM = Xy[2]
scaler = Xy[3]
print("Selected Features: ",colsSVM)

X_train = Xy[0]
Y_train = Xy[1]

model.fit( X_train , y_train )

IndexError: invalid index to scalar variable.

In [None]:
y_pred = model.predict(X_test)
y_truth = y_test.values

In [None]:
tn, fp, fn, tp = confusion_matrix(y_truth, y_pred).ravel()
print("Confusion Matrix")
print(confusion_matrix(y_truth, y_pred, labels=[0, 1]))
print("")
print("True Negatives", tn)
print("False Positives", fp)
print("False Negatives", fn)
print("True Positives", tp)

In [None]:
type(predictions)
pred_df = pd.DataFrame(predictions, index=test_data.index, columns=['Survived'])
type(pred_df)
pred_df.to_csv('predictions.csv', header=True, sep=',')