In [2]:
import pylab, random

In [None]:
def minkowskiDist(v1, v2, p):
    """Return the Minkowski distance of order p between v1 and v2.

    v1 and v2 must be equal-length sequences (or arrays) of numbers.

    Common special cases:
      p = 1  Manhattan distance: movement only along the axes
             (the unit "circle" is a diamond)
      p = 2  Euclidean distance: the straight line from A to B
             (the unit "circle" is a true circle)
      Larger p pushes the unit "circle" towards a square.

    Raises ValueError if v1 and v2 differ in length, rather than silently
    ignoring the extra components of the longer vector.
    """
    if len(v1) != len(v2):
        raise ValueError('v1 and v2 must have the same length, got '
                         + str(len(v1)) + ' and ' + str(len(v2)))
    dist = 0.0
    for a, b in zip(v1, v2):
        dist += abs(a - b)**p
    return dist**(1/p)

In [34]:
class Passenger:
    """One Titanic passenger: a numeric feature vector plus a survival label.

    The feature vector is laid out as [C2, C3, age, gender], matching
    featureNames. C2 is 1 only for cabin class 2 and C3 is 1 only for cabin
    class 3; any other class is encoded as 0, 0. The original cabin class,
    the label and the name are kept alongside the vector.
    """

    featureNames = ('C2', 'C3', 'age', 'male gender')

    def __init__(self, pClass, age, gender, survived, name):
        self.cabinClass = pClass
        self.label = survived
        self.name = name
        # Two indicator columns encode the cabin class.
        inSecond = 1 if pClass == 2 else 0
        inThird = 1 if pClass == 3 else 0
        self.featureVec = [inSecond, inThird, age, gender]

    def distance(self, other):
        """Return the order-2 Minkowski distance between the two feature vectors."""
        return minkowskiDist(self.featureVec, other.featureVec, 2)

    def getClass(self):
        """Return the cabin class exactly as passed to the constructor."""
        return self.cabinClass

    def getAge(self):
        """Return the age component of the feature vector."""
        return self.featureVec[2]

    def getGender(self):
        """Return the gender component of the feature vector."""
        return self.featureVec[3]

    def getName(self):
        """Return the name exactly as passed to the constructor."""
        return self.name

    def getFeatures(self):
        """Return a copy of the feature vector, so callers cannot mutate it."""
        return list(self.featureVec)

    def getLabel(self):
        """Return the survival label passed to the constructor."""
        return self.label

In [35]:
def getTitanicData(fname):
    """Read a comma-separated passenger file into a dictionary of columns.

    Each non-blank line is split on ',' and read as:
        field 0   cabin class, converted with int()
        field 1   age, converted with float()
        field 2   gender: 'M' is stored as 1, anything else as 0
        field 3   survival: '1' is stored as 'Survived', anything else as 'Died'
        field 4+  name: kept as the list of remaining fields

    Returns a dict with keys 'class', 'survived', 'age', 'gender' and 'name',
    each mapping to a list with one entry per passenger, in file order.

    Lines containing only whitespace are skipped. The file is closed before
    returning, including when a malformed line raises.
    """
    data = {}
    data['class'], data['survived'], data['age'] = [], [], []
    data['gender'], data['name'] = [], []
    with open(fname) as f:
        for line in f:
            if not line.strip():
                # e.g. a trailing empty line; int('') would raise otherwise
                continue
            split = line.split(',')
            data['class'].append(int(split[0]))
            data['age'].append(float(split[1]))
            data['gender'].append(1 if split[2] == 'M' else 0)
            data['survived'].append('Survived' if split[3] == '1' else 'Died')
            # NOTE(review): the name stays a list of the remaining fields and
            # its last element keeps the line's trailing newline; left
            # unchanged so existing callers see the same values.
            data['name'].append(split[4:])
    return data

# titanic_dict = getTitanicData('TitanicPassengers.txt')
# for key in titanic_dict.keys():
    # print("Key:", key, "Value:", titanic_dict[key])

In [47]:
def buildTitanicExamples(fileName):
    """Load the passenger file and wrap every record in a Passenger object.

    Reads the columns with getTitanicData(fileName), builds one Passenger per
    row, prints how many were built, and returns them as a list in file order.
    """
    data = getTitanicData(fileName)
    rows = zip(data['class'], data['age'], data['gender'],
               data['survived'], data['name'])
    examples = [Passenger(pClass, age, gender, survived, name)
                for pClass, age, gender, survived, name in rows]
    print('Finished processing', len(examples), 'passengers\n')
    return examples

# Build the examples once and list every passenger's name and cabin class.
titanic_list = buildTitanicExamples('TitanicPassengers.txt')
for each in titanic_list:
    # getName() returns the name as stored at construction time.
    print(each.getName(), each.getClass())

Finished processing 1046 passengers

['Allen', ' Miss. Elisabeth Walton\n'] 1
['Allison', ' Master. Hudson Trevor\n'] 1
['Allison', ' Miss. Helen Loraine\n'] 1
['Allison', ' Mr. Hudson Joshua Creighton\n'] 1
['Allison', ' Mrs. Hudson J C (Bessie Waldo Daniels)\n'] 1
['Anderson', ' Mr. Harry\n'] 1
['Andrews', ' Miss. Kornelia Theodosia\n'] 1
['Andrews', ' Mr. Thomas Jr\n'] 1
['Appleton', ' Mrs. Edward Dale (Charlotte Lamson)\n'] 1
['Artagaveytia', ' Mr. Ramon\n'] 1
['Astor', ' Col. John Jacob\n'] 1
['Astor', ' Mrs. John Jacob (Madeleine Talmadge Force)\n'] 1
['Aubart', ' Mme. Leontine Pauline\n'] 1
['Barber', ' Miss. Ellen ""Nellie""\n'] 1
['Barkworth', ' Mr. Algernon Henry Wilson\n'] 1
['Baxter', ' Mr. Quigg Edmond\n'] 1
['Baxter', ' Mrs. James (Helene DeLaudeniere Chaput)\n'] 1
['Bazzani', ' Miss. Albina\n'] 1
['Beattie', ' Mr. Thomson\n'] 1
['Beckwith', ' Mr. Richard Leonard\n'] 1
['Beckwith', ' Mrs. Richard Leonard (Sallie Monypeny)\n'] 1
['Behr', ' Mr. Karl Howell\n'] 1
['Bidois', 

In [None]:
def accuracy(truePos, falsePos, trueNeg, falseNeg):
    """Return the fraction of predictions that were correct.

    Computed as (truePos + trueNeg) / (all four counts).
    Returns NaN when all four counts are zero, matching how the other
    metrics in this notebook treat a zero denominator.
    """
    numerator = truePos + trueNeg
    denominator = truePos + trueNeg + falsePos + falseNeg
    try:
        return numerator/denominator
    except ZeroDivisionError:
        return float('nan')

def sensitivity(truePos, falseNeg):
    """Return truePos / (truePos + falseNeg), the true-positive rate.

    Returns NaN when the denominator is zero.
    """
    actualPositives = truePos + falseNeg
    if actualPositives == 0:
        return float('nan')
    return truePos/actualPositives
    
def specificity(trueNeg, falsePos):
    """Return trueNeg / (trueNeg + falsePos), the true-negative rate.

    Returns NaN when the denominator is zero.
    """
    actualNegatives = trueNeg + falsePos
    if actualNegatives == 0:
        return float('nan')
    return trueNeg/actualNegatives
    
def posPredVal(truePos, falsePos):
    """Return truePos / (truePos + falsePos), the positive predictive value.

    Returns NaN when the denominator is zero.
    """
    predictedPositives = truePos + falsePos
    if predictedPositives == 0:
        return float('nan')
    return truePos/predictedPositives
    
def negPredVal(trueNeg, falseNeg):
    """Return trueNeg / (trueNeg + falseNeg), the negative predictive value.

    Returns NaN when the denominator is zero.
    """
    predictedNegatives = trueNeg + falseNeg
    if predictedNegatives == 0:
        return float('nan')
    return trueNeg/predictedNegatives

def getStats(truePos, falsePos, trueNeg, falseNeg, toPrint = True):
    """Compute summary statistics for a model from its confusion counts.

    Returns the tuple (accuracy, sensitivity, specificity, positive
    predictive value), unrounded. When toPrint is true, each value is also
    printed rounded to three decimal places.
    """
    stats = (accuracy(truePos, falsePos, trueNeg, falseNeg),
             sensitivity(truePos, falseNeg),
             specificity(trueNeg, falsePos),
             posPredVal(truePos, falsePos))
    if toPrint:
        captions = (' Accuracy =', ' Sensitivity =',
                    ' Specificity =', ' Pos. Pred. Val. =')
        for caption, value in zip(captions, stats):
            print(caption, round(value, 3))
    return stats


def split80_20(examples):
    """Randomly split examples into a training set and a test set.

    len(examples)//5 distinct indices are drawn with random.sample (so the
    result depends on the state of the global `random` generator); those
    examples form the test set and the rest form the training set. Both
    lists keep the original relative order of the examples.

    Returns (trainingSet, testSet).
    """
    # A set makes each membership test O(1); testing against the sampled
    # list inside the loop made the whole split quadratic.
    sampleIndices = set(random.sample(range(len(examples)), len(examples)//5))
    trainingSet, testSet = [], []
    for i, example in enumerate(examples):
        if i in sampleIndices:
            testSet.append(example)
        else:
            trainingSet.append(example)
    return trainingSet, testSet
    
def randomSplits(examples, method, numSplits, toPrint = True):
    """Average a method's confusion counts over several random 80/20 splits.

    method(trainingSet, testSet) must return a sequence whose first four
    items are (truePos, falsePos, trueNeg, falseNeg), e.g.
    randomSplits(examples, lr, 20).

    The global random generator is re-seeded with 0 on every call, so the
    same sequence of splits is produced each time. The averaged counts are
    passed to getStats (which prints them when toPrint is true) and returned
    as (truePos, falsePos, trueNeg, falseNeg).
    """
    totals = [0, 0, 0, 0]
    random.seed(0)
    for _ in range(numSplits):
        trainingSet, testSet = split80_20(examples)
        results = method(trainingSet, testSet)
        for k in range(4):
            totals[k] += results[k]
    avgTruePos, avgFalsePos, avgTrueNeg, avgFalseNeg = \
        [total/numSplits for total in totals]
    getStats(avgTruePos, avgFalsePos, avgTrueNeg, avgFalseNeg, toPrint)
    return avgTruePos, avgFalsePos, avgTrueNeg, avgFalseNeg

In [4]:
import sklearn
from sklearn.linear_model import LogisticRegression

def buildModel(examples, toPrint = True):
    """Fit a logistic regression model to the labelled examples.

    Each example must provide getFeatures() and getLabel(). When toPrint is
    true, the model's classes and the weight of every feature (named via
    Passenger.featureNames) are printed.

    Returns the fitted LogisticRegression model.
    """
    featureVecs = [e.getFeatures() for e in examples]
    labels = [e.getLabel() for e in examples]
    model = LogisticRegression().fit(featureVecs, labels)
    if toPrint:
        print('model.classes_ =', model.classes_)
        # With two classes coef_ holds a single row, and its weights apply to
        # classes_[1]. With more classes there is one row per class, in
        # classes_ order, so the row index selects the label.
        isBinary = len(model.coef_) == 1
        for i, weights in enumerate(model.coef_):
            print('For label', model.classes_[1] if isBinary else model.classes_[i])
            for j in range(len(weights)):
                print('   ', Passenger.featureNames[j], '=',
                      weights[j])
    return model

In [6]:
def applyModel(model, testSet, label, prob = 0.5):
    """Classify testSet with model and count the confusion-matrix cells.

    An example is predicted positive when the model's probability for
    `label` is strictly greater than prob. The prediction is then compared
    with the example's getLabel().

    The probability column for `label` is looked up in model.classes_, so
    any class can be treated as the positive one. A model without a classes_
    attribute falls back to column 1.

    Returns (truePos, falsePos, trueNeg, falseNeg).
    Raises ValueError if model.classes_ exists but does not contain label.
    """
    testFeatureVecs = [e.getFeatures() for e in testSet]
    probs = model.predict_proba(testFeatureVecs)
    classes = getattr(model, 'classes_', None)
    if classes is None:
        labelCol = 1
    else:
        classList = list(classes)
        if label not in classList:
            raise ValueError('label ' + repr(label)
                             + ' is not one of the model classes '
                             + repr(classList))
        labelCol = classList.index(label)
    truePos, falsePos, trueNeg, falseNeg = 0, 0, 0, 0
    for i in range(len(probs)):
        isLabel = testSet[i].getLabel() == label
        if probs[i][labelCol] > prob:
            if isLabel:
                truePos += 1
            else:
                falsePos += 1
        else:
            if isLabel:
                falseNeg += 1
            else:
                trueNeg += 1
    return truePos, falsePos, trueNeg, falseNeg

def lr(trainingData, testData, prob = 0.5):
    """Train logistic regression on trainingData and score it on testData.

    The model is built without printing its weights, then applied with
    'Survived' as the positive label and prob as the decision threshold.
    Returns whatever applyModel returns for that call.
    """
    fitted = buildModel(trainingData, False)
    return applyModel(fitted, testData, 'Survived', prob)
    
# Load every passenger once; all experiments below reuse this list.
examples = buildTitanicExamples('TitanicPassengers.txt')

# Experiment 1: average the confusion counts over repeated random splits.
# randomSplits() itself re-seeds the generator with 0 before splitting.
random.seed(0)
numSplits = 20
print('Average of', numSplits, '80/20 splits LR')
truePos, falsePos, trueNeg, falseNeg =\
      randomSplits(examples, lr, numSplits)

# Experiment 2: look at the fitted weights for a single split.
# No re-seed here, so this split continues from the generator state left
# behind by randomSplits().
trainingSet, testSet = split80_20(examples)
model = buildModel(trainingSet, True)

# Experiment 3: look at how the decision threshold changes the statistics.
# Re-seed so the split below does not depend on the calls above.
random.seed(0)
trainingSet, testSet = split80_20(examples)
model = buildModel(trainingSet, False)
print('Try p = 0.1')
truePos, falsePos, trueNeg, falseNeg =\
                  applyModel(model, testSet, 'Survived', 0.1)
getStats(truePos, falsePos, trueNeg, falseNeg)
print('Try p = 0.9')
truePos, falsePos, trueNeg, falseNeg =\
                  applyModel(model, testSet, 'Survived', 0.9)
# Last expression of the cell: its returned tuple is also shown as the
# cell's output value.
getStats(truePos, falsePos, trueNeg, falseNeg)

Finished processing 1046 passengers

Average of 20 80/20 splits LR
 Accuracy = 0.778
 Sensitivity = 0.683
 Specificity = 0.845
 Pos. Pred. Val. = 0.758
model.classes_ = ['Died' 'Survived']
For label Survived
    C2 = -1.2220347359247863
    C3 = -2.0992389657915176
    age = -0.031621641164859285
    male gender = -2.433459439069997
Try p = 0.1
 Accuracy = 0.493
 Sensitivity = 0.976
 Specificity = 0.161
 Pos. Pred. Val. = 0.444
Try p = 0.9
 Accuracy = 0.656
 Sensitivity = 0.176
 Specificity = 0.984
 Pos. Pred. Val. = 0.882


(0.6555023923444976,
 0.17647058823529413,
 0.9838709677419355,
 0.8823529411764706)

In [None]:
def buildROC(trainingSet, testSet, title, plot = True):
    """Build a ROC curve for predicting 'Survived' and return its area.

    A model is fitted on trainingSet (printing its weights), then applied to
    testSet at the 101 thresholds 0.00, 0.01, ..., 1.00. Each threshold
    contributes one point (1 - specificity, sensitivity).

    When plot is true, the curve and the diagonal reference line are drawn
    with pylab, and the AUROC (rounded to 3 decimals) is appended to title.

    Returns the area under the ROC curve.
    """
    # Imported here so the submodule is guaranteed to be loaded; a bare
    # `import sklearn` does not promise that `sklearn.metrics` is available.
    from sklearn.metrics import auc

    model = buildModel(trainingSet, True)
    xVals, yVals = [], []
    numSteps = 100
    # Thresholds are derived from an integer counter: repeatedly adding 0.01
    # accumulates rounding error, so a `while p <= 1.0` loop can stop before
    # the final threshold 1.0 is evaluated.
    for step in range(numSteps + 1):
        p = step/numSteps
        truePos, falsePos, trueNeg, falseNeg =\
                               applyModel(model, testSet,
                               'Survived', p)
        xVals.append(1.0 - specificity(trueNeg, falsePos))
        yVals.append(sensitivity(truePos, falseNeg))
    # auc() no longer accepts a `reorder` argument and requires monotonic x
    # values, so the points are sorted by x (then y) before integrating.
    points = sorted(zip(xVals, yVals))
    auroc = auc([x for x, _ in points], [y for _, y in points])
    if plot:
        pylab.plot(xVals, yVals)
        pylab.plot([0,1], [0,1])
        title = title + '\nAUROC = ' + str(round(auroc,3))
        pylab.title(title)
        pylab.xlabel('1 - specificity')
        pylab.ylabel('Sensitivity')
    return auroc

#random.seed(0)
#trainingSet, testSet = split80_20(examples)
#buildROC(trainingSet, testSet, 'ROC for Predicting Survival, 1 Split')