# Getting started with Python

In [1]:
import numpy as np
import csv as csv

## Read CSV into python object

In [2]:
# Open the training set and wrap it in a csv reader. The file object is bound
# to its own name so that it can be closed explicitly (train_file.close())
# once every row has been read, instead of leaking an anonymous handle.
# 'rb' is the mode the Python 2 csv module expects for its input file.
train_file = open('./train.csv', 'rb')
csv_file_object = csv.reader(train_file)

## Use next() to skip headers

In [3]:
# Pull the first row (the column names) off the reader so that every row
# still to be read is a passenger record, then show the column names.
header = next(csv_file_object)
print(header)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


## Create a variable called 'data'

In [4]:
# Start with an empty list; the passenger rows are appended to it next.
data = list()

## Add each row in csv file to data array

In [5]:
# Consume the rest of the reader, storing each row (a list of strings) in data
data.extend(csv_file_object)

## Then convert from a list to an array
### Be aware that each item is currently a string in this format

In [6]:
# Turn the list of rows into a 2-D NumPy array (every element stays a string)
data = np.array(data)

In [7]:
# Show the (abbreviated) contents of the whole array
print(data)

[['1' '0' '3' ..., '7.25' '' 'S']
 ['2' '1' '1' ..., '71.2833' 'C85' 'C']
 ['3' '1' '3' ..., '7.925' '' 'S']
 ..., 
 ['889' '0' '3' ..., '23.45' '' 'S']
 ['890' '1' '1' ..., '30' 'C148' 'C']
 ['891' '0' '3' ..., '7.75' '' 'Q']]


### Examine first row

In [8]:
# Index 0 is the first passenger record
print(data[0])

['1' '0' '3' 'Braund, Mr. Owen Harris' 'male' '22' '1' '0' 'A/5 21171'
 '7.25' '' 'S']


### Examine last row

In [9]:
# A negative index counts from the end, so -1 is the last passenger record
print(data[-1])

['891' '0' '3' 'Dooley, Mr. Patrick' 'male' '32' '0' '0' '370376' '7.75' ''
 'Q']


### Examine first row, 4th column

In [10]:
# Row 0, column index 3 (the 4th column, "Name")
print(data[0,3])

Braund, Mr. Owen Harris


# Play with the data

## Now if you want to call a specific column of data, say, the gender column, you can just type data[0::,4], remembering that "0::" means all rows (from start to end), and Python starts indices from 0 (not 1).

## You should be aware that the csv reader works by default with strings, so you will need to convert to floats in order to do numerical calculations. For example, you can turn the Pclass variable into floats by using data[0::,2].astype(np.float). Using this, we can calculate the proportion of survivors on the Titanic: 

In [11]:
# np.size() reports how many elements an array holds, and np.sum()
# (as you would expect) adds those elements together.

# Column 1 is "Survived"; convert it from strings to floats once and reuse it.
survived_flags = data[0::,1].astype(float)

number_passengers = np.size(survived_flags)
number_survived = np.sum(survived_flags)
proportion_survivors = number_survived / number_passengers

In [12]:
# Report the head count, the survivor count and the overall survival rate
print('Number of passengers=%s' % number_passengers)
print('Number of survivors=%s' % number_survived)
print('Proportion of survivors=%s' % proportion_survivors)

Number of passengers=891
Number of survivors=342.0
Proportion of survivors=0.383838383838


## Build boolean masks that select the female and the male passengers

In [13]:
# Column 4 is "Sex". Comparing it with a string gives one boolean per
# passenger: True where the passenger matches, False otherwise.
sex_column = data[0::,4]
women_only_stats = sex_column == "female"
men_only_stats = sex_column == "male"

In [14]:
# NOTE: despite the labels, the values printed here are the boolean
# is-female / is-male masks over all passengers, not survivor counts.
print('Female survivors=' + str(women_only_stats))
print('Male survivors=' + str(men_only_stats))

Female survivors=[False  True  True  True False False False False  True  True  True  True
 False False  True  True False False  True  True False False  True False
  True  True False False  True False False  True  True False False False
 False False  True  True  True  True False  True  True False False  True
 False  True False False  True  True False False  True False  True False
 False  True False False False False  True False  True False False  True
 False False False False False False False  True False False  True False
  True  True False False  True False False False False False False False
 False False  True False  True False False False False False  True False
 False  True False  True False  True  True False False False False  True
 False False False  True False False False False  True False False False
  True  True False False  True False False False  True  True  True False
 False False False  True False False False  True False False False False
  True False False False False  Tr

## We use these two new variables as a "mask" on our original train data, so we can select only those women, and only those men on board, then calculate the proportion of those who survived:

In [15]:
# Apply each boolean mask to the rows and take the "Survived" column
# (column index 1), giving the survival flags of the women and of the men
# as floats.
survived_column = 1
women_onboard = data[women_only_stats, survived_column].astype(float)
men_onboard = data[men_only_stats, survived_column].astype(float)


In [16]:
# Show the per-passenger survival flags (1.0 = survived) for each group
print('women_onboard=' + str(women_onboard))
print('men_onboard=' + str(men_onboard))

women_onboard=[ 1.  1.  1.  1.  1.  1.  1.  0.  1.  0.  1.  1.  0.  1.  1.  1.  1.  0.
  1.  0.  0.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.
  1.  1.  1.  0.  1.  1.  0.  0.  0.  0.  1.  1.  0.  1.  1.  0.  1.  1.
  0.  1.  1.  1.  1.  0.  1.  0.  0.  1.  1.  1.  1.  1.  1.  1.  0.  0.
  1.  1.  1.  1.  1.  0.  1.  1.  0.  1.  0.  1.  0.  1.  0.  0.  1.  1.
  1.  1.  1.  0.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  0.  0.  1.  1.
  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  0.  1.  1.  1.  1.  0.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  0.  1.  0.  0.  0.  1.  0.  1.  1.  0.  0.
  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.
  1.  0.  1.  1.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  0.  1.  1.  1.  1.  0.  0.  1.  1.  1.  1.  1.  0.  0.  1.  1.  1.  1.
  0.  1.  1.  1.  1.  0.  1.  1.  1.  1.  0.  1.  1.  0.  1.  1.  0.  1.
  0.  1.  0.  1.  1.  1.  1.  0.  0. 

In [17]:
# Survival rate within each group: number of survivors divided by group size
proportion_women_survived = np.sum(women_onboard) / np.size(women_onboard)
proportion_men_survived = np.sum(men_onboard) / np.size(men_onboard)

# Report both rates
print('Proportion of women who survived is %s' % proportion_women_survived)
print('Proportion of men who survived is %s' % proportion_men_survived)

Proportion of women who survived is 0.742038216561
Proportion of men who survived is 0.188908145581


# Read test data and create gender based model as csv

In [18]:
# Open the test set, wrap it in a csv reader and step past the header row
test_file = open('./test.csv', 'rb')
test_file_object = csv.reader(test_file)
header = next(test_file_object)

## Now, let's open a pointer to a new file so we can write to it (this file does not exist yet). Call it something descriptive so that it is recognizable when we upload it:

In [19]:
# Create the submission file for the gender-only model and a csv writer for it
prediction_file = open("genderbasedmodel.csv", "wb")
prediction_file_object = csv.writer(prediction_file)

## We now want to read in the test file row by row, see if it is female or male, and write our survival prediction to a new file. 

In [20]:
# The submission file starts with its header row
prediction_file_object.writerow(["PassengerId", "Survived"])

# Predict '1' (survived) for every woman and '0' (died) for everyone else.
# In each test row, index 0 is the PassengerId and index 3 is the sex.
for row in test_file_object:
    passengerId = row[0]
    survived = '1' if row[3] == 'female' else '0'
    prediction_file_object.writerow([passengerId, survived])

# Both files are finished with
test_file.close()
prediction_file.close()

# Generate model with gender, class, price

## The idea is to create a table which contains just 1's and 0's. The array will be a survival reference table, whereby you read in the test data, find out passenger attributes, look them up in the survival table, and determine if they should be predicted to survive or not. In the case of a model that uses gender, class, and ticket price, you will need an array of 2x3x4 ( [female/male] , [1st / 2nd / 3rd class], [4 bins of prices] ).

In [21]:
# Fares at or above this ceiling are all treated as the top price bracket
fare_ceiling = 40
# Overwrite every fare (column 9) that is >= the ceiling with ceiling - 1,
# i.e. 39, so that it falls inside the last bracket. The array holds
# strings, so the replacement value is stored back as text.
data[ data[0::,9].astype(float) >= fare_ceiling, 9 ] = fare_ceiling - 1.0

# Number of price brackets = 4.
# Floor division (//) is used on purpose: the result is an array dimension and
# a loop bound, so it must stay an integer even where `/` is true division.
fare_bracket_size = 10
number_of_price_brackets = fare_ceiling // fare_bracket_size

# I know there were 1st, 2nd and 3rd classes on board
number_of_classes = 3

# But it's better practice to calculate this from the data directly:
# take the length of an array of unique values in column index 2 (Pclass)
number_of_classes = len(np.unique(data[0::,2]))

# Initialize the survival table with all zeros.
# Axes: [female/male, passenger class, price bracket]
survival_table = np.zeros((2, number_of_classes, number_of_price_brackets))

In [22]:
# What an all-zero 2 x 3 x 4 table looks like
np.zeros(shape=(2, 3, 4))

array([[[ 0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.]],

       [[ 0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.]]])

## Now that these are set up, you can loop through each variable and find all those passengers that agree with the statements:

In [23]:
# These per-passenger arrays do not change inside the loops, so build them once
is_female = data[0::,4] == "female"
pclass_values = data[0::,2].astype(float)
fare_values = data[0:,9].astype(float)

for i in xrange(number_of_classes):            # class index i stands for Pclass i+1
  in_class = pclass_values == i+1

  for j in xrange(number_of_price_brackets):   # price bin j covers [j*size, (j+1)*size)
    in_bracket = (fare_values >= j*fare_bracket_size) \
               & (fare_values < (j+1)*fare_bracket_size)

    # "Survived" values (column 1) of the women, and of everyone else,
    # who are in this class and this price bin
    women_only_stats = data[is_female & in_class & in_bracket, 1]
    men_only_stats = data[~is_female & in_class & in_bracket, 1]

    # The mean of the 0/1 flags is the proportion that survived.
    # An empty group gives nan (with a runtime warning).
    survival_table[0,i,j] = np.mean(women_only_stats.astype(float))
    survival_table[1,i,j] = np.mean(men_only_stats.astype(float))

  ret = ret.dtype.type(ret / rcount)


## Notice that  data[ where function, 1]  means it is finding the Survived column for the conditional criteria which is being called. As the loop starts with i=0 and j=0, the first loop will return the Survived values for all the 1st-class females (i + 1) who paid less than 10 ((j+1)*fare_bracket_size) and similarly all the 1st-class males who paid less than 10.  Before resetting to the top of the loop, we can calculate the proportion of survivors for this particular combination of criteria and record it to our survival table:

## Note! A runtime warning will show when the loop is run, but it won't affect the output. This approach creates a problem if there are no passengers in a given category. For example, in reality no females paid less than 10 dollars for a first class ticket, so NumPy will return a nan for the mean, since it is dividing by zero. To deal with these, we can set them to 0 using a simple statement:

In [24]:
    # Empty categories produced nan; treat them as a survival rate of 0
    survival_table[ np.isnan(survival_table) ] = 0.

In [25]:
# Survival proportions, indexed [sex, class, price bin]
print(survival_table)

[[[ 0.          0.          0.83333333  0.97727273]
  [ 0.          0.91428571  0.9         1.        ]
  [ 0.59375     0.58139535  0.33333333  0.125     ]]

 [[ 0.          0.          0.4         0.38372093]
  [ 0.          0.15873016  0.16        0.21428571]
  [ 0.11153846  0.23684211  0.125       0.24      ]]]


## Each of these numbers is the proportion of survivors for that category of passengers. For example, 0.91428571 signifies that 91.4% of the female passengers with Pclass = 2 in the 10-19 fare bin survived. The numbers should look familiar to you from the pivot table in the previous Excel tutorial.  For our second model, let's again assume any probability greater than or equal to 0.5 should result in our predicting survival -- and less than 0.5 should not. We can update our survival table with:

In [26]:
# Turn the proportions into predictions: below 0.5 -> 0 (died),
# 0.5 or above -> 1 (survived). Both masks are taken from the proportions
# before anything is overwritten.
predicted_died = survival_table < 0.5
predicted_survived = survival_table >= 0.5
survival_table[predicted_died] = 0
survival_table[predicted_survived] = 1

In [27]:
# The table now holds only 0/1 predictions
print(survival_table)

[[[ 0.  0.  1.  1.]
  [ 0.  1.  1.  1.]
  [ 1.  1.  0.  0.]]

 [[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]]


# Use survival table to predict test data

In [28]:
# Re-open the test set and step past its header row
test_file = open('./test.csv', 'rb')
test_file_object = csv.reader(test_file)
header = next(test_file_object)

# Start the submission file for this model with its header row
predictions_file = open("./genderclassfaremodel.csv", "wb")
p = csv.writer(predictions_file)
p.writerow(["PassengerId", "Survived"])

## As with the previous model, we can take the first passenger, look at his/her gender, class, and price of ticket, and assign a Survived label. The problem is that each passenger in the test.csv file is not binned. We should loop through each bin and see if the price of their ticket falls in that bin. If so, we can break the loop (so we don’t go through all the bins) and assign that bin:

In [29]:
# For every passenger in the test set: work out the price bin, then look up
# the 0/1 prediction in survival_table[sex, class, price bin] and write it out.
# In each test row, index 0 is the PassengerId, 1 the Pclass, 3 the sex and
# 8 the fare.
for row in test_file_object:
  # Array indices must be integers, so convert the class once, up front
  pclass = int(float(row[1]))

  try:
    fare = float(row[8])                      # Parse the fare once per passenger
  except ValueError:                          # Some passengers have no fare data,
    bin_fare = 3 - pclass                     # so bin them according to Pclass
  else:
    if fare >= fare_ceiling:
      # Same rule that was applied to the training data (>= ceiling): anything
      # at or above the ceiling belongs to the highest bin. Using >= here
      # ensures a fare exactly equal to the ceiling still gets a bin.
      bin_fare = number_of_price_brackets - 1
    else:
      # Loop through the bins and stop at the one that contains the fare
      for j in xrange(number_of_price_brackets):
        if j * fare_bracket_size <= fare < (j+1) * fare_bracket_size:
          bin_fare = j
          break

  # First axis of the table: 0 = female, 1 = male
  sex_index = 0 if row[3] == 'female' else 1
  p.writerow([row[0], "%d" % int(survival_table[sex_index, pclass - 1, bin_fare])])



In [30]:
# Both files are finished with: release their handles.
for handle in (test_file, predictions_file):
    handle.close()