In [30]:
%matplotlib inline
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy
import csv
import string

In [41]:
# Load the Titanic train/test CSVs; the relative paths resolve against the
# notebook's current working directory.
train = pandas.read_csv("train.csv")
test = pandas.read_csv("test.csv")

# Single-argument print(...) emits the same text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("All data read into train and test")
print("")
print("Train dimensions are " + str(train.shape))
print("Test dimensions are " + str(test.shape))
print("")

# Column dtypes and non-null counts, used below to decide what to drop or fill.
train.info()

All data read into train and test

Train dimensions are (891, 12)
Test dimensions are (418, 11)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [32]:
#Dropping PassengerId, Name, and Ticket because they are not used as features
    #PassengerId is a unique per-row identifier
    #Name and Ticket are free-form values that are not used by this model
#Dropping Cabin because it is mostly missing (only 204 of 891 train rows have
#a value), which is too sparse to fill in for either train or test

columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']

# NOTE: this mutates train/test in place, so re-running the cell without
# reloading the data raises KeyError (the columns are already gone).
for frame in (train, test):
    # axis is passed by keyword; the positional form is rejected by newer pandas.
    frame.drop(columns_to_drop, axis=1, inplace=True)


train.info()
print("")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         332 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        417 non-null float64
Embarked    418 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB


In [33]:
#Age, Embarked, and Fare are useful features with missing values, so the gaps are filled in:
#   Age and Fare get the column median; Embarked gets the column mode.
#Each frame is filled from its own statistics (train from train, test from test).


ageReplaceTrain = train['Age'].fillna(train['Age'].median())
# mode() returns a Series of the most frequent value(s); [0] takes the first.
embarkedReplaceTrain = train['Embarked'].fillna(train['Embarked'].mode()[0])
train['Age'] = ageReplaceTrain
train['Embarked'] = embarkedReplaceTrain
#Fare has no missing values in train (891 non-null), so it needs no fill here

ageReplaceTest = test['Age'].fillna(test['Age'].median())
fareReplaceTest = test['Fare'].fillna(test['Fare'].median())
test['Age'] = ageReplaceTest
test['Fare'] = fareReplaceTest
#Embarked has no missing values in test (418 non-null), so it needs no fill here

# Confirm that every remaining column is now fully populated.
train.info()
print("")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         418 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
Embarked    418 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB


In [34]:
#Age is binned into ranges: babies, children, teenagers, adults,
#and seniors (each range gets its own category label)
#This turns a continuous value into a small set of categories











#TODO: include a histogram here (e.g. of the Age distribution)












def processAge(tempData,cut_points,label_names):
    """Add an ``Age_categories`` column that bins ``Age`` into labelled ranges.

    Parameters
    ----------
    tempData : pandas.DataFrame
        Frame with an ``Age`` column. It is modified in place.
    cut_points : sequence of numbers
        Bin edges handed to ``pandas.cut``.
    label_names : sequence of str
        One label per bin, i.e. ``len(cut_points) - 1`` entries.

    Returns
    -------
    pandas.DataFrame
        The same ``tempData`` object, now carrying ``Age_categories``.
    """
    # include_lowest=True closes the first bin on the left, so an age equal to
    # cut_points[0] (e.g. exactly 0) gets the first label instead of NaN.
    tempData["Age_categories"] = pandas.cut(
        tempData["Age"],
        bins=cut_points,
        labels=label_names,
        include_lowest=True,
    )
    return tempData

# Age bands by upper edge, starting from 0:
# Baby <= 6, Child <= 12, Teenager <= 19, Adult <= 50, Senior <= 100
cut_points = [0,6,12,19,50,100]
label_names = ['Baby','Child','Teenager','Adult','Senior']

train = processAge(train,cut_points,label_names)
test = processAge(test,cut_points,label_names)


# Single-argument print(...) works on both Python 2 and Python 3.
train.info()
print("")
test.info()
# Last expression of the cell: rendered as a table preview of the binned test set.
test.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived          891 non-null int64
Pclass            891 non-null int64
Sex               891 non-null object
Age               891 non-null float64
SibSp             891 non-null int64
Parch             891 non-null int64
Fare              891 non-null float64
Embarked          891 non-null object
Age_categories    891 non-null category
dtypes: category(1), float64(2), int64(4), object(2)
memory usage: 56.8+ KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
Pclass            418 non-null int64
Sex               418 non-null object
Age               418 non-null float64
SibSp             418 non-null int64
Parch             418 non-null int64
Fare              418 non-null float64
Embarked          418 non-null object
Age_categories    418 non-null category
dtypes: category(1), float64(2), int64(3), object(2)
memory usage: 2

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_categories
0,3,male,34.5,0,0,7.8292,Q,Adult
1,3,female,47.0,1,0,7.0,S,Adult
2,2,male,62.0,0,0,9.6875,Q,Senior
3,3,male,27.0,0,0,8.6625,S,Adult
4,3,female,22.0,1,1,12.2875,S,Adult
5,3,male,14.0,0,0,9.225,S,Teenager
6,3,female,30.0,0,0,7.6292,Q,Adult
7,2,male,26.0,1,1,29.0,S,Adult
8,3,female,18.0,0,0,7.2292,C,Teenager
9,3,male,21.0,2,0,24.15,S,Adult


In [35]:
#Pclass, Sex, Embarked, and Age_categories are categorical features, so each is one-hot encoded into binary indicator columns

def create_dummy(trainingData,columnName):
    """Return ``trainingData`` with indicator columns for ``columnName`` appended.

    One column per distinct value of ``trainingData[columnName]`` is produced by
    ``pandas.get_dummies``, each named ``<columnName>_<value>``. The source
    column is kept and the input frame is not modified; a new frame is returned.
    """
    indicator_columns = pandas.get_dummies(trainingData[columnName], prefix=columnName)
    return pandas.concat([trainingData, indicator_columns], axis=1)

# Append indicator columns for every categorical feature, on both frames.
for column in ["Pclass","Sex","Embarked","Age_categories"]:
    train = create_dummy(train,column)
    test = create_dummy(test,column)

#Drop the original columns now that their one-hot encoded versions exist.
#Age goes too: it is represented through the Age_categories indicators.
encoded_sources = ['Sex', 'Pclass', 'Age', 'Embarked', 'Age_categories']

# axis is passed by keyword; the positional form is rejected by newer pandas.
train.drop(encoded_sources, axis=1, inplace=True)
test.drop(encoded_sources, axis=1, inplace=True)

test.head()

Unnamed: 0,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Age_categories_Baby,Age_categories_Child,Age_categories_Teenager,Age_categories_Adult,Age_categories_Senior
0,0,0,7.8292,0,0,1,0,1,0,1,0,0,0,0,1,0
1,1,0,7.0,0,0,1,1,0,0,0,1,0,0,0,1,0
2,0,0,9.6875,0,1,0,0,1,0,1,0,0,0,0,0,1
3,0,0,8.6625,0,0,1,0,1,0,0,1,0,0,0,1,0
4,1,1,12.2875,0,0,1,1,0,0,0,1,0,0,0,1,0


In [36]:
# Model inputs, grouped by the feature they were derived from.
featureList = [
    'SibSp', 'Parch', 'Fare',
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Sex_female', 'Sex_male',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
    'Age_categories_Baby', 'Age_categories_Child', 'Age_categories_Teenager',
    'Age_categories_Adult', 'Age_categories_Senior',
]

# Default hyperparameters; the fitted estimator is the cell's displayed value.
logReg = LogisticRegression()
logReg.fit(train[featureList], train["Survived"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [37]:
# Select the columns by name so the model receives exactly the features it was
# fitted on, in the same order, rather than relying on test's column layout.
prediction = logReg.predict(test[featureList])

In [38]:

# PassengerId was dropped from `test` earlier, so it is re-read from the CSV.
survivorData = pandas.read_csv("test.csv")
survivorColumn = survivorData['PassengerId']

# Two columns: PassengerId, predicted Survived.
# astype returns a new array, so its result has to be kept; 'int32' is the
# canonical dtype name (the capitalised 'Int32' alias is deprecated in numpy).
survivorMatrix = numpy.column_stack((survivorColumn,prediction)).astype('int32')

print(survivorMatrix)

[[ 892    0]
 [ 893    1]
 [ 894    0]
 [ 895    0]
 [ 896    0]
 [ 897    0]
 [ 898    1]
 [ 899    0]
 [ 900    1]
 [ 901    0]
 [ 902    0]
 [ 903    0]
 [ 904    1]
 [ 905    0]
 [ 906    1]
 [ 907    1]
 [ 908    0]
 [ 909    0]
 [ 910    1]
 [ 911    1]
 [ 912    0]
 [ 913    0]
 [ 914    1]
 [ 915    0]
 [ 916    1]
 [ 917    0]
 [ 918    1]
 [ 919    0]
 [ 920    0]
 [ 921    0]
 [ 922    0]
 [ 923    0]
 [ 924    0]
 [ 925    0]
 [ 926    0]
 [ 927    0]
 [ 928    1]
 [ 929    1]
 [ 930    0]
 [ 931    0]
 [ 932    0]
 [ 933    0]
 [ 934    0]
 [ 935    1]
 [ 936    1]
 [ 937    0]
 [ 938    1]
 [ 939    0]
 [ 940    1]
 [ 941    1]
 [ 942    0]
 [ 943    0]
 [ 944    1]
 [ 945    1]
 [ 946    0]
 [ 947    0]
 [ 948    0]
 [ 949    0]
 [ 950    0]
 [ 951    1]
 [ 952    0]
 [ 953    0]
 [ 954    0]
 [ 955    1]
 [ 956    0]
 [ 957    1]
 [ 958    1]
 [ 959    0]
 [ 960    1]
 [ 961    1]
 [ 962    1]
 [ 963    0]
 [ 964    1]
 [ 965    1]
 [ 966    1]
 [ 967    1]
 [ 968    0]

  if __name__ == '__main__':


In [39]:
# comments='' stops numpy from prefixing the header line with "# ", so the file
# starts with "PassengerId,Survived" and needs no manual edit afterwards.
numpy.savetxt("predictions.csv",survivorMatrix,header="PassengerId,Survived",delimiter=',',fmt="%d",comments='')