In [35]:
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
import pandas as pd
import numpy as np
import re

In [2]:
train_df = pd.read_csv('0000000000002429_training_titanic_x_y_train.csv')
test_df = pd.read_csv('0000000000002429_test_titanic_x_test.csv')

In [3]:
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668 entries, 0 to 667
Data columns (total 11 columns):
Pclass      668 non-null int64
Name        668 non-null object
Sex         668 non-null object
Age         536 non-null float64
SibSp       668 non-null int64
Parch       668 non-null int64
Ticket      668 non-null object
Fare        668 non-null float64
Cabin       154 non-null object
Embarked    667 non-null object
Survived    668 non-null int64
dtypes: float64(2), int64(4), object(5)
memory usage: 57.5+ KB


In [4]:
train_df.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived
count,668.0,536.0,668.0,668.0,668.0,668.0
mean,2.296407,29.70056,0.528443,0.407186,32.064552,0.402695
std,0.831638,14.240257,1.080327,0.854695,45.320835,0.490808
min,1.0,0.67,0.0,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,7.925,0.0
50%,3.0,29.0,0.0,0.0,14.75,0.0
75%,3.0,38.25,1.0,0.0,31.275,1.0
max,3.0,80.0,8.0,6.0,512.3292,1.0


In [5]:
train_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S,0


In [7]:
total = train_df.isnull().sum().sort_values(ascending=False)

In [8]:
percent = (train_df.isnull().sum()/train_df.isnull().count())*100

In [21]:
missing_data = pd.concat([total,percent],axis=1,keys=['Total','%'],sort=False)

In [22]:
missing_data

Unnamed: 0,Total,%
Cabin,514,76.946108
Age,132,19.760479
Embarked,1,0.149701
Survived,0,0.0
Fare,0,0.0
Ticket,0,0.0
Parch,0,0.0
SibSp,0,0.0
Sex,0,0.0
Name,0,0.0


In [26]:
train_df.columns.values

array(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked', 'Survived'], dtype=object)

In [28]:
test_df.columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')

In [44]:
data = [train_df,test_df]
for dataset in data:
    mean = train_df["Age"].mean()
    std = train_df["Age"].std()
    is_null = dataset["Age"].isnull().sum()
    rand_age = np.random.randint(mean-std,mean+std,size = is_null)
    age_slice = dataset["Age"].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    dataset["Age"] = age_slice
    dataset["Age"] = train_df["Age"].astype(int)

In [45]:
train_df["Age"].isnull().sum()

0

In [49]:
train_df["Embarked"].describe()

count     667
unique      3
top         S
freq      484
Name: Embarked, dtype: object

In [50]:
common_value = train_df["Embarked"].describe().top

In [51]:
data = [train_df,test_df]
for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].fillna(common_value)

In [52]:
genders = {"male":0,"female":1}
data = [train_df,test_df]
for dataset in data:
    dataset["Sex"] = dataset["Sex"].map(genders)

In [53]:
train_df

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",1,29,1,0,228414,26.0000,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",0,42,0,0,A/5 2466,8.0500,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",0,39,0,0,250655,26.0000,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",1,29,0,4,349909,21.0750,,S,0
4,3,"Sutehall, Mr. Henry Jr",0,25,0,0,SOTON/OQ 392076,7.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...
663,2,"Ilett, Miss. Bertha",1,17,0,0,SO/C 14885,10.5000,,S,1
664,3,"Morrow, Mr. Thomas Rowan",0,27,0,0,372622,7.7500,,Q,0
665,3,"Bing, Mr. Lee",0,32,0,0,1601,56.4958,,S,1
666,3,"Strandberg, Miss. Ida Sofia",1,22,0,0,7553,9.8375,,S,0


In [54]:
ports = {"S":0,"C":1,"Q":2}
data = [train_df,test_df]
for dataset in data:
    dataset["Embarked"] = dataset["Embarked"].map(ports)

In [55]:
train_df

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",1,29,1,0,228414,26.0000,,0,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",0,42,0,0,A/5 2466,8.0500,,0,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",0,39,0,0,250655,26.0000,,0,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",1,29,0,4,349909,21.0750,,0,0
4,3,"Sutehall, Mr. Henry Jr",0,25,0,0,SOTON/OQ 392076,7.0500,,0,0
...,...,...,...,...,...,...,...,...,...,...,...
663,2,"Ilett, Miss. Bertha",1,17,0,0,SO/C 14885,10.5000,,0,1
664,3,"Morrow, Mr. Thomas Rowan",0,27,0,0,372622,7.7500,,2,0
665,3,"Bing, Mr. Lee",0,32,0,0,1601,56.4958,,0,1
666,3,"Strandberg, Miss. Ida Sofia",1,22,0,0,7553,9.8375,,0,0


In [57]:
train_df = train_df.drop(['Ticket'],axis=1)
test_df = test_df.drop(['Ticket'],axis=1)

In [58]:
train_df

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",1,29,1,0,26.0000,,0,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",0,42,0,0,8.0500,,0,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",0,39,0,0,26.0000,,0,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",1,29,0,4,21.0750,,0,0
4,3,"Sutehall, Mr. Henry Jr",0,25,0,0,7.0500,,0,0
...,...,...,...,...,...,...,...,...,...,...
663,2,"Ilett, Miss. Bertha",1,17,0,0,10.5000,,0,1
664,3,"Morrow, Mr. Thomas Rowan",0,27,0,0,7.7500,,2,0
665,3,"Bing, Mr. Lee",0,32,0,0,56.4958,,0,1
666,3,"Strandberg, Miss. Ida Sofia",1,22,0,0,9.8375,,0,0


In [59]:
train_df = train_df.drop(['Cabin'],axis=1)
test_df = test_df.drop(['Cabin'],axis=1)

In [61]:
train_df = train_df.drop(['Name'],axis=1)
test_df = test_df.drop(['Name'],axis=1)

In [62]:
train_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,2,1,29,1,0,26.0000,0,1
1,3,0,42,0,0,8.0500,0,0
2,2,0,39,0,0,26.0000,0,0
3,3,1,29,0,4,21.0750,0,0
4,3,0,25,0,0,7.0500,0,0
...,...,...,...,...,...,...,...,...
663,2,1,17,0,0,10.5000,0,1
664,3,0,27,0,0,7.7500,2,0
665,3,0,32,0,0,56.4958,0,1
666,3,1,22,0,0,9.8375,0,0


In [64]:
X_train = train_df.drop("Survived",axis=1)
Y_train = train_df["Survived"]
test_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,2,0,29,1,1,36.7500,0
1,1,1,42,0,0,25.9292,0
2,3,0,39,0,0,7.7375,2
3,2,1,29,2,1,27.0000,0
4,1,0,25,0,0,26.2875,0
...,...,...,...,...,...,...,...
218,3,0,30,1,0,7.9250,0
219,1,0,25,0,0,26.5500,0
220,1,1,29,1,0,108.9000,1
221,3,0,20,0,0,6.4500,0


In [66]:
algo = LogisticRegression()
algo.fit(X_train,Y_train)
Y_pred = algo.predict(test_df)



In [67]:
Y_pred

array([0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 0])

In [68]:
np.savetxt('Titanic_Dataset_ans.csv', Y_pred, delimiter=',') 