In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
logisticRegr = LogisticRegression()
random_forest = RandomForestClassifier(n_estimators=100)

In [3]:
train=pd.read_csv('D:/kaggle2/Train.csv')
test=pd.read_csv('D:/kaggle2/Test.csv')
print(train.shape)
print(test.shape)

(1009, 14)
(300, 13)


In [4]:
train = train.drop(labels=['home.dest', 'name', 'ticket', 'cabin','body','boat'], axis=1) 
test = test.drop(labels=['home.dest', 'name', 'ticket', 'cabin','body','boat'], axis=1) 

In [13]:
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 11 columns):
pclass         1009 non-null float64
survived       1009 non-null float64
sex            1009 non-null int32
age            1009 non-null int32
sibsp          1009 non-null float64
parch          1009 non-null float64
fare           1009 non-null float64
embarked_C     1009 non-null uint8
embarked_Q     1009 non-null uint8
embarked_S     1009 non-null uint8
family size    1009 non-null float64
dtypes: float64(6), int32(2), uint8(3)
memory usage: 58.2 KB
None


In [5]:
#data wrangling
#converting character data to int type
def wrangle(dataset):
    # sex {male, female} to {0, 1}
    dataset['sex'] = dataset['sex'].map( {'female': 1, 'male': 0} ).astype(int)
    
    # embarked {S, C, Q} => 3 binary variables
    embarked_separate_port = pd.get_dummies(dataset['embarked'], prefix='embarked')
    dataset = pd.concat([dataset, embarked_separate_port], axis=1)
    return dataset.drop('embarked', axis=1)
train=wrangle(train)
test=wrangle(test)

In [14]:
train.head()
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 11 columns):
pclass         1009 non-null float64
survived       1009 non-null float64
sex            1009 non-null int32
age            1009 non-null int32
sibsp          1009 non-null float64
parch          1009 non-null float64
fare           1009 non-null float64
embarked_C     1009 non-null uint8
embarked_Q     1009 non-null uint8
embarked_S     1009 non-null uint8
family size    1009 non-null float64
dtypes: float64(6), int32(2), uint8(3)
memory usage: 58.2 KB


In [6]:
# we'll fill in incomplete values of age. We will do that by calculating the median value found by using age values
#for different sex and class. As we have two sexes (1, 0) and three classes (1, 2, 3), 
#we will have 6 combinations and we will calculate the age from each combination.
guess_ages = np.zeros((2,3))
for i in range(0, 2):
    for j in range(0, 3):
        guess_data = train[(train['sex'] == i) & (train['pclass'] == j+1)]['age'].dropna()
        age_guess = guess_data.median()
        # Convert random age float to nearest .5 age
        guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
 
def wrangle_age(dataset):
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.age.isnull()) & (dataset.sex == i) & (dataset.pclass == j+1),'age'] = guess_ages[i,j]
    dataset['age'] = dataset['age'].astype(int)
    return dataset
 
train = wrangle_age(train)
test = wrangle_age(test)

In [7]:
train['family size']=train['sibsp']+train['parch']+1
test['family size']=test['sibsp']+test['parch']+1

In [8]:
print(train["fare"].mean())
train['fare']=train['fare'].fillna(train["fare"].mean())

33.64741220238095


In [9]:
x_train=train.drop(['survived'],axis=1)
y_train=train['survived']
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 10 columns):
pclass         1009 non-null float64
sex            1009 non-null int32
age            1009 non-null int32
sibsp          1009 non-null float64
parch          1009 non-null float64
fare           1009 non-null float64
embarked_C     1009 non-null uint8
embarked_Q     1009 non-null uint8
embarked_S     1009 non-null uint8
family size    1009 non-null float64
dtypes: float64(5), int32(2), uint8(3)
memory usage: 50.3 KB


In [10]:
logisticRegr.fit(x_train,y_train)
random_forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [12]:
ans=logisticRegr.predict(test).astype(int)
ans1=random_forest.predict(test).astype(int)
print(ans)
print(ans1)
j=300
index=[]
for i in range(j):
    index.append(i)
res=pd.DataFrame({'Id':index,'survived':ans1})
res.head()
res.to_csv('D:/kaggle2/resultf.csv',index=False)

[0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 1 0 1 0 1 0
 0 0 0 1 1 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0
 1 0 1 1 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1 0 1 0 1
 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 1 1 0 1 0 0 0 0 1 0 1 1 0 0 0 1 1 0 0
 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 1 1 0 0 0 0 1 0 1
 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 1 1 1 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0
 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 1 0 1 0 1 0 1 0 0 0 0
 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 1 1 0 1 1 0 0 0 1
 1 0 0 1]
[1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 1 0 1 0 1 0
 0 1 1 1 1 0 1 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 1 1 1 0 0 1 0 0 0 1 1 0 0 0
 1 0 1 1 0 0 1 0 0 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 1 0 1 0 0 1 0 0 1 0 1 0 1
 0 1 0 1 1 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 1 1 0 0 1 0 0 1 1 1 0 0
 1 0 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 1 1 0 1 1 0 0 1 0 1 1 1 0 0 0 1 1 1
 0 0 0 0 0 1 0 