# Kaggle Titanic
This notebook tests preprocessing steps for the Kaggle Titanic data and fits a simple logistic-regression baseline on the result.

In [1]:
import pandas as pd
import pandas_profiling
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

%matplotlib inline

In [2]:
# Read the Titanic train and test splits from the competition input folder.
train_path, test_path = "../input/titanic/train.csv", "../input/titanic/test.csv"
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

In [3]:
# Stack train on top of test so both splits get identical preprocessing, then
# keep only the modelling columns plus the target.
# NOTE: concat is called without ignore_index, so the original row labels of
# each split are kept and the combined index contains repeated labels.
selected_columns = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "Survived"]
stacked = pd.concat([train, test], axis=0, sort=False)
data = stacked[selected_columns]

In [4]:
# Clean the combined frame: encode Sex numerically and fill the missing Age,
# Fare and Embarked values.
#
# Each column is rebuilt and assigned back through `data.assign(...)` instead
# of `data[col].method(..., inplace=True)`. The chained in-place form mutates a
# temporary Series and, with pandas Copy-on-Write, leaves `data` unchanged.

AGE_FILL_SEED = 42  # fixed seed so the random Age imputation is reproducible
rng = np.random.default_rng(AGE_FILL_SEED)

# Missing ages are drawn as whole years from [mean - std, mean + std), with one
# independent draw per missing row rather than a single value shared by all.
age_avg = data['Age'].mean()
age_std = data['Age'].std()
age_filled = data['Age'].to_numpy(dtype=float, copy=True)
age_missing = np.isnan(age_filled)
age_filled[age_missing] = rng.integers(
    int(age_avg - age_std),  # explicit integer bounds for the integer sampler
    int(age_avg + age_std),
    size=int(age_missing.sum()),
)

# male -> 0, female -> 1; any other value is passed through unchanged, which
# also keeps this cell safe to re-run on already-encoded data.
sex_codes = {'male': 0, 'female': 1}

data = data.assign(
    Sex=data['Sex'].map(lambda label: sex_codes.get(label, label)),
    Age=age_filled,
    Fare=data['Fare'].fillna(data['Fare'].mean()),  # mean of the observed fares
    Embarked=data['Embarked'].fillna('S'),
)

In [5]:
# Count the missing values left in each column after the fills above.
data.isna().sum()

Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked      0
Survived    418
dtype: int64

In [6]:
# Embarked codes as a single-column 2-D array (one row per passenger), the
# layout the one-hot encoder is fitted on.
data_embarked = data['Embarked'].values[:, np.newaxis]
data_embarked

array([['S'],
       ['C'],
       ['S'],
       ...,
       ['S'],
       ['S'],
       ['C']], dtype=object)

In [7]:
# Distinct embarkation codes present after filling the missing entries.
np.unique(data_embarked.ravel())

array(['C', 'Q', 'S'], dtype=object)

In [8]:
# Number of rows in the Embarked column vector.
data_embarked.shape[0]

1309

In [9]:
# One-hot encode the embarkation port into one Embarked_<code> column per port.

# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was later removed, so try the current
# spelling first and fall back to the old one on older installations.
try:
    oh_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    oh_encoder = OneHotEncoder(sparse=False)
oh_encoder.fit(data_embarked)

# Build the column names from the fitted categories instead of renaming the
# encoder's generated "x0_*" labels: `get_feature_names()` is gone from current
# scikit-learn, while `categories_[0]` lists the categories of the single input
# column in the same order as the transform's output columns.
embarked_columns = [f"Embarked_{port}" for port in oh_encoder.categories_[0]]
onehot = pd.DataFrame(oh_encoder.transform(data_embarked),
                      columns=embarked_columns,
                      index=data.index,  # same row labels as `data`, needed for the column-wise concat below
                      dtype=np.int8)
len(onehot), onehot[:10]

(1309,
    Embarked_C  Embarked_Q  Embarked_S
 0           0           0           1
 1           1           0           0
 2           0           0           1
 3           0           0           1
 4           0           0           1
 5           0           1           0
 6           0           0           1
 7           0           0           1
 8           0           0           1
 9           1           0           0)

In [10]:
# First ten raw codes, to compare by eye against the one-hot rows shown above.
data_embarked[:10, :]

array([['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['C']], dtype=object)

In [11]:
# The raw Embarked column is replaced by its one-hot columns, so remove it.
data = data.drop("Embarked", axis=1)

In [12]:
# Append the one-hot Embarked columns side by side with the other features,
# then re-check the per-column missing-value counts.
data = pd.concat((data, onehot), axis="columns")
data.isna().sum()

Pclass          0
Sex             0
Age             0
SibSp           0
Parch           0
Fare            0
Survived      418
Embarked_C      0
Embarked_Q      0
Embarked_S      0
dtype: int64

In [13]:
# Undo the earlier stacking by position: the first len(train) rows are the
# training split, everything after them is the test split.
n_train = len(train)
train, test = data.iloc[:n_train], data.iloc[n_train:]

In [14]:
# Separate the target from the features; the test split's Survived column is
# dropped as well so both feature matrices share the same columns.
target = 'Survived'
X_train, y_train = train.drop(columns=target), train[target]
X_test = test.drop(columns=target)

In [15]:
# Display the training feature matrix that is passed to the model.
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,0,22.0,1,0,7.2500,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,0
2,3,1,26.0,0,0,7.9250,0,0,1
3,1,1,35.0,1,0,53.1000,0,0,1
4,3,0,35.0,0,0,8.0500,0,0,1
...,...,...,...,...,...,...,...,...,...
886,2,0,27.0,0,0,13.0000,0,0,1
887,1,1,19.0,0,0,30.0000,0,0,1
888,3,1,23.0,1,2,23.4500,0,0,1
889,1,0,26.0,0,0,30.0000,1,0,0


In [17]:
# Baseline classifier: logistic regression with the liblinear solver, fitted on
# the preprocessed training features and the Survived labels.
clf = LogisticRegression(solver='liblinear')
clf.fit(X=X_train, y=y_train)

LogisticRegression(solver='liblinear')