In [1]:
import pandas as pd

In [5]:
titanic_data = pd.read_csv('https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/titanic-train.csv.txt')

In [7]:
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Things to do
* Make PassengerId as Index column
* feature data is everything but Survived
* target data is survived column

In [8]:
titanic_data = pd.read_csv('https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/titanic-train.csv.txt', index_col='PassengerId')

In [9]:
titanic_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
feature_data = titanic_data.drop(columns=['Survived'])

In [11]:
target_data = titanic_data.Survived

In [13]:
target_data.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [14]:
feature_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 10 columns):
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 76.6+ KB


In [17]:
feature_data = feature_data.drop(columns=['Name','Ticket','Cabin'])

### Next Steps
* Split data into different types
* For different type of data, have different pipeline
* In each pipeline, have imputers to handle missing data
* Combine the pipelines using column transformer
* Create bigger pipeline using columntransforner & estimator
* Split the data into train & test
* Train the pipeline
* Get base line accuracy with linear & non-linear models
* Now, get into hyper-parameter tuning with best model to improve that further

In [20]:
cat_feature_data = feature_data[['Sex','Embarked']]

In [22]:
float_int_feature_data = feature_data[['Age','SibSp','Parch','Fare']]

In [29]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler

In [26]:
cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), OrdinalEncoder())

In [28]:
float_int_pipeline = make_pipeline(SimpleImputer(strategy='median'), MinMaxScaler()) 

In [31]:
preprocessor = make_column_transformer(
    (cat_pipeline, cat_feature_data.columns),
    (float_int_pipeline, float_int_feature_data.columns),
    remainder='passthrough'

)

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [34]:
pipeline = make_pipeline(preprocessor, RandomForestClassifier())

In [39]:
pipeline = make_pipeline(preprocessor, LogisticRegression())

In [40]:
from sklearn.model_selection import train_test_split

In [41]:
trainX, testX, trainY, testY = train_test_split(feature_data, target_data)

In [42]:
pipeline.fit(trainX, trainY)



Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline-1',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='most_frequent',
                             

In [43]:
pipeline.score(testX,testY)

0.7802690582959642

In [48]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [49]:
names = ['RandomForest','GaussianNB', 'LogisticRegression', 'DecionTrees', 'KNN']
models = [RandomForestClassifier(n_estimators=1000), GaussianNB(), LogisticRegression(), DecisionTreeClassifier(), KNeighborsClassifier()]
for name,estimator in zip( names, models):
    pipeline = make_pipeline(preprocessor,estimator)
    pipeline.fit(trainX,trainY)
    print(name, pipeline.score(testX,testY))

RandomForest 0.7937219730941704
GaussianNB 0.7623318385650224
LogisticRegression 0.7802690582959642
DecionTrees 0.7443946188340808
KNN 0.7802690582959642


