In [123]:
# import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [37]:
# Read the raw training data from the CSV file next to this notebook.
train = pd.read_csv(filepath_or_buffer="train.csv")

### strategy / thoughts
- Name, Ticket, Cabin are treated as unimportant features
- maybe the cabin number is important (it may represent location in the ship?), but using it requires a lot of feature engineering (one-hot encoding, PCA?)
- how to deal with missing data: Age and Cabin have many null values (177 and 687, respectively), and Embarked is missing in 2 rows

#### for missing data:
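- Embarked: use the mode; alternatively, predict the port of embarkation by treating Embarked as y and all other features as X, or just delete the affected rows
- Age: use the median (note: `clean_data` below currently fills with the mean)
- Cabin: use a special value "U" to denote a missing cabin (note: `clean_data` below currently drops the Cabin column instead)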


#### results:
##### without looking at other people's notebooks
- accuracy is: 0.769 for decision tree, 0.820 for random forest (100 trees), 0.813 for logistic regression, with a train/test split using test_size=0.33
##### taking into account lessons from other people's notebooks
- use cross validation to pick hyper parameters
- use feature selection (sklearn.feature_selection.RFECV) to pick best features?
- use (TODO: unfinished note)

In [67]:
# Show the column labels of the raw training frame.
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# Count the missing values in each column of the raw training frame.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [103]:
def clean_data(train):
    """Turn a raw passenger frame into a numeric, model-ready frame.

    Steps, in order:
      1. Drop the free-text columns ``Name``, ``Ticket`` and ``Cabin``
         (one-hot encoding ``Cabin`` would add too many dimensions).
      2. Fill missing ``Embarked`` values with the most frequent port.
      3. Fill missing ``Age`` values with the mean age.
      4. One-hot encode ``Sex`` and ``Embarked`` and append the indicator
         columns after the remaining columns.
      5. Linearly interpolate any gaps still left in the remaining columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw frame containing at least the columns ``Name``, ``Ticket``,
        ``Cabin``, ``Embarked``, ``Age`` and ``Sex``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns, with ``Sex`` and
        ``Embarked`` replaced by one indicator column per category.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``train``.
    """
    # Work on a copy so the caller's frame is never mutated.
    cleaned_data = train.copy()

    # Drop names, ticket and cabin (one-hotting cabin adds too many dimensions).
    cleaned_data = cleaned_data.drop(columns=["Name", "Ticket", "Cabin"])

    # Replace null values for embarked with the mode. `mode()` returns a
    # Series, so take its first entry; passing the Series through `str()`
    # would fill the gaps with the Series' printed representation instead of
    # a port code and create a bogus category.
    embarked_mode = cleaned_data["Embarked"].mode()
    if not embarked_mode.empty:  # empty when every Embarked value is missing
        cleaned_data["Embarked"] = cleaned_data["Embarked"].fillna(embarked_mode.iloc[0])

    # Replace null values for age with the average. Assign the result back
    # rather than using a chained in-place fill, which newer pandas versions
    # do not propagate to the parent frame.
    cleaned_data["Age"] = cleaned_data["Age"].fillna(cleaned_data["Age"].mean())

    # One-hot encode the categorical variables sex and embarked.
    sex = cleaned_data.pop("Sex")
    embarked = cleaned_data.pop("Embarked")
    temp = pd.concat([cleaned_data, pd.get_dummies(sex), pd.get_dummies(embarked)], axis=1)

    # Fill any remaining gaps by linear interpolation between neighbouring rows.
    return temp.interpolate()
# Build the cleaned training frame from the raw training data.
cleaned_data = clean_data(train)

In [87]:
# Separate the target from the features without mutating `cleaned_data`,
# so this cell can be re-run safely: an in-place pop removes "Survived" from
# the frame and raises KeyError the second time the cell is executed.
y = cleaned_data["Survived"]
X = cleaned_data.drop(columns=["Survived"])
X.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,1,3,22.0,1,0,7.25,0,1,0,0,1
1,2,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,3,26.0,0,0,7.925,1,0,0,0,1
3,4,1,35.0,1,0,53.1,1,0,0,0,1
4,5,3,35.0,0,0,8.05,0,1,0,0,1


In [112]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [124]:
# Fit the three candidate classifiers on the training split.
# - random_state pins the randomness inside the tree-based models so the
#   reported accuracies are reproducible from one run to the next.
# - max_iter is raised for logistic regression because the features are not
#   scaled and the solver can stop before converging at the default limit.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)



In [125]:
# Report hold-out accuracy for each fitted model. The third model is a
# LogisticRegression, so it is labelled as such (not "linear regression").
for label, fitted in (
    ("decision tree classifier", dt_model),
    ("random forest classifier", rf_model),
    ("logistic regression", lr_model),
):
    print(f"accuracy is: {accuracy_score(y_test, fitted.predict(X_test), normalize=True)} for {label}")


accurary is: 0.7593220338983051 for decision tree classfier
accurary is: 0.8169491525423729 for random forest classfier
accurary is: 0.8135593220338984 for linear regression


In [98]:
# Load the test file and run it through the same cleaning function used for
# the training data, then preview the first rows.
test = pd.read_csv(filepath_or_buffer="test.csv")
cleaned_test = clean_data(test)
cleaned_test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
0,892,3,34.5,0,0,7.8292,0,1,0,1,0
1,893,3,47.0,1,0,7.0,1,0,0,0,1
2,894,2,62.0,0,0,9.6875,0,1,0,1,0
3,895,3,27.0,0,0,8.6625,0,1,0,0,1
4,896,3,22.0,1,1,12.2875,1,0,0,0,1


In [102]:
# Predict on the cleaned test set (`cleaned_test` comes from the cell above,
# so the file is not loaded and cleaned a second time).
# The name `model` is never defined in this notebook, so the cell failed on a
# fresh kernel; use the random forest, which scored best on the hold-out split.
# NOTE(review): confirm that the random forest is the intended submission model.
# Align the test columns (names and order) with the training features; an
# indicator column absent from the test data is filled with 0.
test_features = cleaned_test.reindex(columns=X.columns, fill_value=0)
submission = rf_model.predict(test_features)
submission

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,