In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.naive_bayes import CategoricalNB, GaussianNB, MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Drop unuseful columns

In [3]:
df_to_use = ['Survived', 'Pclass', 'Sex', 'Age', 'Parch', 'SibSp', 'Embarked']
df2 = df[df_to_use]
df2

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,SibSp,Embarked
0,0,3,male,22.0,0,1,S
1,1,1,female,38.0,0,1,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,0,1,S
4,0,3,male,35.0,0,0,S
...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,S
887,1,1,female,19.0,0,0,S
888,0,3,female,,2,1,S
889,1,1,male,26.0,0,0,C


In [4]:
df2.describe()

Unnamed: 0,Survived,Pclass,Age,Parch,SibSp
count,891.0,891.0,714.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.381594,0.523008
std,0.486592,0.836071,14.526497,0.806057,1.102743
min,0.0,1.0,0.42,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0
50%,0.0,3.0,28.0,0.0,0.0
75%,1.0,3.0,38.0,0.0,1.0
max,1.0,3.0,80.0,6.0,8.0


# Fill N/A ages with age mean

In [5]:
df3 = df2.copy()
df3['Age'] = df2['Age'].fillna(df2['Age'].mean())
df3.describe()

Unnamed: 0,Survived,Pclass,Age,Parch,SibSp
count,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.381594,0.523008
std,0.486592,0.836071,13.002015,0.806057,1.102743
min,0.0,1.0,0.42,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0
50%,0.0,3.0,29.699118,0.0,0.0
75%,1.0,3.0,35.0,0.0,1.0
max,1.0,3.0,80.0,6.0,8.0


# Encode Sex

In [6]:
df3['Sex'] = df3['Sex'].apply(lambda x: 0 if x == 'male' else 1)
df3

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,SibSp,Embarked
0,0,3,0,22.000000,0,1,S
1,1,1,1,38.000000,0,1,C
2,1,3,1,26.000000,0,0,S
3,1,1,1,35.000000,0,1,S
4,0,3,0,35.000000,0,0,S
...,...,...,...,...,...,...,...
886,0,2,0,27.000000,0,0,S
887,1,1,1,19.000000,0,0,S
888,0,3,1,29.699118,2,1,S
889,1,1,0,26.000000,0,0,C


In [7]:
df3['Embarked'].value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

# Fill N/A Embarked with the most frequent property

In [8]:
df4 = df3.copy()
df4['Embarked'] = df4['Embarked'].fillna('S')
df4

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,SibSp,Embarked
0,0,3,0,22.000000,0,1,S
1,1,1,1,38.000000,0,1,C
2,1,3,1,26.000000,0,0,S
3,1,1,1,35.000000,0,1,S
4,0,3,0,35.000000,0,0,S
...,...,...,...,...,...,...,...
886,0,2,0,27.000000,0,0,S
887,1,1,1,19.000000,0,0,S
888,0,3,1,29.699118,2,1,S
889,1,1,0,26.000000,0,0,C


# Fill N/A cabins with some virtual values

In [9]:
# df5 = df4.copy()
# df5['Cabin'] = df5['Cabin'].fillna('XXX')
# df5

In [10]:
# lenc = LabelEncoder()
# df5['Cabin'] = lenc.fit_transform(df5['Cabin'])
# df5

# Encode Embark

In [11]:
df5 = df4.copy()
lenc = LabelEncoder()
df5['Embarked'] = lenc.fit_transform(df5['Embarked'])
df5

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,SibSp,Embarked
0,0,3,0,22.000000,0,1,2
1,1,1,1,38.000000,0,1,0
2,1,3,1,26.000000,0,0,2
3,1,1,1,35.000000,0,1,2
4,0,3,0,35.000000,0,0,2
...,...,...,...,...,...,...,...
886,0,2,0,27.000000,0,0,2
887,1,1,1,19.000000,0,0,2
888,0,3,1,29.699118,2,1,2
889,1,1,0,26.000000,0,0,0


In [12]:
X = df5.drop('Survived',axis='columns')
y = df5['Survived']

In [13]:
models = {
    'linear_regression': {
        'model': LinearRegression(),
        'params': {
            'normalize': [True, False]
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(),
        'params': {
            'C': [1,5,10,100],
            'max_iter': [200, 500]
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [5, 10, 100, 200]
        }
    },
    'gradient_boosting': {
        'model': GradientBoostingClassifier(),
        'params': {
            'n_estimators': [5, 10, 100, 200],
            'learning_rate': [0.01, 0.1, 0.5]
        }
    }
}

In [14]:
scores = []

for model, param in models.items():
    clf = GridSearchCV(param['model'],param['params'],cv=5,return_train_score=False)
    clf.fit(X,y)
    scores.append({
        'model': model,
        'best_score': clf.best_score_,
        'best_param': clf.best_params_
    })
    
pd.DataFrame(scores,columns=['model','best_score','best_param'])

Unnamed: 0,model,best_score,best_param
0,linear_regression,0.373661,{'normalize': False}
1,logistic_regression,0.788996,"{'C': 1, 'max_iter': 200}"
2,random_forest,0.805882,{'n_estimators': 200}
3,gradient_boosting,0.830538,"{'learning_rate': 0.01, 'n_estimators': 100}"


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [23]:
clf = GradientBoostingClassifier(learning_rate=0.01,n_estimators=100)
clf.fit(X_train,y_train)

GradientBoostingClassifier(learning_rate=0.01)

In [24]:
clf.score(X_test,y_test)

0.8171641791044776

In [18]:
test = pd.read_csv('test.csv')
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [19]:
df_to_use = ['PassengerId','Pclass','Sex','Age','SibSp','Parch','Embarked']
test = test[df_to_use]

In [20]:
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch
count,418.0,418.0,332.0,418.0,418.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344
std,120.810458,0.841838,14.181209,0.89676,0.981429
min,892.0,1.0,0.17,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0
50%,1100.5,3.0,27.0,0.0,0.0
75%,1204.75,3.0,39.0,1.0,0.0
max,1309.0,3.0,76.0,8.0,9.0


In [21]:
test['Age'].fillna(test['Age'].mean(),inplace=True)
test

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Embarked
0,892,3,male,34.50000,0,0,Q
1,893,3,female,47.00000,1,0,S
2,894,2,male,62.00000,0,0,Q
3,895,3,male,27.00000,0,0,S
4,896,3,female,22.00000,1,1,S
...,...,...,...,...,...,...,...
413,1305,3,male,30.27259,0,0,S
414,1306,1,female,39.00000,0,0,C
415,1307,3,male,38.50000,0,0,S
416,1308,3,male,30.27259,0,0,S


In [22]:
test['Sex'] = test['Sex'].apply(lambda x: 0 if x == 'male' else 1)
test['Embarked'] = lenc.transform(test['Embarked'])
test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Sex'] = test['Sex'].apply(lambda x: 0 if x == 'male' else 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Embarked'] = lenc.transform(test['Embarked'])


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Embarked
0,892,3,0,34.50000,0,0,1
1,893,3,1,47.00000,1,0,2
2,894,2,0,62.00000,0,0,1
3,895,3,0,27.00000,0,0,2
4,896,3,1,22.00000,1,1,2
...,...,...,...,...,...,...,...
413,1305,3,0,30.27259,0,0,2
414,1306,1,1,39.00000,0,0,0
415,1307,3,0,38.50000,0,0,2
416,1308,3,0,30.27259,0,0,2


In [25]:
predicted = clf.predict(test.drop('PassengerId',axis='columns'))

In [32]:
pid = test['PassengerId']
pid = pd.DataFrame(pid,columns=['PassengerId'])
pid

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896
...,...
413,1305
414,1306
415,1307
416,1308


In [33]:
predicted = pd.DataFrame(predicted,columns=['Survived'])
predicted

Unnamed: 0,Survived
0,0
1,0
2,0
3,0
4,0
...,...
413,0
414,1
415,0
416,0


In [34]:
final = pd.concat([pid,predicted],axis='columns')
final

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [35]:
final.to_csv('submission.csv',index=False)