In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.preprocessing import LabelEncoder

In [37]:
data = pd.read_csv('train.csv')
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [38]:
x = data[['Pclass','Sex','Age','Fare']]
y = data['Survived']

In [39]:
x.isna().sum()

Pclass      0
Sex         0
Age       177
Fare        0
dtype: int64

In [40]:
x['Age'] = x['Age'].fillna(x['Age'].mean()) 
x.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Age'] = x['Age'].fillna(x['Age'].mean())


Pclass    0
Sex       0
Age       0
Fare      0
dtype: int64

In [41]:
y.isna().sum()

0

In [42]:
x.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.25
1,1,female,38.0,71.2833
2,3,female,26.0,7.925
3,1,female,35.0,53.1
4,3,male,35.0,8.05


In [43]:
le = LabelEncoder()
x['Sex_n'] = le.fit_transform(x.Sex)
x.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Sex_n'] = le.fit_transform(x.Sex)


Unnamed: 0,Pclass,Sex,Age,Fare,Sex_n
0,3,male,22.0,7.25,1
1,1,female,38.0,71.2833,0
2,3,female,26.0,7.925,0
3,1,female,35.0,53.1,0
4,3,male,35.0,8.05,1


In [45]:
x = x.drop('Sex',axis='columns')
x.head()

Unnamed: 0,Pclass,Age,Fare,Sex_n
0,3,22.0,7.25,1
1,1,38.0,71.2833,0
2,3,26.0,7.925,0
3,1,35.0,53.1,0
4,3,35.0,8.05,1


In [67]:
model_params = {
    'svm':{
        'model': SVC(),
        'params': {
            'C':[1,10,20,50,100],
            'kernel':['rbf','linear']
        }
    },
    'LogisticRegression' : {
        'model': LogisticRegression(),
        'params':{
            'C':[1,5,10]
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params' :{
            'n_estimators':[1,5,10,20]
        }
    },
    'Decision Tree':{
        'model': DecisionTreeClassifier(),
        'params':{
            'criterion':['gini','entropy']
        }
    },
    'MultinomialNB':{
        'model': MultinomialNB(),
        'params':{}
    },
    'GaussianNB':{
        'model': GaussianNB(),
        'params':{}
    }
}

In [61]:
score = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'],mp['params'],cv=5,return_train_score=False)
    clf.fit(x,y)
    score.append({
        'model':model_name,
        'best_score':clf.best_score_,
        'best_param':clf.best_params_
    })
df = pd.DataFrame(score,columns=['model','best_score','best_param'])
df

Unnamed: 0,model,best_score,best_param
0,svm,0.801368,"{'C': 50, 'kernel': 'rbf'}"
1,LogisticRegression,0.786743,{'C': 10}
2,RandomForest,0.804733,{'n_estimators': 5}
3,Decision Tree,0.78233,{'criterion': 'gini'}
4,MultinomialNB,0.649846,{}
5,GaussianNB,0.767742,{}


In [55]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x)
x = scaler.transform(x)

In [70]:
score_s = []

for model_name, mp in model_params.items():
    clf_s = GridSearchCV(mp['model'],mp['params'],cv=5,return_train_score=False)
    clf_s.fit(x,y)
    score_s.append({
        'model':model_name,
        'best_score':clf_s.best_score_,
        'best_param':clf_s.best_params_
    })
df_s = pd.DataFrame(score_s,columns=['model','best_score','best_param'])
df_s

Unnamed: 0,model,best_score,best_param
0,svm,0.806974,"{'C': 100, 'kernel': 'rbf'}"
1,LogisticRegression,0.786743,{'C': 10}
2,RandomForest,0.812617,{'n_estimators': 20}
3,Decision Tree,0.775607,{'criterion': 'gini'}
4,MultinomialNB,0.649846,{}
5,GaussianNB,0.767742,{}


In [72]:
rf = RandomForestClassifier(n_estimators=20,random_state=10)
rf.fit(x,y)

RandomForestClassifier(n_estimators=20, random_state=10)

In [73]:
test_data = pd.read_csv('test.csv')
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [74]:
x_test = test_data[['Pclass','Sex','Age','Fare']]

In [81]:
x_test['Sex_n'] = le.fit_transform(x_test['Sex'])
x_test = x_test.drop('Sex',axis='columns')
x_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test['Sex_n'] = le.fit_transform(x_test['Sex'])


Unnamed: 0,Pclass,Age,Fare,Sex_n
0,3,34.5,7.8292,1
1,3,47.0,7.0,0
2,2,62.0,9.6875,1
3,3,27.0,8.6625,1
4,3,22.0,12.2875,0


In [83]:
x_test.isna().sum()

Pclass     0
Age       86
Fare       1
Sex_n      0
dtype: int64

In [85]:
x_test.Age = x_test.Age.fillna(x_test.Age.mean())
x_test.Fare = x_test.Fare.fillna(x_test.Fare.mean())

In [113]:
scaler.fit(x_test)
x_test = scaler.transform(x_test)

In [114]:
y_pred = rf.predict(x_test)

In [106]:
df = pd.DataFrame(data=test_data['PassengerId'])

In [115]:
df['Survived'] = y_pred

In [120]:
df.to_csv('submission.csv',index=False)