## Titanic

In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score


from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
traindf = pd.read_csv("train.csv")
testdf = pd.read_csv("test.csv")
genderdf = pd.read_csv("gender_submission.csv")

In [3]:
#kategorik değerler --> numeric değere dönüştürüldü
traindf["Sex"] = traindf["Sex"].map({'male': 1, 'female' : 2})
testdf["Sex"] = testdf["Sex"].map({'male' : 1, 'female' : 2})

traindf["Embarked"] = traindf["Embarked"].map({'S' : 1, 'Q' : 2, 'C' : 3})
testdf["Embarked"] = testdf["Embarked"].map({'S' : 1, 'Q' : 2, 'C' : 3})

In [4]:
#boş değerlere atama yapıldı
traindf["Age"] = traindf["Age"].fillna(traindf["Age"].mean())
testdf["Age"] = testdf["Age"].fillna(testdf["Age"].mean())

traindf["Embarked"] = traindf["Embarked"].fillna(1)

testdf["Fare"] = testdf["Fare"].fillna(testdf["Fare"].mean())

In [5]:
traindf = traindf.drop("Name", axis = 1)
testdf = testdf.drop("Name", axis = 1)

traindf = traindf.drop("Ticket", axis = 1)
testdf = testdf.drop("Ticket", axis = 1)

traindf = traindf.drop("Cabin", axis = 1)
testdf = testdf.drop("Cabin", axis = 1)

In [6]:
X_train = traindf.drop(['PassengerId', 'Survived'], axis = 1).astype('float64')
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3.0,1.0,22.0,1.0,0.0,7.25,1.0
1,1.0,2.0,38.0,1.0,0.0,71.2833,3.0
2,3.0,2.0,26.0,0.0,0.0,7.925,1.0
3,1.0,2.0,35.0,1.0,0.0,53.1,1.0
4,3.0,1.0,35.0,0.0,0.0,8.05,1.0


In [7]:
y_train = traindf[['Survived']]
y_train.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [8]:
X_test = testdf.drop(['PassengerId'], axis = 1).astype('float64')
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3.0,1.0,34.5,0.0,0.0,7.8292,2.0
1,3.0,2.0,47.0,1.0,0.0,7.0,1.0
2,2.0,1.0,62.0,0.0,0.0,9.6875,2.0
3,3.0,1.0,27.0,0.0,0.0,8.6625,1.0
4,3.0,2.0,22.0,1.0,1.0,12.2875,1.0


In [9]:
y_test = genderdf["Survived"]
y_test[0:5]

0    0
1    1
2    0
3    0
4    1
Name: Survived, dtype: int64

In [10]:
from xgboost import XGBClassifier

In [11]:
xgb_model = XGBClassifier().fit(X_train, y_train)



In [12]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8133971291866029

In [13]:
xgb_params = {
        'n_estimators': [100, 500, 1000, 2000],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5,6],
        'learning_rate': [0.1,0.01,0.02,0.05],
        "min_samples_split": [2,5,10]}

In [14]:
xgb = XGBClassifier()

xgb_cv_model = GridSearchCV(xgb, xgb_params, cv = 10, n_jobs = -1, verbose = 2)

In [15]:
xgb_cv_model.fit(X_train, y_train)

Fitting 10 folds for each of 576 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   58.0s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 27.0min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 33.4min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 37.6min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 42.2min
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed: 47.8min finished


Parameters: { min_samples_split } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constra...
                                     subsample=None, tree_method=None,
                                     use_label_encoder=True,
                                     validate_parameters=None, verbosity=None),
             iid='warn', n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.01, 0.02, 0.05],
      

In [16]:
xgb_cv_model.best_params_

{'learning_rate': 0.02,
 'max_depth': 6,
 'min_samples_split': 2,
 'n_estimators': 100,
 'subsample': 0.8}

In [17]:
xgb = XGBClassifier(learning_rate = 0.02, 
                    max_depth = 6,
                    min_samples_split = 2,
                    n_estimators = 100,
                    subsample = 0.8)

In [18]:
xgb_tuned =  xgb.fit(X_train,y_train)

Parameters: { min_samples_split } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [19]:
y_pred = xgb_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

0.9019138755980861

In [20]:
id = testdf[["PassengerId"]]
id.head()

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896


In [21]:
y_pred = pd.DataFrame(y_pred, columns=["Survived"])
y_pred.head()

Unnamed: 0,Survived
0,0
1,0
2,0
3,0
4,1


In [22]:
sonuc = pd.concat([id, y_pred], axis = 1)

In [23]:
sonuc.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [24]:
sonuc = sonuc.set_index('PassengerId')
sonuc.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [25]:
sonuc.to_csv("TitanicML.csv")