# Apply Machine Learning

In [1]:
# importing the models we want to use
# importing the score metrics for checking the score of our models and predictions

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings
# NOTE(review): with no category or module given, this silences every warning
# for the rest of the session, including any raised by the model-fitting cells
# below. Consider narrowing the filter once the intended target is known.
warnings.filterwarnings('ignore')

In [2]:
# Load the preprocessed training set and preview it before modelling.
# The "Unnamed: 0" column (presumably a saved index) is discarded on load.
train = (
    pd.read_csv("dataset/train_ml.csv")
    .drop(columns=["Unnamed: 0"])
)
train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,SibSp,Parch,Fare,Embarked,age_group
0,0,3,11.0,1.0,1,0,7.25,2.0,4.0
1,1,1,12.0,0.0,1,0,71.2833,0.0,0.0
2,1,3,8.0,0.0,0,0,7.925,2.0,0.0
3,1,1,12.0,0.0,1,0,53.1,2.0,0.0
4,0,3,11.0,1.0,0,0,8.05,2.0,0.0


In [3]:
# Load the preprocessed, unlabelled test set and preview it.
# The "Unnamed: 0" column (presumably a saved index) is discarded on load.
test = (
    pd.read_csv("dataset/test_ml.csv")
    .drop(columns=["Unnamed: 0"])
)
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,SibSp,Parch,Fare,Embarked,age_group
0,892,3,5.0,1.0,0,0,7.8292,1.0,0.0
1,893,3,6.0,0.0,1,0,7.0,2.0,0.0
2,894,2,5.0,1.0,0,0,9.6875,1.0,3.0
3,895,3,5.0,1.0,0,0,8.6625,2.0,0.0
4,896,3,6.0,0.0,1,1,12.2875,2.0,4.0


In [4]:
# Feature matrix used to fit the models: every column of `train` except the
# "Survived" target, i.e. Pclass, Name, Sex, SibSp, Parch, Fare, Embarked
# and age_group.
# The target is removed by name rather than by position, so a change in the
# CSV's column order cannot leak the label into the features.
train_X = train.drop(columns=["Survived"])
train_X

Unnamed: 0,Pclass,Name,Sex,SibSp,Parch,Fare,Embarked,age_group
0,3,11.0,1.0,1,0,7.2500,2.0,4.0
1,1,12.0,0.0,1,0,71.2833,0.0,0.0
2,3,8.0,0.0,0,0,7.9250,2.0,0.0
3,1,12.0,0.0,1,0,53.1000,2.0,0.0
4,3,11.0,1.0,0,0,8.0500,2.0,0.0
...,...,...,...,...,...,...,...,...
884,2,14.0,1.0,0,0,13.0000,2.0,0.0
885,1,8.0,0.0,0,0,30.0000,2.0,4.0
886,3,8.0,0.0,1,2,23.4500,2.0,0.0
887,1,11.0,1.0,0,0,30.0000,0.0,0.0


In [5]:
# Target for model fitting: only the "Survived" column, kept as a
# single-column DataFrame. These labels are the reference the models learn
# from and are scored against.
# Selected by name rather than by position so it does not depend on
# "Survived" being the first column.
train_y = train[["Survived"]]
train_y

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
884,0
885,1
886,0
887,1


In [6]:
# Hold out 20% of the labelled rows for validation with sklearn's
# train_test_split; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_X,
    train_y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# First model: a decision tree classifier.
# A tree splits on feature thresholds, so it can pick up relationships such as
# "does a higher Fare go with survival?" or "does Pclass go with survival?".
# random_state is fixed because sklearn permutes the features at every split;
# without a seed the fitted tree, and therefore the scores, vary between runs.
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train, y_train)
pred = decision_tree_model.predict(X_test)
# Training accuracy is printed first; held-out accuracy is the cell output.
# A large gap between the two indicates overfitting.
print(decision_tree_model.score(X_train, y_train))
accuracy_score(y_test, pred)


0.9479606188466948


0.7640449438202247

In [8]:
# Second model: logistic regression, a linear baseline for the same features.
# max_iter is raised to 1000 so the solver has room to converge.
logistic_regression_model = LogisticRegression(max_iter=1000)
# y_train is flattened to 1-D before fitting: LogisticRegression expects a 1-D
# target and emits a DataConversionWarning when handed a column vector.
logistic_regression_model.fit(X_train, y_train.to_numpy().ravel())
pred = logistic_regression_model.predict(X_test)
# Training accuracy is printed first; held-out accuracy is the cell output.
print(logistic_regression_model.score(X_train, y_train))
accuracy_score(y_test, pred)


0.7974683544303798


0.7696629213483146

In [9]:
# Third model: gradient-boosted trees (XGBoost) with 50 estimators and a
# maximum tree depth of 8.
xgb_model = XGBClassifier(
    n_estimators=50,
    max_depth=8,
)
xgb_model.fit(X_train, y_train)
pred = xgb_model.predict(X_test)
# Training accuracy is printed first; held-out accuracy is the cell output.
print(xgb_model.score(X_train, y_train))
accuracy_score(y_test, pred)

0.9381153305203939


0.8146067415730337

In [10]:
# The XGBoost model has the highest held-out accuracy of the three, so it is
# the one used for the submission.
# Features are selected by the training column names instead of by position,
# so the model receives the same columns, in the same order, it was fitted on;
# a missing column raises KeyError instead of silently shifting the inputs.
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": xgb_model.predict(test[X_train.columns]),
})

In [11]:
# Sanity check: print column dtypes and non-null counts of the submission frame.
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB


In [12]:
# Preview the first rows of the submission frame (PassengerId, Survived).
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [13]:
# Write the predictions to disk; index=False keeps the DataFrame index out of
# the file so it contains only the PassengerId and Survived columns.
submission.to_csv("dataset/submission.csv", index=False)


_**Enes Cavus**_