# Titanic model building

In [1]:
from pandas.core.common import SettingWithCopyWarning
import warnings

warnings.filterwarnings("ignore", category=SettingWithCopyWarning)

In [2]:
from loadDataUtils import loadDataUtils

In [3]:
path_train = r'C:\Users\39320\Desktop\myProjects_python\Titanic\data\train.csv'
path_test = r'C:\Users\39320\Desktop\myProjects_python\Titanic\data\test.csv'
data = loadDataUtils(path_train, path_test)

In [4]:
df_train, df_test = data.get_train_and_test()

### Clean data

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
from titanicPreprocessing import preprocess

In [7]:
p = preprocess(df_train.copy(), df_test.copy())
p.do_preprocess()

In [8]:
train, test = p.get_data()

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 39 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PassengerId          889 non-null    int64  
 1   Survived             889 non-null    int64  
 2   Pclass               889 non-null    int64  
 3   Age                  889 non-null    float64
 4   SibSp                889 non-null    int64  
 5   Parch                889 non-null    int64  
 6   Fare                 889 non-null    float64
 7   cabin_multiple       889 non-null    int64  
 8   Sex_female           889 non-null    uint8  
 9   Sex_male             889 non-null    uint8  
 10  Embarked_C           889 non-null    uint8  
 11  Embarked_Q           889 non-null    uint8  
 12  Embarked_S           889 non-null    uint8  
 13  cabin_letter_0       889 non-null    uint8  
 14  cabin_letter_A       889 non-null    uint8  
 15  cabin_letter_B       889 non-null    uin

In [10]:
train_target = train['Survived']
train.drop(columns=['Survived'], inplace=True)

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train, train_target, test_size=0.3, random_state=42)

## Model building

In [12]:
from sklearn.model_selection import cross_val_score

#### Gaussian NB

In [13]:
from sklearn.naive_bayes import GaussianNB

#I usually use Naive Bayes as a baseline for my classification tasks
gnb = GaussianNB()
cv = cross_val_score(gnb,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.68      0.808     0.7983871 0.75      0.75     ]
0.7572774193548387


#### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter = 2000)
cv = cross_val_score(lr,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.832      0.856      0.83064516 0.83870968 0.7983871 ]
0.8311483870967742


In [15]:
lr = LogisticRegression(max_iter = 20000)
cv = cross_val_score(lr,X_train,y_train,cv=10)
print(cv)
print(cv.mean())

[0.87301587 0.77777778 0.83870968 0.85483871 0.87096774 0.79032258
 0.88709677 0.80645161 0.80645161 0.80645161]
0.8312083973374296


#### Deciosion tree

In [18]:
from sklearn import tree

dt = tree.DecisionTreeClassifier(random_state = 1)
cv = cross_val_score(dt,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.728      0.8        0.76612903 0.75       0.73387097]
0.7556


#### k nearest neighbor

In [20]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
cv = cross_val_score(knn,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.544      0.48       0.5483871  0.54032258 0.53225806]
0.5289935483870968


#### random forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=1000, random_state = 1)
cv = cross_val_score(rf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.864      0.88       0.84677419 0.86290323 0.81451613]
0.8536387096774194


#### Support vector machine

In [24]:
from sklearn.svm import SVC

svc = SVC()
cv = cross_val_score(svc,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.616      0.616      0.61290323 0.61290323 0.61290323]
0.614141935483871


#### XGboost

In [28]:
from xgboost import XGBClassifier

xgb = XGBClassifier()
cv = cross_val_score(xgb,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.808      0.856      0.82258065 0.82258065 0.83870968]
0.8295741935483871


#### Voting classifier