# Titanic Competition 

In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.model_selection import cross_val_score

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

## Cleaning the Code

In [2]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Filling in the NaN Ages

In [3]:
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())

#finding the missing age values using the Sex and Pclass instead of just age
grouped_train = train_data.groupby(['Sex','Pclass'])
grouped_median_train = grouped_train.median()
grouped_median_train = grouped_median_train.reset_index()[['Sex', 'Pclass', 'Age']]

def fill_age(row):
    condition = (
        (grouped_median_train['Sex'] == row['Sex']) &  
        (grouped_median_train['Pclass'] == row['Pclass'])
    ) 
    return grouped_median_train[condition]['Age'].values[0]

def process_age():
    global train_data
    # a function that fills the missing values of the Age variable
    train_data['Age'] = train_data.apply(lambda row: fill_age(row) if np.isnan(row['Age']) else row['Age'], axis=1)
    return train_data

train_data = process_age()

### doing the same thing for test data

In [4]:
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())

#finding the missing age values using the Sex and Pclass instead of just age
grouped_test = test_data.groupby(['Sex','Pclass'])
grouped_median_test = grouped_test.median()
grouped_median_test = grouped_median_test.reset_index()[['Sex', 'Pclass', 'Age']]

def fill_age(row):
    condition = (
        (grouped_median_test['Sex'] == row['Sex']) &  
        (grouped_median_test['Pclass'] == row['Pclass'])
    ) 
    return grouped_median_test[condition]['Age'].values[0]

def process_age():
    global test_data
    # a function that fills the missing values of the Age variable
    test_data['Age'] = test_data.apply(lambda row: fill_age(row) if np.isnan(row['Age']) else row['Age'], axis=1)
    return test_data

test_data = process_age()

I used https://github.com/ahmedbesbes/How-to-score-0.8134-in-Titanic-Kaggle-Challenge/blob/master/article_1.ipynb to figure out the code for finding a more accurate guessed age of the passenger

## Rounding Age and filling NaN in Fare

In [5]:
#Round age to the ones place
train_data['Age'] = train_data['Age'].apply(np.floor).astype(int)
test_data['Age'] = test_data['Age'].apply(np.floor).astype(int)

#Fare is missing one value so I found the mean and inplaced it
test_data['Fare'].fillna(test_data['Fare'].mean(), inplace = True)


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    int64  
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(1), int64(6), object(5)
memory usage: 83.7+ KB


In [7]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          418 non-null    int64  
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(1), int64(5), object(5)
memory usage: 36.0+ KB


In [8]:
#Changing female and male to 0s and 1s
train_data['Sex'] = train_data['Sex'].replace('female',1)
train_data['Sex'] = train_data['Sex'].replace('male',0)
test_data['Sex'] = test_data['Sex'].replace('female',1)
test_data['Sex'] = test_data['Sex'].replace('male',0)

In [9]:
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",0,27,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",1,19,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,28,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",0,26,0,0,111369,30.0000,C148,C


## Modeling the Data

In [10]:
#Creating y and X
y = train_data.Survived
features = ['Pclass', 'Sex','Age', 'Fare']
X = train_data[features]

#Creating the test 
test_X = test_data[features]

#Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

## Testing RandomForestRegressor

In [11]:
#Specify and fit model
titanic_model = RandomForestRegressor(random_state = 1)
titanic_model.fit(train_X, train_y)

#Make validation predictions and calculate mean absolute error
titanic_val_predictions = titanic_model.predict(val_X)
titanic_val_mae = mean_absolute_error(titanic_val_predictions, val_y)

#titanic_val_mae

cv = cross_val_score(titanic_model, X, y, cv=5)
print(cv.mean())

# #Specify and fit model of test data
# full_model = RandomForestRegressor(random_state = 1)
# full_model = full_model.fit(X, y)

# #Make final predictions and round it to the ones place
# final_predictions = full_model.predict(test_X)
# final_predictions = final_predictions.round(decimals = 0).astype(int)
# final_predictions

0.3553830929205263


## Testing XGBoost

In [24]:
from xgboost import XGBRegressor

#Specify and fit model of test data
my_model = XGBRegressor(random_state = 1)
my_model.fit(train_X, train_y)

#Make validation predictions and calculate mean absolute error
preds = my_model.predict(val_X)
titanic_val_mae = mean_absolute_error(preds, val_y)

#titanic_val_mae

cv = cross_val_score(my_model, X, y, cv=5)
#print(cv.mean())

#Make final predictions and round it to the ones place
final_predictions = my_model.predict(test_X)
final_predictions = final_predictions.round(decimals = 0).astype(int)
# final_predictions

resultsXGB = pd.DataFrame({'Survived': val_y, 'Prediction': preds, 'Correct': val_y == preds})
resultsXGB['Correct'].value_counts(normalize=True)

False    1.0
Name: Correct, dtype: float64

## Trying Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression

# Instantiate our model
logreg = LogisticRegression(random_state = 1)

# Fit our model to the training data
logreg.fit(X, y)

#Make validation predictions and calculate mean absolute error
preds = logreg.predict(val_X)
titanic_val_mae = mean_absolute_error(preds, val_y)
#titanic_val_mae

cv = cross_val_score(logreg, X, y, cv=5)
#print(cv.mean())

# Predict on the test data
logreg_predictions = logreg.predict(test_X)
#logreg_predictions

resultsLG = pd.DataFrame({'Survived': val_y, 'Prediction': preds, 'Correct': val_y == preds})
resultsLG['Correct'].value_counts(normalize=True)

True     0.807175
False    0.192825
Name: Correct, dtype: float64

In [None]:
# output = pd.DataFrame({'PassengerId': test_data.PassengerId,
#                      'Survived': logreg_predictions})
# output.to_csv('titanic_submission9.csv', index=False) 