In [77]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import numpy as np

# titanic_data = pd.read_csv('train.csv')
# test_data = pd.read_csv('test.csv')

def get_combined_data():
    # reading train data
    train = pd.read_csv('train.csv')
    
    # reading test data
    test = pd.read_csv('test.csv')

    # extracting and then removing the targets from the training data 
    targets = train.Survived
    train.drop(['Survived'], 1, inplace=True)
    

    # merging train data and test data for future feature engineering
    # we'll also remove the PassengerID since this is not an informative feature
    titanic_data = train.append(test)
    titanic_data.reset_index(inplace=True)
    titanic_data.drop(['index', 'PassengerId'], inplace=True, axis=1)
    
    return titanic_data
titanic_data = get_combined_data()

I learned that combining the train and test sets will help to create a better model from https://github.com/ahmedbesbes/How-to-score-0.8134-in-Titanic-Kaggle-Challenge/blob/master/article_1.ipynb

KeyError: 'Survived'

## Cleaning the Code

In [61]:
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Name      1309 non-null   object 
 2   Sex       1309 non-null   object 
 3   Age       1046 non-null   float64
 4   SibSp     1309 non-null   int64  
 5   Parch     1309 non-null   int64  
 6   Ticket    1309 non-null   object 
 7   Fare      1308 non-null   float64
 8   Cabin     295 non-null    object 
 9   Embarked  1307 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 102.4+ KB


## Filling in the NaN Ages

In [70]:
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())

#finding the missing age values using the Sex and Pclass instead of just age
grouped_train = titanic_data.iloc[:891].groupby(['Sex','Pclass'])
grouped_median_train = grouped_train.median()
grouped_median_train = grouped_median_train.reset_index()[['Sex', 'Pclass', 'Age']]

def fill_age(row):
    condition = (
        (grouped_median_train['Sex'] == row['Sex']) &  
        (grouped_median_train['Pclass'] == row['Pclass'])
    ) 
    return grouped_median_train[condition]['Age'].values[0]

def process_age():
    global titanic_data
    # a function that fills the missing values of the Age variable
    titanic_data['Age'] = titanic_data.apply(lambda row: fill_age(row) if np.isnan(row['Age']) else row['Age'], axis=1)
    return titanic_data

titanic_data = process_age()

#Round age to the ones place
titanic_data['Age'] = titanic_data['Age'].apply(np.floor).astype(int)

#Fare is missing one value so I found the mean and inplaced it
titanic_data['Fare'].fillna(test_data['Fare'].mean(), inplace = True)

I used https://github.com/ahmedbesbes/How-to-score-0.8134-in-Titanic-Kaggle-Challenge/blob/master/article_1.ipynb to figure out the code for finding a more accurate guessed age of the passengers

In [71]:
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Name      1309 non-null   object 
 2   Sex       1309 non-null   object 
 3   Age       1309 non-null   int64  
 4   SibSp     1309 non-null   int64  
 5   Parch     1309 non-null   int64  
 6   Ticket    1309 non-null   object 
 7   Fare      1309 non-null   float64
 8   Cabin     295 non-null    object 
 9   Embarked  1307 non-null   object 
dtypes: float64(1), int64(4), object(5)
memory usage: 102.4+ KB


In [72]:
titanic_data

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1000,C123,S
4,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",male,28,0,0,A.5. 3236,8.0500,,S
1305,1,"Oliva y Ocana, Dona. Fermina",female,39,0,0,PC 17758,108.9000,C105,C
1306,3,"Saether, Mr. Simon Sivertsen",male,38,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,3,"Ware, Mr. Frederick",male,28,0,0,359309,8.0500,,S


In [73]:
#Changing female and male to 0s and 1s
titanic_data['Sex'] = titanic_data['Sex'].replace('female',1)
titanic_data['Sex'] = titanic_data['Sex'].replace('male',0)
test_data['Sex'] = test_data['Sex'].replace('female',1)
test_data['Sex'] = test_data['Sex'].replace('male',0)

In [74]:
titanic_data

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1000,C123,S
4,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",0,28,0,0,A.5. 3236,8.0500,,S
1305,1,"Oliva y Ocana, Dona. Fermina",1,39,0,0,PC 17758,108.9000,C105,C
1306,3,"Saether, Mr. Simon Sivertsen",0,38,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,3,"Ware, Mr. Frederick",0,28,0,0,359309,8.0500,,S


In [44]:
#Creating y and X
y = titanic_data.Survived
features = ['Pclass', 'Sex','Age', 'Fare']
X = titanic_data[features]

#Creating the test 
test_X = test_data[features]

#Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

#Specify and fit model
titanic_model = RandomForestRegressor(random_state = 1)
titanic_model.fit(train_X, train_y)

#Make validation predictions and calculate mean absolute error
titanic_val_predictions = titanic_model.predict(val_X)
titanic_val_mae = mean_absolute_error(titanic_val_predictions, val_y)

#titanic_val_mae

#Specify and fit model of test data
full_model = RandomForestRegressor(random_state = 1)
full_model = full_model.fit(X, y)

#Make final predictions and round it to the ones place
final_predictions = full_model.predict(test_X)
final_predictions = final_predictions.round(decimals = 0).astype(int)
final_predictions

array([0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

## Testing XGBoost

In [36]:
# from xgboost import XGBRegressor

# my_model = XGBRegressor(random_state = 1)
# my_model.fit(train_X, train_y)

# #Make validation predictions and calculate mean absolute error
# preds = my_model.predict(val_X)
# titanic_val_mae = mean_absolute_error(preds, val_y)

# titanic_val_mae

# # #Specify and fit model of test data
# # full_model = RandomForestRegressor(random_state = 1)
# # full_model = full_model.fit(X, y)

# # #Make final predictions and round it to the ones place
# # final_predictions = full_model.predict(test_X)
# # final_predictions = final_predictions.round(decimals = 0).astype(int)
# # final_predictions

In [37]:
output = pd.DataFrame({'PassengerId': test_data.PassengerId,
                     'Survived': final_predictions})
output.to_csv('titanic_submission5.csv', index=False) 