<a href="https://colab.research.google.com/github/edcote/kaggle/blob/main/Titantic_Machine_Learning_from_Disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [67]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'titanic:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F3136%2F26502%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240930%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240930T041946Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D8c0ec92f3ca3f296172a2b38b87d4a5dcaddc133ea43a3292725786859ce631e7aee348f26745fe2f4a9a92f8e3548063f0335a79fa28cecd8e000332c8ebcbc8ce1e90aff0de7fc1d3963fe558f8f84d99e8a4446c8543e69d3d05794aa1f6c7099b1b2422bad9590c646b32d584122daba5963504ff2b337dcbaa79c6e96a40542b9edb257d7da91ed305019eda0a378536fe87d9ba668515ff603e0d53e9e13e3d6d7d4af1b00468a412880c591b611fb274c77c988ddecec03a48b587b903bb5b49f28af2fbf803683ecbf8e677e0aebcbf66ce762cfad64ddd2e7f404ba9046886f9199942c170ef4020a904ce1f0e89af75129b5ad6024450626735601'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading titanic, 34877 bytes compressed
Downloaded and uncompressed: titanic
Data source import complete.


In [68]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [91]:
train_data = pd.read_csv('/kaggle/input/titanic/train.csv', index_col='PassengerId')
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
train_data['Embarked'].fillna('S', inplace=True)
train_data['Sex'] = train_data['Sex'].map({'female': 0, 'male': 1}).astype(int)
train_data['Embarked_S'] = train_data['Embarked'] == 'S'
train_data['Embarked_C'] = train_data['Embarked'] == 'C'
train_data['Embarked_Q'] = train_data['Embarked'] == 'Q'


train_data['Age Group'] = pd.cut(
    train_data['Age'],
    bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90],
    labels=["0_10", "10_20", "30_40", "40_50", "50_60", "60_70", "70_80", "80_90"]
)


train_data = train_data.drop(['Ticket', 'Name', 'Cabin', 'Embarked'], axis=1)

train_data.Age.describe()

ValueError: Bin labels must be one fewer than the number of bin edges

In [None]:
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')

#train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
#train_data['Sex'] = train_data['Sex'].map({'female': 0, 'male': 1}).astype(int)
#train_data = train_data.drop(['Ticket', 'Name', 'Cabin'], axis=1)


In [70]:
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [71]:
women = train_data.loc[train_data.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [72]:
men = train_data.loc[train_data.Sex == 'male']["Survived"]
rate_men = sum(men)/len(men)

print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924


# Data cleaning

In [75]:
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')

#train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
#train_data['Sex'] = train_data['Sex'].map({'female': 0, 'male': 1}).astype(int)
#train_data = train_data.drop(['Ticket', 'Name', 'Cabin'], axis=1)


In [74]:
train_data.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.361582,0.523008,0.381594,32.204208
std,0.486592,0.836071,0.47799,13.019697,1.102743,0.806057,49.693429
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.9104
50%,0.0,3.0,1.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,1.0,35.0,1.0,0.0,31.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292
