In [1]:
pip install kaggle pandas scikit-learn




In [2]:
import os
import json

# Move kaggle.json to ~/.kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Install kaggle package
!pip install kaggle



In [3]:
from kaggle.api.kaggle_api_extended import KaggleApi

# Build the Kaggle API client and authenticate it before issuing any request.
api = KaggleApi()
api.authenticate()

# Fetch the Titanic dataset archive and unzip it into the local 'titanic' folder.
dataset_name = 'heptapod/titanic'
api.dataset_download_files(
    dataset_name,
    path='titanic',
    unzip=True,
)


Dataset URL: https://www.kaggle.com/datasets/heptapod/titanic


In [5]:
import pandas as pd

# Load the Titanic dataset
df = pd.read_csv('titanic/train.csv')

# Display initial data info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("Initial Data Information:")
df.info()

Initial Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 28 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Passengerid  1309 non-null   int64  
 1   Age          1309 non-null   float64
 2   Fare         1309 non-null   float64
 3   Sex          1309 non-null   int64  
 4   sibsp        1309 non-null   int64  
 5   zero         1309 non-null   int64  
 6   zero.1       1309 non-null   int64  
 7   zero.2       1309 non-null   int64  
 8   zero.3       1309 non-null   int64  
 9   zero.4       1309 non-null   int64  
 10  zero.5       1309 non-null   int64  
 11  zero.6       1309 non-null   int64  
 12  Parch        1309 non-null   int64  
 13  zero.7       1309 non-null   int64  
 14  zero.8       1309 non-null   int64  
 15  zero.9       1309 non-null   int64  
 16  zero.10      1309 non-null   int64  
 17  zero.11      1309 non-null   int64  
 18  zero.12      1309 non-

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [8]:
# Imputation: replace missing values with the mean of their column.
imputer = SimpleImputer(strategy='mean')
# fit_transform returns a bare array, so both the column labels and the row
# index are re-attached explicitly to keep df_imputed aligned with df.
df_imputed = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)


In [9]:
# Standardization: rescale each column with StandardScaler.
scaler = StandardScaler()
# The scaler returns a bare array; re-attach the column labels and carry over
# the row index of df_imputed so the rows stay aligned with the input frame.
df_standardized = pd.DataFrame(
    scaler.fit_transform(df_imputed),
    columns=df.columns,
    index=df_imputed.index,
)

In [10]:
# Normalization: rescale each column with MinMaxScaler (default feature range).
# NOTE(review): min-max scaling the already-standardized data should give the
# same values as min-max scaling df_imputed directly - confirm whether both
# scaling steps are really needed.
normalizer = MinMaxScaler()
# The scaler returns a bare array; re-attach the column labels and carry over
# the row index of df_standardized so the rows stay aligned with the input.
df_normalized = pd.DataFrame(
    normalizer.fit_transform(df_standardized),
    columns=df.columns,
    index=df_standardized.index,
)

In [11]:
# Display preprocessed data info.
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would add a stray "None" line to the output).
print("\nPreprocessed Data Information:")
df_normalized.info()


Preprocessed Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 28 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Passengerid  1309 non-null   float64
 1   Age          1309 non-null   float64
 2   Fare         1309 non-null   float64
 3   Sex          1309 non-null   float64
 4   sibsp        1309 non-null   float64
 5   zero         1309 non-null   float64
 6   zero.1       1309 non-null   float64
 7   zero.2       1309 non-null   float64
 8   zero.3       1309 non-null   float64
 9   zero.4       1309 non-null   float64
 10  zero.5       1309 non-null   float64
 11  zero.6       1309 non-null   float64
 12  Parch        1309 non-null   float64
 13  zero.7       1309 non-null   float64
 14  zero.8       1309 non-null   float64
 15  zero.9       1309 non-null   float64
 16  zero.10      1309 non-null   float64
 17  zero.11      1309 non-null   float64
 18  zero.12      130

In [12]:

# Write the preprocessed frame to CSV without the index column.
df_normalized.to_csv(
    path_or_buf='preprocessed_data.csv',
    index=False,
)

# Confirm to the reader where the file was written.
print("\nPreprocessed data saved to 'preprocessed_data.csv'")


Preprocessed data saved to 'preprocessed_data.csv'
