In [1]:
import pandas as pd
import numpy as np
import os
import os.path as op
import copy
import numpy.random as rd

In [2]:
path = '../../data/kaggle-titanic'
for dirname, _, filenames in os.walk(path):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
setNames = ['train.csv','test.csv']
data = []
for sn in setNames:
    data.append( pd.read_csv(op.join(path,sn)))
    print(data[-1].head())
    print(data[-1].info())

../data/kaggle-titanic/train_label.csv
../data/kaggle-titanic/test_RescaleClean.csv
../data/kaggle-titanic/train_RescaleClean.csv
../data/kaggle-titanic/test.csv
../data/kaggle-titanic/train.csv
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          P

In [3]:
# fix missing data in training set
train_data = data[0]
mean = train_data["Age"].mean()
std = train_data["Age"].std()

rand_age = rd.randint(mean-std, mean+std, size = 177)
age_slice = train_data["Age"].copy()
age_slice[np.isnan(age_slice)] = rand_age
train_data["Age"] = age_slice

train_data["Embarked"].fillna(value="C", inplace=True)
print(train_data.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64


In [4]:
# fix missing data in test set
test_data = data[1]
mean = test_data["Age"].mean()
std = test_data["Age"].std()

rand_age = rd.randint(mean-std, mean+std, size = 86)
age_slice = test_data["Age"].copy()
age_slice[np.isnan(age_slice)] = rand_age
test_data["Age"] = age_slice
test_data["Fare"].fillna(test_data["Fare"].mean(), inplace=True)

print(test_data.isnull().sum())
print(test_data.head())

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64
   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0     

In [5]:
# prepare training sessions/map categorial data to numerics
genders = {"male":0, "female":1}
train_data["Sex"] = train_data["Sex"].map(genders)
test_data["Sex"] = test_data["Sex"].map(genders)

ports = {"S":0, "C":1, "Q":2}
train_data["Embarked"] = train_data["Embarked"].map(ports)
test_data["Embarked"] = test_data["Embarked"].map(ports)

In [6]:
def extractTitle(X):
    
    title = []
    for n in X['Name']:
        
        if 'Mr.' in n or 'Mrs.' in n:
            title.append(0)
        elif 'Miss.' or 'Ms.' in n:
            title.append(1)
        else:
            title.append(2)
            
    X['Title'] = title
            
def addBucket(df, col):
    
    bx = pd.cut(df[col], 6, labels=False)
    df[col+'_group'] = bx
    
    
def rescale(df):
    
    z = copy.copy(df)
    for tag in z.columns:
        x = z[tag][:]
        xmin = np.amin(x)
        xmax = np.amax(x)
        z[tag] = (x - xmin) / (xmax - xmin)
    
    return z

y = train_data["Survived"]
features = ["Pclass", "Sex", "SibSp", "Parch", "Fare", "Age", "Embarked", "Name"]
X = train_data[features]
X_test = test_data[features]
print(X.head())

for x in [X, X_test]:
    extractTitle(x)

    addBucket(x, 'Age')
    addBucket(x, 'Fare')

features = ["Pclass", "Sex", "SibSp", "Parch", "Fare_group","Embarked", "Age_group", "Title"]
Z = X[features]
Z_test = X_test[features]
Z = rescale(Z)
Z_test = rescale(Z_test)

# saving the dataframe
y.to_csv(op.join(path,'train_label.csv'))
Z.to_csv(op.join(path,'train_RescaleClean.csv'))
Z_test.to_csv(op.join(path,'test_RescaleClean.csv'))

   Pclass  Sex  SibSp  Parch     Fare   Age  Embarked  \
0       3    0      1      0   7.2500  22.0         0   
1       1    1      1      0  71.2833  38.0         1   
2       3    1      0      0   7.9250  26.0         0   
3       1    1      1      0  53.1000  35.0         0   
4       3    0      0      0   8.0500  35.0         0   

                                                Name  
0                            Braund, Mr. Owen Harris  
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  
2                             Heikkinen, Miss. Laina  
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  
4                           Allen, Mr. William Henry  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Title'] = title
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col+'_group'] = bx
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col+'_group'] = bx
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in