### Версия для подготовки к отправке на Kaggle

In [21]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

In [22]:
def prepare_data( data ):
    # Let prepeare passenger id
    data['PassengerNum'] = data.index
    data_passenger=data['PassengerNum'].str.split('_', n=1, expand=True).rename(columns={0: "GroupNum", 1: "NumInGroup" })
    data_passenger[["GroupNum", "NumInGroup"]] = data_passenger[["GroupNum", "NumInGroup"]].apply(pd.to_numeric)
    # add group number and number in group
    data=data.join(data_passenger,how='left')
    data_groups = data_passenger.groupby(['GroupNum']).count()
    data_groups.rename(columns={'NumInGroup': 'Count'}, inplace=True)
    data = data.join(data_groups, how='left', on='GroupNum')
    data = data.drop('PassengerNum', axis=1).drop('GroupNum', axis=1).drop('NumInGroup', axis=1)
    # CryoSleep - change to 0/1, NaN - replace with False
    pd.set_option('future.no_silent_downcasting', True)
    data.loc[:, 'CryoSleep'] = data.CryoSleep.replace({False: 0, True: 1})
    data.loc[:, 'CryoSleep'] = data.CryoSleep.fillna(True)
    # Cabin - split into three part: Deck Num Side
    data_cabin=data['Cabin'].str.split('/', n=2, expand=True).rename(columns={0: "Deck", 1: "Num", 2: "Side" })
    data=data.join(data_cabin,how='left')
    #Age - OK, NaN replace with smth
    data.loc[:, 'Age'] = data.Age.fillna(100)
    # VIP - change to 0/1, NaN - replace with False
    #with pd.set_option("future.no_silent_downcasting", True):
    data.loc[:, 'VIP'] = data.VIP.replace({False: 0, True: 1})
    data.loc[:, 'VIP'] = data.VIP.infer_objects(copy=False).fillna(False)
    # RoomService - OK, NaN - replace with 0
    # FoodCourt - OK, NaN - replace with 0
    # ShoppingMall - OK, NaN - replace with 0
    # Spa - OK, NaN - replace with 0
    # VRDeck - OK
    data.loc[:, 'RoomService'] = data.RoomService.fillna(0)
    data.loc[:, 'FoodCourt'] = data.FoodCourt.fillna(0)
    data.loc[:, 'ShoppingMall'] = data.ShoppingMall.fillna(0)
    data.loc[:, 'Spa'] = data.Spa.fillna(0)
    data.loc[:, 'VRDeck'] = data.VRDeck.fillna(0)
    data[["CryoSleep", "VIP"]] = data[["CryoSleep", "VIP"]].apply(pd.to_numeric)
    # Add name length
    namelendf = data['Name'].str.len()
    data = data.join(namelendf.rename('NameLen'), how='left')
    data.loc[:, 'NameLen'] = data.NameLen.fillna(0)
    # Drop column
    # Name  - unique field
    # Num  - 1817 unique values, so it's almost unique field
    data = data.drop('Name', axis=1).drop('Num', axis=1).drop('Cabin', axis=1)
    #Encoding
    categorical_feature = ['HomePlanet', 'Destination', 'Deck', 'Side']
    enc = OrdinalEncoder()
    data[categorical_feature] = enc.set_params(encoded_missing_value=-1).fit_transform(data[categorical_feature])
    return data

In [30]:
# Load train data
train_data = pd.read_csv('train.csv', index_col=0)
train_data=prepare_data(train_data)
X_train = train_data.drop('Transported', axis=1)
y_train = train_data['Transported']

# Load test data
test_data = pd.read_csv('test.csv', index_col=0)
X_test =prepare_data(test_data)
X_test

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Count,Deck,Side,NameLen
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0013_01,0.0,1,2.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,1,6.0,1.0,15.0
0018_01,0.0,0,2.0,19.0,0.0,0.0,9.0,0.0,2823.0,0.0,1,5.0,1.0,14.0
0019_01,1.0,1,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2.0,1.0,15.0
0021_01,1.0,0,2.0,38.0,0.0,0.0,6652.0,0.0,181.0,585.0,1,2.0,1.0,16.0
0023_01,0.0,0,2.0,20.0,0.0,10.0,0.0,635.0,0.0,0.0,1,5.0,1.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,0.0,1,2.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,2,6.0,1.0,11.0
9269_01,0.0,0,2.0,42.0,0.0,0.0,847.0,17.0,10.0,144.0,1,-1.0,-1.0,13.0
9271_01,2.0,1,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,1,3.0,0.0,11.0
9273_01,1.0,0,-1.0,100.0,0.0,0.0,2680.0,0.0,0.0,523.0,1,3.0,0.0,14.0


In [31]:
#Scaling

scaler = StandardScaler()
X_train_scaled =  scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


array([[ 0.451063  , -0.77348028,  0.64162884, ..., -1.65102842,
        -0.86617419,  0.45572902],
       [-0.75728543, -0.77348028,  0.64162884, ...,  0.42316625,
         0.9752668 , -0.46515717],
       [ 0.451063  , -0.77348028,  0.64162884, ..., -2.16957708,
         0.9752668 , -0.1581951 ],
       ...,
       [-0.75728543, -0.77348028,  0.64162884, ...,  0.94171492,
         0.9752668 , -0.46515717],
       [ 0.451063  , -0.77348028, -1.61562713, ..., -0.09538241,
         0.9752668 ,  0.76269109],
       [ 0.451063  , -0.77348028,  0.64162884, ..., -0.09538241,
         0.9752668 ,  0.76269109]])

In [None]:

model = GradientBoostingClassifier(n_estimators=300, max_depth=2)
model.fit(X_train_scaled, y_train)
y_test_predict= model.predict(X_test_scaled)


In [43]:
result = pd.DataFrame(index=X_test.index)
result['Transported']= y_test_predict
result.to_csv('result.csv')

