In [56]:
%pip install matplotlib
%pip install seaborn
%pip install numpy
%pip install sklearn


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × Getting requirements to build wheel did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-

In [57]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [58]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [59]:
# function that will combine train and test set, creating a column 'is_train'
# for later when we separate them back
def concat_df(train, test):
    test['is_train'] = False
    train['is_train'] = True
    combined = pd.concat([train, test], sort=False)
    return combined

# function that will separate the combined df
def separate_df(combined):
    train = combined[combined['is_train'] == True].drop('is_train', axis=1)
    test = combined[combined['is_train'] == False].drop('is_train', axis=1)
    return train, test

In [60]:
# combine test and train data to preprocess the data all together.
df_combined = concat_df(df_train, df_test)

In [61]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12970 entries, 0 to 4276
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12682 non-null  object 
 2   CryoSleep     12660 non-null  object 
 3   Cabin         12671 non-null  object 
 4   Destination   12696 non-null  object 
 5   Age           12700 non-null  float64
 6   VIP           12674 non-null  object 
 7   RoomService   12707 non-null  float64
 8   FoodCourt     12681 non-null  float64
 9   ShoppingMall  12664 non-null  float64
 10  Spa           12686 non-null  float64
 11  VRDeck        12702 non-null  float64
 12  Name          12676 non-null  object 
 13  Transported   8693 non-null   object 
 14  is_train      12970 non-null  bool   
dtypes: bool(1), float64(6), object(8)
memory usage: 1.5+ MB


In [62]:
# If a person is in cryosleep, they would not be able to spend any money so
# we can fill null with 0 for money spent columns and false for VIP column
money_col = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
vip = ['VIP']
condition = (df_combined['CryoSleep'] == True)

df_combined.loc[condition, money_col] = df_combined.loc[condition, money_col].fillna(0)
df_combined.loc[condition, vip] = df_combined.loc[condition, vip].fillna(False)

In [63]:
# If a person have not spent any money, it is likely that person is in cryosleep.
cryo = ['CryoSleep']

# the condition is True if all the money_col values are 0
condition = (df_combined[money_col].eq(0).all(axis=1))

df_combined.loc[condition, cryo] = df_combined.loc[condition, cryo].fillna(True)
df_combined['CryoSleep'].fillna(False, inplace=True)

In [None]:
# since the data for money_col is very skewed, take log and then standardize
# taking log

for column in money_col:
    df_combined[column + '_log'] = np.log1p(df_combined[column])


In [64]:
df_combined.isna().sum()

PassengerId        0
HomePlanet       288
CryoSleep          0
Cabin            299
Destination      274
Age              270
VIP              192
RoomService      170
FoodCourt        180
ShoppingMall     175
Spa              177
VRDeck           177
Name             294
Transported     4277
is_train           0
dtype: int64

In [65]:
# since 'Cabin' column has three components, we can slit them into three columns

cabin_split = df_combined['Cabin'].str.split('/', expand=True)
cabin_split.columns = ['Deck', 'RoomNumber', 'Side']
ndf = pd.concat([df_combined, cabin_split], axis=1)
df_combined = ndf
df_combined.drop(columns=['Cabin'], inplace=True)


In [66]:
group_means = df_combined.groupby(['HomePlanet', 'CryoSleep', 'Destination'])['Age'].mean()
print(group_means)

HomePlanet  CryoSleep  Destination  
Earth       False      55 Cancri e      24.101341
                       PSO J318.5-22    29.400391
                       TRAPPIST-1e      27.525610
            True       55 Cancri e      21.093960
                       PSO J318.5-22    24.065913
                       TRAPPIST-1e      23.274959
Europa      False      55 Cancri e      36.318328
                       PSO J318.5-22    36.307692
                       TRAPPIST-1e      35.037146
            True       55 Cancri e      32.905775
                       PSO J318.5-22    33.562500
                       TRAPPIST-1e      32.797950
Mars        False      55 Cancri e      26.369048
                       PSO J318.5-22    35.140000
                       TRAPPIST-1e      30.036870
            True       55 Cancri e      27.561983
                       PSO J318.5-22    40.533333
                       TRAPPIST-1e      28.753172
Name: Age, dtype: float64


In [67]:
# fill in NA values for Age column

group_means = df_combined.groupby(['CryoSleep', 'HomePlanet', 'Destination'], group_keys=True)['Age'].mean().reset_index(name='Age_mean')
df_combined = df_combined.merge(group_means, on=['CryoSleep', 'HomePlanet', 'Destination'], how='left')
df_combined['Age'] = df_combined['Age'].fillna(df_combined['Age_mean'])
df_combined.drop(columns=['Age_mean'], inplace=True)

In [68]:
pd.pivot_table(df_combined, index='Transported', columns='Deck', values='PassengerId', aggfunc='count')

Deck,A,B,C,D,E,F,G,T
Transported,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,129,207,239,271,563,1565,1238,4
True,127,572,508,207,313,1229,1321,1


In [69]:
df_combined.isna().sum()

PassengerId        0
HomePlanet       288
CryoSleep          0
Destination      274
Age               13
VIP              192
RoomService      170
FoodCourt        180
ShoppingMall     175
Spa              177
VRDeck           177
Name             294
Transported     4277
is_train           0
Deck             299
RoomNumber       299
Side             299
dtype: int64

In [71]:
from sklearn.preprocessing import StandardScaler

In [None]:
# since the data for money_col is very skewed, take log and then standardize
# taking log

for column in money_col:
    df_combined[column + '_log'] = np.log1p(df_combined[column])


In [72]:
# standardization
#scaler = StandardScaler()

money_col_log = ['RoomService_log', 'FoodCourt_log', 'ShoppingMall_log', 'Spa_log', 'VRDeck_log']
#scaled_columns = scaler.fit_transform(df_combined[money_col_log])
#for i, col in enumerate(money_col_log):
#    df_combined[col + '_scaled'] = scaled_columns[:, i]

In [73]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PassengerId       12970 non-null  object 
 1   HomePlanet        12682 non-null  object 
 2   CryoSleep         12970 non-null  bool   
 3   Destination       12696 non-null  object 
 4   Age               12957 non-null  float64
 5   VIP               12778 non-null  object 
 6   RoomService       12800 non-null  float64
 7   FoodCourt         12790 non-null  float64
 8   ShoppingMall      12795 non-null  float64
 9   Spa               12793 non-null  float64
 10  VRDeck            12793 non-null  float64
 11  Name              12676 non-null  object 
 12  Transported       8693 non-null   object 
 13  is_train          12970 non-null  bool   
 14  Deck              12671 non-null  object 
 15  RoomNumber        12671 non-null  object 
 16  Side              12671 non-null  object

In [74]:
# change boolean variable cryosleep to integer values
df_combined['CryoSleep'] = df_combined['CryoSleep'].astype(int)

In [75]:
# perform one-hot encoding for categorical variables that will be used

df_combined = pd.get_dummies(df_combined, columns=['HomePlanet', 'Deck', 'Side', 'Destination'])

In [76]:
df_combined = df_combined.dropna(subset=['Age'])
#df_combined.isna().sum()

PassengerId                     0
CryoSleep                       0
Age                             0
VIP                           192
RoomService                   170
FoodCourt                     180
ShoppingMall                  175
Spa                           177
VRDeck                        177
Name                          294
Transported                  4269
is_train                        0
RoomNumber                    299
RoomService_log               170
FoodCourt_log                 180
ShoppingMall_log              175
Spa_log                       177
VRDeck_log                    177
HomePlanet_Earth                0
HomePlanet_Europa               0
HomePlanet_Mars                 0
Deck_A                          0
Deck_B                          0
Deck_C                          0
Deck_D                          0
Deck_E                          0
Deck_F                          0
Deck_G                          0
Deck_T                          0
Side_P        

In [85]:
roomservice = df_combined.loc[df_combined['RoomService'] != 0]
roomservice = roomservice['RoomService']
df_combined= df_combined['RoomService'].fillna(roomservice.quantile(.25))
df_combined.head()

KeyError: 'RoomService'

In [None]:
# separate the combined df to train and test
df_combined = df_combined.drop(['Name', 'PassengerId', 'RoomNumber'], axis=1)

In [None]:
df_combined = df_combined['Transported'].astype(int)
df_train, df_test = separate_df(df_combined)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer

In [None]:
X = df_train.drop('Transported', axis=1)
y = df_train['Transported']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=333)

In [None]:
# make pipeline
preprocessor = ColumnTransformer(transformers = [('scaler', StandardScaler(), money_col_log)],
                                                 remainder='passthrough')

pipe = Pipeline([('preprocessor', preprocessor), ('svc', SVC())])

pipe.fit(X_train, y_train).score(X_test, y_test)