In [1]:
%pip install matplotlib
%pip install seaborn
%pip install numpy
%pip install sklearn


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × Getting requirements to build wheel did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [4]:
# function that will combine train and test set, creating a column 'is_train'
# for later when we separate them back
def concat_df(train, test):
    test['is_train'] = False
    train['is_train'] = True
    combined = pd.concat([train, test], sort=False)
    return combined

# function that will separate the combined df
def separate_df(combined):
    train = combined[combined['is_train'] == True].drop('is_train', axis=1)
    test = combined[combined['is_train'] == False].drop('is_train', axis=1)
    return train, test

In [5]:
# combine test and train data to preprocess the data all together.
df_combined = concat_df(df_train, df_test)

In [6]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12970 entries, 0 to 4276
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12682 non-null  object 
 2   CryoSleep     12660 non-null  object 
 3   Cabin         12671 non-null  object 
 4   Destination   12696 non-null  object 
 5   Age           12700 non-null  float64
 6   VIP           12674 non-null  object 
 7   RoomService   12707 non-null  float64
 8   FoodCourt     12681 non-null  float64
 9   ShoppingMall  12664 non-null  float64
 10  Spa           12686 non-null  float64
 11  VRDeck        12702 non-null  float64
 12  Name          12676 non-null  object 
 13  Transported   8693 non-null   object 
 14  is_train      12970 non-null  bool   
dtypes: bool(1), float64(6), object(8)
memory usage: 1.5+ MB


In [7]:
# If a person is in cryosleep, they would not be able to spend any money so
# we can fill null with 0 for money spent columns and false for VIP column
money_col = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
vip = ['VIP']
condition = (df_combined['CryoSleep'] == True)

df_combined.loc[condition, money_col] = df_combined.loc[condition, money_col].fillna(0)
df_combined.loc[condition, vip] = df_combined.loc[condition, vip].fillna(False)

In [8]:
# If a person have not spent any money, it is likely that person is in cryosleep.
cryo = ['CryoSleep']

# the condition is True if all the money_col values are 0
condition = (df_combined[money_col].eq(0).all(axis=1))

df_combined.loc[condition, cryo] = df_combined.loc[condition, cryo].fillna(True)
df_combined['CryoSleep'].fillna(False, inplace=True)

In [9]:
# compute mode of HomePlanet, Destination and impute it for null

mode_homeplanet = df_combined['HomePlanet'].mode()[0]
mode_destination = df_combined['Destination'].mode()[0]

df_combined['HomePlanet'].fillna(mode_homeplanet, inplace=True)
df_combined['Destination'].fillna(mode_destination, inplace=True)

In [10]:
# change boolean variable cryosleep to integer values
df_combined['CryoSleep'] = df_combined['CryoSleep'].astype(int)

In [11]:
# compute 1st IQR to replace null values in money_col

cryo_sleep_1 = df_combined[df_combined['CryoSleep'] == 1]
first_quartiles = cryo_sleep_1[money_col].quantile(0.25)
print(first_quartiles['FoodCourt'])
for col in money_col:
    df_combined[col].fillna(first_quartiles[col], inplace=True)

df_combined.isna().sum()

0.0


PassengerId        0
HomePlanet         0
CryoSleep          0
Cabin            299
Destination        0
Age              270
VIP              192
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
Name             294
Transported     4277
is_train           0
dtype: int64

In [12]:
# group_means = df_combined.groupby(['HomePlanet', 'CryoSleep', 'Destination'])['Age'].mean()
# print(group_means)

In [13]:
# For Cabin and Age, input 'Z/0/N' and 'Unknown' just to use the data without deleting the row

df_combined['Cabin'] = df_combined['Cabin'].fillna('Z/0/N')
df_combined['Name'] = df_combined['Name'].fillna('Unknown')

In [14]:
# since 'Cabin' column has three components, we can slit them into three columns

cabin_split = df_combined['Cabin'].str.split('/', expand=True)
cabin_split.columns = ['Deck', 'RoomNumber', 'Side']
ndf = pd.concat([df_combined, cabin_split], axis=1)
df_combined = ndf
df_combined.drop(columns=['Cabin'], inplace=True)


In [15]:
# fill in NA values for Age column

group_means = df_combined.groupby(['CryoSleep', 'HomePlanet', 'Destination'], group_keys=True)['Age'].mean().reset_index(name='Age_mean')
df_combined = df_combined.merge(group_means, on=['CryoSleep', 'HomePlanet', 'Destination'], how='left')
df_combined['Age'] = df_combined['Age'].fillna(df_combined['Age_mean'])
df_combined.drop(columns=['Age_mean'], inplace=True)

In [None]:
# group_mean_hpna = df_combined.groupby(['CryoSleep', 'Destination'], group_keys=True)['Age'].mean().reset_index(name='Age_mean_hpna')
# group_mean_dtna = df_combined.groupby(['CryoSleep', 'HomePlanet'], group_keys=True)['Age'].mean().reset_index(name='Age_mean_dtna')

# df_combined = df_combined.merge(group_mean_hpna, on=['CryoSleep', 'Destination'], how='left')
# df_combined = df_combined.merge(group_mean_dtna, on=['CryoSleep', 'HomePlanet'], how='left')

# df_combined['Age'] = df_combined.apply(
#     lambda row: row['Age_mean_hpna'] if pd.isnull(row['HomePlanet']) and not pd.isnull(row['Age_mean_hpna']) 
#     else row['Age_mean_dtna'] if pd.isnull(row['Destination']) and not pd.isnull(row['Age_mean_dtna']) 
#     else row['Age'],
#     axis=1
# )

# df_combined.drop(columns=['Age_mean_hpna', 'Age_mean_dtna'], inplace=True)

In [17]:
# To handle null values in VIP column, first made a new column 'TotalSpent' adding up all the money spent

df_combined['TotalSpent'] = df_combined[money_col].sum(axis=1)

In [29]:
# find the minimum value of total spent when vip = 1

vip = df_combined[df_combined['VIP'] == 1]
#vip = df_combined[df_combined['VIP'] == 0]
min_vip = vip['TotalSpent']
min_vip.describe()

count      273.000000
mean      4595.542125
std       5464.818112
min          0.000000
25%       1299.000000
50%       2743.000000
75%       6206.000000
max      33666.000000
Name: TotalSpent, dtype: float64

In [30]:
# for null vip, let's assume if a person spent over 2743 dollars, they are vips

df_combined.loc[df_combined['VIP'].isnull(), 'VIP'] = df_combined['TotalSpent'] >= 2743

In [31]:
df_combined.isna().sum()

PassengerId        0
HomePlanet         0
CryoSleep          0
Destination        0
Age                0
VIP                0
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
Name               0
Transported     4277
is_train           0
Deck               0
RoomNumber         0
Side               0
TotalSpent         0
dtype: int64

In [32]:
pd.pivot_table(df_combined, index='Transported', columns='Deck', values='PassengerId', aggfunc='count')

Deck,A,B,C,D,E,F,G,T,Z
Transported,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
False,129,207,239,271,563,1565,1238,4,99
True,127,572,508,207,313,1229,1321,1,100


In [38]:
# since the data for money_col is very skewed, take log and then standardize
# taking log

for column in money_col:
   df_combined[column + '_log'] = np.log1p(df_combined[column])
   df_combined.drop(column, axis=1, inplace=True)

In [35]:
# perform one-hot encoding for categorical variables that will be used, and change bool to int

df_combined = pd.get_dummies(df_combined, columns=['HomePlanet', 'Deck', 'Side', 'Destination'])
df_combined['VIP'] = df_combined['VIP'].astype(int)

In [39]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 32 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PassengerId                12970 non-null  object 
 1   CryoSleep                  12970 non-null  int32  
 2   Age                        12970 non-null  float64
 3   VIP                        12970 non-null  int32  
 4   Name                       12970 non-null  object 
 5   Transported                8693 non-null   object 
 6   is_train                   12970 non-null  bool   
 7   RoomNumber                 12970 non-null  object 
 8   TotalSpent                 12970 non-null  float64
 9   RoomService_log            12970 non-null  float64
 10  FoodCourt_log              12970 non-null  float64
 11  ShoppingMall_log           12970 non-null  float64
 12  Spa_log                    12970 non-null  float64
 13  VRDeck_log                 12970 non-null  flo

In [None]:
# standardization

#scaler = StandardScaler()
#
#money_col_log = ['RoomService_log', 'FoodCourt_log', 'ShoppingMall_log', 'Spa_log', 'VRDeck_log']
#scaled_columns = scaler.fit_transform(df_combined[money_col_log])
#for i, col in enumerate(money_col_log):
#    df_combined[col + '_scaled'] = scaled_columns[:, i]

In [56]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif

In [49]:
# separate the combined df to train and test

df_combined = df_combined.drop(['Name', 'PassengerId', 'RoomNumber'], axis=1)
df_train, df_test = separate_df(df_combined)
df_train['Transported'] = df_train['Transported'].astype(int)

In [50]:
X = df_train.drop('Transported', axis=1)
y = df_train['Transported']

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=333)

In [54]:
# make pipeline

pipe = Pipeline([('scaler', StandardScaler()),
                 ('svm', SVC())])
pipe.fit(X_train, y_train).score(X_test, y_test)

0.7918343875790684

In [62]:
pipe_k5 = Pipeline([('Feature_Selection', SelectKBest(f_classif, k=16)),
                    ('scaler', StandardScaler()),
                    ('svm', SVC())])
pipe_k5.fit(X_train, y_train).score(X_test, y_test)

0.7987349051178838

In [63]:
pipe_rf = Pipeline([('scaler', StandardScaler()),
                    ('rf', RandomForestClassifier())])
pipe.fit(X_train, y_train).score(X_test, y_test)

0.7918343875790684