In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the dataset
train_csv_path = 'train.csv'
df = pd.read_csv(train_csv_path)

# Splitting the PassengerId into group_num and ppl_num
df[['group_num', 'ppl_num']] = df['PassengerId'].str.split('_', expand=True)

# Finding the maximum number of people per group
group_max = df.groupby('group_num')['ppl_num'].max().reset_index(name='max_num')
df = pd.merge(df, group_max, on='group_num', how='left')
df['max_num'] = df['max_num'].astype(int)

# Generating a 'ticket' feature
df['ticket'] = df['HomePlanet'] + df['Destination']

# Getting dummies
df = pd.get_dummies(df, columns=['ticket'])

# Splitting the Cabin into class, row, and side
df[['cabin_class', 'cabin_row', 'cabin_side']] = df['Cabin'].str.split('/', expand=True)

# Function to fill null values in HomePlanet and Destination based on group_num
def fill_based_on_group(df, column):
    # Sort by group_num for efficient forward/backward filling within groups
    df = df.sort_values(by=['group_num', 'ppl_num'])
    # Forward fill and backward fill within each group
    df[column] = df.groupby('group_num')[column].ffill().bfill()
    # For any remaining, fill with the previous row's value
    df[column] = df[column].fillna(method='ffill')
    return df

# Filling nulls in cabin information using the same method as for HomePlanet and Destination
df = fill_based_on_group(df, 'cabin_class')
df = fill_based_on_group(df, 'cabin_row')
df = fill_based_on_group(df, 'cabin_side')

# Fill VIP nulls with False
df['VIP'] = df['VIP'].fillna(False)

df.dropna(inplace = True)

# Creating age bins
df['AgeBin'] = pd.cut(df['Age'], bins=[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100], right=False)

# Amenities columns to fill NA based on age bins
amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Calculate median for each age bin and amenity
amenities_median = df.groupby('AgeBin')[amenities].median()

# Function to fill NA in amenities based on age bin
def fill_amenities_na(row, amenities, amenities_median):
    for amenity in amenities:
        if pd.isna(row[amenity]):
            if row['CryoSleep'] == True:
                fill_value = 0
            else:
                age_bin = row['AgeBin']
                fill_value = amenities_median.loc[age_bin, amenity]
            row[amenity] = fill_value
    return row

# Apply the function to fill NA values in amenities
df = df.apply(lambda row: fill_amenities_na(row, amenities, amenities_median), axis=1)

# Drop the temporary AgeBin column
df.drop('AgeBin', axis=1, inplace=True)
df['sum_expense'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)

cabin_class_dummies = pd.get_dummies(df['cabin_class'], prefix='cabin_class')
cabin_side_dummies = pd.get_dummies(df['cabin_side'], prefix='cabin_side')

# Concatenating the dummy columns to the original dataframe
df = pd.concat([df, cabin_class_dummies, cabin_side_dummies], axis=1)

# Dropping the original 'cabin_class' and 'cabin_side' columns as corrected
df.drop('cabin_side', axis=1, inplace=True)
# Show the processed DataFrame structure
df.columns.values


  df[column] = df[column].fillna(method='ffill')
  df[column] = df[column].fillna(method='ffill')
  df[column] = df[column].fillna(method='ffill')
  amenities_median = df.groupby('AgeBin')[amenities].median()


array(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination',
       'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa',
       'VRDeck', 'Name', 'Transported', 'group_num', 'ppl_num', 'max_num',
       'ticket_Earth55 Cancri e', 'ticket_EarthPSO J318.5-22',
       'ticket_EarthTRAPPIST-1e', 'ticket_Europa55 Cancri e',
       'ticket_EuropaPSO J318.5-22', 'ticket_EuropaTRAPPIST-1e',
       'ticket_Mars55 Cancri e', 'ticket_MarsPSO J318.5-22',
       'ticket_MarsTRAPPIST-1e', 'cabin_class', 'cabin_row',
       'sum_expense', 'cabin_class_A', 'cabin_class_B', 'cabin_class_C',
       'cabin_class_D', 'cabin_class_E', 'cabin_class_F', 'cabin_class_G',
       'cabin_class_T', 'cabin_side_P', 'cabin_side_S'], dtype=object)

In [3]:
# use
columns = ['PassengerId', 'HomePlanet','Cabin', 'Destination','Name','cabin_class']

In [4]:
df_01 = df.drop(columns, axis = 1)

### convert type to numerical

In [5]:
df_01 = df_01.astype({col: 'int' for col in df_01.select_dtypes('bool').columns})

for col in df_01.columns:
    if df_01[col].dtype == 'object':
        df_01[col] = pd.to_numeric(df_01[col])

# try feature

In [6]:
df_try_feature = df_01

In [7]:
df_try_feature['AgeBin'] = pd.cut(df_try_feature['Age'], bins=[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100], right=False)
df_try_feature['age_bin'] = df_try_feature['AgeBin'].astype(str)
age_bin_dummies = pd.get_dummies(df_try_feature['age_bin'], prefix='age_bin')
df_try_feature = pd.concat([df_try_feature, age_bin_dummies], axis=1)
df_try_feature["cabin_class"] = df["cabin_class"]
df_try_feature["cabin_max"] = df_try_feature["cabin_class"] + df_try_feature['max_num'].astype(str)
cabin_max_bin_dummies = pd.get_dummies(df_try_feature['cabin_max'], prefix='cabin_max')
df_try_feature = pd.concat([df_try_feature, cabin_max_bin_dummies], axis=1)

df_try_feature.drop('AgeBin', axis=1, inplace=True)
df_try_feature.drop('age_bin', axis=1, inplace=True)
df_try_feature.drop('cabin_class', axis=1, inplace=True)
df_try_feature.drop('cabin_max', axis=1, inplace=True)


In [8]:
df_try_feature = df_try_feature.astype({col: 'int' for col in df_try_feature.select_dtypes('bool').columns})

for col in df_try_feature.columns:
    if df_try_feature[col].dtype == 'object':
        df_try_feature[col] = pd.to_numeric(df_try_feature[col])

# Train Test Split 

try

In [9]:
X = df_try_feature.drop('Transported', axis=1) 
y = df_try_feature['Transported']  

X_traint, X_testt, y_traint, y_testt = train_test_split(X, y, test_size=0.2, random_state=42)

# TRAINNING

# voting tune

In [38]:
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

bagging_base = BaggingClassifier(
    estimator=RandomForestClassifier(max_depth=40),  
    max_features=1.0,  
    max_samples=1.0,  
    n_estimators=50,  
    random_state=200
)

gb_base = GradientBoostingClassifier(
    learning_rate=0.1,  
    max_depth=4,  
    min_samples_split=6,  
    n_estimators=200,  
    subsample=0.8,  
    random_state=200
)
rf_base = RandomForestClassifier(
    max_depth=None, 
    min_samples_leaf=3,  
    min_samples_split=9,  
    n_estimators=650, 
    random_state=42
)
ada_base = AdaBoostClassifier(
    estimator=RandomForestClassifier(max_depth=3), 
    learning_rate = 0.1, n_estimators = 100)

voting_classifier = VotingClassifier(
    estimators=[
        ('rf', rf_base),
        ('gd', gb_base),
        ('ada',ada_base)
    ],
    voting='hard' 
)

#X_traint, X_testt, y_traint, y_testt

voting_classifier.fit(X_traint, y_traint)
voting_pred = voting_classifier.predict(X_testt)
voting_accuracy = accuracy_score(y_testt, voting_pred)
print("Voting Classifier Accuracy:", voting_accuracy)

Voting Classifier Accuracy: 0.8243542435424355


## new voting grid search

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV

### Define classifiers
bagging_clf = BaggingClassifier(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
gb_clf = GradientBoostingClassifier(random_state=42)
ada_clf = AdaBoostClassifier(random_state=42, algorithm='SAMME')

### Define a VotingClassifier with soft voting
voting_clf = VotingClassifier(estimators=[
    ('bagging', bagging_clf),
    ('rf', rf_clf),
    ('gb', gb_clf),
    ('ada', ada_clf)
], voting='soft')

### Parameter grids for each classifier
param_grid = {
    'bagging__n_estimators': [10, 50],
    'bagging__max_samples': [0.5, 1.0],
    'bagging__max_features': [0.5, 1.0],
    'rf__n_estimators': [10, 50],
    'rf__max_depth': [3, None],
    'rf__min_samples_split': [2, 3],
    'gb__n_estimators': [50, 100],
    'gb__learning_rate': [0.01, 0.1],
    'gb__max_depth': [3, 5],
    'ada__n_estimators': [50, 100],
    'ada__learning_rate': [0.01, 0.1]
}

### GridSearchCV
grid_search = GridSearchCV(estimator=voting_clf, param_grid=param_grid, cv=3, scoring='accuracy')

### Fit the models
grid_search.fit(X_traint, y_traint)

### Best parameters
print("Best parameters for Voting Classifier:", grid_search.best_params_)

### Best score
print("Best score for Voting Classifier:", grid_search.best_score_)

voting_predictions = grid_search.best_estimator_.predict(X_testt)
voting_accuracy = accuracy_score(y_testt, voting_predictions)
print(f"Accuracy for Voting Classifier: {voting_accuracy}")

# test answer 

In [39]:
# Load the dataset
new_data = pd.read_csv('test_clean.csv')
df = new_data.iloc[:,:-8]
#test_csv_path = 'test.csv'
#df = pd.read_csv(test_csv_path)

# Splitting the PassengerId into group_num and ppl_num
df[['group_num', 'ppl_num']] = df['PassengerId'].str.split('_', expand=True)

# Finding the maximum number of people per group
group_max = df.groupby('group_num')['ppl_num'].max().reset_index(name='max_num')
df = pd.merge(df, group_max, on='group_num', how='left')
df['max_num'] = df['max_num'].astype(int)

# Function to fill null values in HomePlanet and Destination based on group_num
def fill_based_on_group(df, column):
    # Sort by group_num for efficient forward/backward filling within groups
    df = df.sort_values(by=['group_num', 'ppl_num'])
    # Forward fill and backward fill within each group
    df[column] = df.groupby('group_num')[column].ffill().bfill()
    # For any remaining, fill with the previous row's value
    df[column] = df[column].fillna(method='ffill')
    return df

# Filling nulls in HomePlanet and Destination
df = fill_based_on_group(df, 'HomePlanet')
df = fill_based_on_group(df, 'Destination')

# Generating a 'ticket' feature
df['ticket'] = df['HomePlanet'] + df['Destination']

# Getting dummies
df = pd.get_dummies(df, columns=['ticket'])

# Splitting the Cabin into class, row, and side
df[['cabin_class', 'cabin_row', 'cabin_side']] = df['Cabin'].str.split('/', expand=True)

# Filling nulls in cabin information using the same method as for HomePlanet and Destination
df = fill_based_on_group(df, 'cabin_class')
df = fill_based_on_group(df, 'cabin_row')
df = fill_based_on_group(df, 'cabin_side')

# Fill VIP nulls with False
df['VIP'] = df['VIP'].fillna(False)

# Fill Age nulls with a random int between 0 and 65
np.random.seed(666)  # For reproducibility

df['Age'] = df['Age'].fillna(pd.Series(np.random.randint(0, 67, size=len(df))))


# df['CryoSleep'] = df['CryoSleep'].fillna(df['Transported'])
df['sum_expense'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
df['CryoSleep'] = df['CryoSleep'].fillna(df['sum_expense'] == 0)

# Fill CrpoSleep with False where expense is not 0
df['CryoSleep'] = df['CryoSleep'].fillna(False)

# Creating age bins
df['AgeBin'] = pd.cut(df['Age'], bins=[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100], right=False)

# Amenities columns to fill NA based on age bins
amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Calculate median for each age bin and amenity
amenities_median = df.groupby('AgeBin')[amenities].median()

# Function to fill NA in amenities based on age bin
def fill_amenities_na(row, amenities, amenities_median):
    for amenity in amenities:
        if pd.isna(row[amenity]):
            if row['CryoSleep'] == True:
                fill_value = 0
            else:
                age_bin = row['AgeBin']
                fill_value = amenities_median.loc[age_bin, amenity]
            row[amenity] = fill_value
    return row

# Apply the function to fill NA values in amenities
df = df.apply(lambda row: fill_amenities_na(row, amenities, amenities_median), axis=1)

# Drop the temporary AgeBin column
df.drop('AgeBin', axis=1, inplace=True)
df['sum_expense'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)

cabin_class_dummies = pd.get_dummies(df['cabin_class'], prefix='cabin_class')
cabin_side_dummies = pd.get_dummies(df['cabin_side'], prefix='cabin_side')

# Concatenating the dummy columns to the original dataframe
df = pd.concat([df, cabin_class_dummies, cabin_side_dummies], axis=1)

# Dropping the original 'cabin_class' and 'cabin_side' columns as corrected
df.drop('cabin_side', axis=1, inplace=True)
# Show the processed DataFrame structure

# use
columns = ['PassengerId', 'HomePlanet','Cabin', 'Destination','Name','cabin_class']
test = df.drop(columns, axis = 1)
### convert type to numerical
test = test.astype({col: 'int' for col in test.select_dtypes('bool').columns})

for col in test.columns:
    if test[col].dtype == 'object':
        test[col] = pd.to_numeric(test[col])
df_try_feature = test
df_try_feature['AgeBin'] = pd.cut(df_try_feature['Age'], bins=[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100], right=False)
df_try_feature['age_bin'] = df_try_feature['AgeBin'].astype(str)
age_bin_dummies = pd.get_dummies(df_try_feature['age_bin'], prefix='age_bin')
df_try_feature = pd.concat([df_try_feature, age_bin_dummies], axis=1)
df_try_feature["cabin_class"] = df["cabin_class"]
df_try_feature["cabin_max"] = df_try_feature["cabin_class"] + df_try_feature['max_num'].astype(str)
cabin_age_bin_dummies = pd.get_dummies(df_try_feature['cabin_max'], prefix='cabin_max')
df_try_feature = pd.concat([df_try_feature, cabin_age_bin_dummies], axis=1)

df_try_feature.drop('AgeBin', axis=1, inplace=True)
df_try_feature.drop('age_bin', axis=1, inplace=True)
df_try_feature.drop('cabin_class', axis=1, inplace=True)
df_try_feature.drop('cabin_max', axis=1, inplace=True)

df_try_feature = df_try_feature.astype({col: 'int' for col in df_try_feature.select_dtypes('bool').columns})

for col in df_try_feature.columns:
    if df_try_feature[col].dtype == 'object':
        df_try_feature[col] = pd.to_numeric(df_try_feature[col])

test_use = df_try_feature
missing_cols = set(X_traint.columns) - set(test_use.columns)

for c in missing_cols:
    test_use[c] = 0

# Ensure the order of column in the test set is in the same order than in train set
test_use = test_use[X_traint.columns]


test_pred = voting_classifier.predict(test_use)


  df[column] = df[column].fillna(method='ffill')
  df[column] = df[column].fillna(method='ffill')
  df[column] = df[column].fillna(method='ffill')
  df[column] = df[column].fillna(method='ffill')
  df[column] = df[column].fillna(method='ffill')
  amenities_median = df.groupby('AgeBin')[amenities].median()
  test_use[c] = 0


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 38 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   PassengerId                 4277 non-null   object 
 1   HomePlanet                  4277 non-null   object 
 2   CryoSleep                   4277 non-null   object 
 3   Cabin                       4177 non-null   object 
 4   Destination                 4277 non-null   object 
 5   Age                         4277 non-null   float64
 6   VIP                         4277 non-null   bool   
 7   RoomService                 4277 non-null   float64
 8   FoodCourt                   4277 non-null   float64
 9   ShoppingMall                4277 non-null   float64
 10  Spa                         4277 non-null   float64
 11  VRDeck                      4277 non-null   float64
 12  Name                        4144 non-null   object 
 13  group_num                   4277 

In [41]:
df_test2 = pd.read_csv('test.csv')
df_test2.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [42]:
df_test2 = pd.read_csv('test.csv')
ans = df_test2['PassengerId']
ans_df = pd.DataFrame({
    'PassengerId': ans
})
ans_df["Transported"] = test_pred
ans_df['Transported'] = ans_df['Transported'].replace({1: 'True', 0: 'False'})
ans_df

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [43]:
ans_df.to_csv('welovemrchow.csv', index=False)