In [46]:
import pandas as pd
# Feature table that the modelling cells below read their inputs from.
TRAIN_FEATURES_CSV = 'ST_train_feature_final.csv'
# Alternate input that used to be toggled in here by hand: 'ST_test_feature_final.csv'
df = pd.read_csv(filepath_or_buffer=TRAIN_FEATURES_CSV)

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Baseline model: RandomForest with default hyper-parameters on the selected
# feature columns, evaluated on a 20% hold-out split.

# Columns fed to the model; every name must be present in df.
selected_features = ['HomePlanet', 'CryoSleep', 'Age', 'VIP', 'RoomService', 
                     'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 
                     'SurnameInitial', 'deck', 'num', 'Group', 'Group_size', 'All_Zero']

# Feature matrix and target.
X = df[selected_features]
y = df['Transported']

# Route columns to a transformer by dtype. Columns of any other dtype are not
# listed in either group and are dropped by ColumnTransformer's default
# remainder='drop'.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Scale numeric columns and one-hot encode categorical ones.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising, so a rare value that lands only in the
# hold-out split, a CV fold, or a later scoring file cannot crash predict().
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing and classifier are chained so the scaler/encoder are fitted
# on the training rows only.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the hold-out rows and report accuracy plus per-class metrics.
y_pred = pipeline.predict(X_test)
print("RandomForest accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

RandomForest accuracy: 0.8050603795284647
              precision    recall  f1-score   support

       False       0.79      0.82      0.81       861
        True       0.82      0.79      0.80       878

    accuracy                           0.81      1739
   macro avg       0.81      0.81      0.81      1739
weighted avg       0.81      0.81      0.81      1739



In [48]:
# Score the competition test set with the trained pipeline and write the
# submission file.

# Load the test-set feature table explicitly rather than reusing `df`:
# `df` holds the training features, so predicting on it would yield one
# prediction per *training* row and the length check below would fail.
# NOTE(review): assumes 'ST_test_feature_final.csv' has the same feature
# columns as the training table and the same row order as 'ST_test.csv' - confirm.
df_new = pd.read_csv('ST_test_feature_final.csv')
X_new = df_new[selected_features]

# Preprocess and predict using the trained model
y_pred_new = pipeline.predict(X_new)

# Raw test file; only used here for its PassengerId column.
ST_test = pd.read_csv('ST_test.csv')

# Predictions are attached positionally, so the row counts must agree.
if len(ST_test) == len(y_pred_new):
    # Attach the predicted labels to the raw test rows.
    ST_test['Transported'] = y_pred_new

    # Keep only the PassengerId and Transported columns
    ST_test_result = ST_test[['PassengerId', 'Transported']]

    # Save the result to a new CSV file
    ST_test_result.to_csv('ST_test_result.csv', index=False)
else:
    print("Error: The length of ST_test and predicted values does not match.")

In [3]:
from sklearn.model_selection import GridSearchCV

# Exhaustive hyper-parameter search over the RandomForest step of `pipeline`.
# Keys use the '<step>__<param>' form so they reach the 'classifier' step.
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],      # trees in the forest
    classifier__max_depth=[None, 10, 20, 30],      # depth cap per tree (None = unlimited)
    classifier__min_samples_split=[2, 5, 10],      # samples needed to split a node
    classifier__min_samples_leaf=[1, 2, 4],        # samples needed at a leaf
)

# 5-fold cross-validated search using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Winning configuration and its mean cross-validated score.
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Evaluate the refitted best pipeline on the hold-out rows.
best_pipeline = grid_search.best_estimator_
y_pred = best_pipeline.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred)
print("RandomForest accuracy with best parameters:", tuned_accuracy)
tuned_report = classification_report(y_test, y_pred)
print(tuned_report)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters: {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 300}
Best score: 0.80198563219877
RandomForest accuracy with best parameters: 0.79700977573318
              precision    recall  f1-score   support

       False       0.79      0.80      0.80       861
        True       0.80      0.79      0.80       878

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739



In [40]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# Tuned model: RandomForest with hand-picked hyper-parameters on the selected
# feature columns, evaluated on a 20% hold-out split, followed by a ranking
# of the transformed features by importance.

# Columns fed to the model; every name must be present in df.
selected_features = ['HomePlanet', 'CryoSleep', 'Age', 'VIP', 'RoomService', 
                     'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 
                     'SurnameInitial', 'deck', 'num', 'Group', 'Group_size', 'All_Zero']

# Feature matrix and target.
X = df[selected_features]
y = df['Transported']

# Route columns to a transformer by dtype. Columns of any other dtype are not
# listed in either group and are dropped by ColumnTransformer's default
# remainder='drop'.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Scale numeric columns and one-hot encode categorical ones.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising, so a rare value that lands only in the
# hold-out split or a later scoring file cannot crash predict().
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with a RandomForest classifier with adjusted parameters
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(n_estimators=200, 
                                                                 criterion='entropy', 
                                                                 max_depth=40, 
                                                                 min_samples_split=10, 
                                                                 min_samples_leaf=1, 
                                                                 random_state=42))])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the hold-out rows and report accuracy plus per-class metrics.
y_pred = pipeline.predict(X_test)
print("RandomForest accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Extract feature importances from the fitted forest.
feature_importances = pipeline.named_steps['classifier'].feature_importances_

# Rebuild the transformed column names in ColumnTransformer output order
# ('num' block first, then the one-hot 'cat' block). The fitted encoder is
# read from the pipeline step itself rather than from the outer
# `preprocessor` variable, so this does not depend on Pipeline fitting its
# steps in place.
fitted_preprocessor = pipeline.named_steps['preprocessor']
fitted_encoder = fitted_preprocessor.named_transformers_['cat']
feature_names_transformed = list(numerical_features) + \
                            list(fitted_encoder.get_feature_names_out(categorical_features))

# One row per transformed feature, most important first.
importances_df = pd.DataFrame({'Feature': feature_names_transformed, 'Importance': feature_importances})
importances_df = importances_df.sort_values(by='Importance', ascending=False)

print(importances_df.head(25))

RandomForest accuracy: 0.8016101207590569
              precision    recall  f1-score   support

       False       0.79      0.81      0.80       861
        True       0.81      0.79      0.80       878

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739

               Feature  Importance
4                  Spa    0.077655
5               VRDeck    0.066218
2            FoodCourt    0.065842
6                  num    0.064089
7                Group    0.062744
3         ShoppingMall    0.055470
0                  Age    0.054044
1          RoomService    0.053543
73      All_Zero_False    0.052953
74       All_Zero_True    0.048293
12     CryoSleep_False    0.039151
13      CryoSleep_True    0.036692
9     HomePlanet_Earth    0.023844
8           Group_size    0.020587
10   HomePlanet_Europa    0.018443
71              deck_G    0.016363
70              deck_F    0.013918
6