In [1]:
import pandas as pd
# Load the training table; later cells read the feature columns and the
# 'Transported' target from `df`.
df = pd.read_csv('ST_train_feature_final.csv')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Baseline: score a RandomForest on a hand-picked feature subset using a
# single 80/20 hold-out split.
selected_features = ['All_Zero']

# Feature matrix restricted to the chosen columns, plus the target column.
X = df[selected_features]
y = df['Transported']

# Route columns by dtype: int64/float64 columns are scaled, object/bool columns
# are one-hot encoded. A column of any other dtype matches neither list and is
# dropped by ColumnTransformer (its default is remainder='drop').
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category that appears only in the test
        # split is encoded as all zeros instead of raising at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Fixed seed so the split (and therefore the reported accuracy) is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing lives inside the pipeline so it is fitted on the training split only.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])

pipeline.fit(X_train, y_train)

# Evaluate on the held-out split; keep the score in a variable so later cells
# can reuse it instead of re-reading the printed output.
y_pred = pipeline.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred)
print('Features:', selected_features)
print("Accuracy:", baseline_accuracy)

Features: ['All_Zero']
Accuracy: 0.7406555491661875


In [4]:
import pandas as pd
import itertools

def generate_combinations(features, k):
    """Return every k-element combination of `features` as a list of tuples.

    The combinations come out in the order itertools.combinations yields them.
    """
    return [combo for combo in itertools.combinations(features, k)]

# Column names that the feature-subset search draws from.
features = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'num', 'Group', 'Group_size', 'All_Zero', 'Total_Spend']

# Number of features per subset. One constant drives both the combinations and
# the column labels below, so the two can never disagree (a mismatch would make
# the DataFrame constructor fail).
subset_size = 16

# Every subset of `features` with exactly `subset_size` members.
combinations = generate_combinations(features, subset_size)

# One row per combination, one column per position: Feature_1 .. Feature_<subset_size>.
combinations_df = pd.DataFrame(combinations, columns=[f'Feature_{i+1}' for i in range(subset_size)])

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Score a RandomForest on every feature combination in combinations_df.
# Each row of combinations_df holds one combination of column names; df is the
# table those columns (and the 'Transported' target) are read from.

# The target does not depend on the feature subset, so look it up once.
y = df['Transported']

# One record per evaluated combination, so the subsets can be ranked in code
# rather than by copying numbers out of the printed log.
rf_combination_scores = []

for index, row in combinations_df.iterrows():
    # Column names of the current combination (empty cells, if any, are skipped).
    selected_features = row.dropna().tolist()

    X = df[selected_features]

    # Route columns by dtype: int64/float64 are scaled, object/bool are one-hot
    # encoded. Columns of any other dtype match neither list and are dropped by
    # ColumnTransformer (its default is remainder='drop').
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object', 'bool']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            # handle_unknown='ignore': a category that appears only in the test
            # split is encoded as all zeros instead of raising at predict time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    # Same seed on every iteration, so all combinations share one train/test partition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', RandomForestClassifier(random_state=42))])

    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print('Features:', selected_features)
    print("Accuracy:", accuracy)

    rf_combination_scores.append({'features': selected_features, 'accuracy': accuracy})

# Best-scoring combinations first. Explicit columns keep this valid even when
# combinations_df is empty.
rf_combination_scores_df = pd.DataFrame(
    rf_combination_scores, columns=['features', 'accuracy']
).sort_values('accuracy', ascending=False)

Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'num', 'Group', 'Group_size']
Accuracy: 0.7924094307073031
Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'num', 'Group', 'All_Zero']
Accuracy: 0.7912593444508338
Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'num', 'Group', 'Total_Spend']
Accuracy: 0.7929844738355377
Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'num', 'Group_size', 'All_Zero']
Accuracy: 0.7924094307073031
Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',

In [8]:
# Load the feature sets to re-evaluate. header=None treats every line of the
# file as data; the loops below read each row as one feature set via
# row.dropna().tolist().
# NOTE(review): no cell visible here writes 'best_feature_set.csv' - confirm how it is produced.
best_combinations_df = pd.read_csv('best_feature_set.csv', header=None)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Score a RandomForest on every feature set listed in best_combinations_df.
# Each row of best_combinations_df holds one set of column names; df is the
# table those columns (and the 'Transported' target) are read from.

# The target does not depend on the feature set, so look it up once.
y = df['Transported']

# One record per evaluated feature set, so results can be compared in code
# rather than by reading the printed log.
rf_best_scores = []

for index, row in best_combinations_df.iterrows():
    # Column names of the current feature set (empty cells are skipped).
    selected_features = row.dropna().tolist()

    X = df[selected_features]

    # Route columns by dtype: int64/float64 are scaled, object/bool are one-hot
    # encoded. Columns of any other dtype match neither list and are dropped by
    # ColumnTransformer (its default is remainder='drop').
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object', 'bool']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            # handle_unknown='ignore': a category that appears only in the test
            # split is encoded as all zeros instead of raising at predict time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    # Same seed on every iteration, so all feature sets share one train/test partition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', RandomForestClassifier(random_state=42))])

    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print('Features:', selected_features)
    print("Accuracy:", accuracy)

    rf_best_scores.append({'features': selected_features, 'accuracy': accuracy})

# Best-scoring feature sets first. Explicit columns keep this valid even when
# best_combinations_df is empty.
rf_best_scores_df = pd.DataFrame(
    rf_best_scores, columns=['features', 'accuracy']
).sort_values('accuracy', ascending=False)

Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'Group', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.7958596894767107
Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'num', 'Group', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.7958596894767107
Features: ['HomePlanet', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'num', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.7958596894767107
Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck', 'SurnameInitial', 'deck', 'num', 'Group', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.7964347326049454
Features: ['HomePlanet', 'Destination', 'Age', 'VIP', 'RoomServi

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  # Import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Score a DecisionTree on every feature set listed in best_combinations_df.
# Each row of best_combinations_df holds one set of column names; df is the
# table those columns (and the 'Transported' target) are read from.

# The target does not depend on the feature set, so look it up once.
y = df['Transported']

# One record per evaluated feature set, so results can be compared in code
# rather than by reading the printed log.
dt_best_scores = []

for index, row in best_combinations_df.iterrows():
    # Column names of the current feature set (empty cells are skipped).
    selected_features = row.dropna().tolist()

    X = df[selected_features]

    # Route columns by dtype: int64/float64 are scaled, object/bool are one-hot
    # encoded. Columns of any other dtype match neither list and are dropped by
    # ColumnTransformer (its default is remainder='drop').
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object', 'bool']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            # handle_unknown='ignore': a category that appears only in the test
            # split is encoded as all zeros instead of raising at predict time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    # Same seed on every iteration, so all feature sets share one train/test partition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', DecisionTreeClassifier(random_state=42))])

    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print('Features:', selected_features)
    print("Accuracy:", accuracy)

    dt_best_scores.append({'features': selected_features, 'accuracy': accuracy})

# Best-scoring feature sets first. Explicit columns keep this valid even when
# best_combinations_df is empty.
dt_best_scores_df = pd.DataFrame(
    dt_best_scores, columns=['features', 'accuracy']
).sort_values('accuracy', ascending=False)

Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'Group', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.7481311098332375
Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'num', 'Group', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.7360552041403106
Features: ['HomePlanet', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'num', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.7418056354226567
Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck', 'SurnameInitial', 'deck', 'num', 'Group', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.7395054629097182
Features: ['HomePlanet', 'Destination', 'Age', 'VIP', 'RoomServi

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier  # Import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Score a GradientBoosting classifier on every feature set listed in
# best_combinations_df. Each row of best_combinations_df holds one set of
# column names; df is the table those columns (and the 'Transported' target)
# are read from.

# The target does not depend on the feature set, so look it up once.
y = df['Transported']

# One record per evaluated feature set, so results can be compared in code
# rather than by reading the printed log.
gb_best_scores = []

for index, row in best_combinations_df.iterrows():
    # Column names of the current feature set (empty cells are skipped).
    selected_features = row.dropna().tolist()

    X = df[selected_features]

    # Route columns by dtype: int64/float64 are scaled, object/bool are one-hot
    # encoded. Columns of any other dtype match neither list and are dropped by
    # ColumnTransformer (its default is remainder='drop').
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object', 'bool']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            # handle_unknown='ignore': a category that appears only in the test
            # split is encoded as all zeros instead of raising at predict time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    # Same seed on every iteration, so all feature sets share one train/test partition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', GradientBoostingClassifier(random_state=42))])

    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print('Features:', selected_features)
    print("Accuracy:", accuracy)

    gb_best_scores.append({'features': selected_features, 'accuracy': accuracy})

# Best-scoring feature sets first. Explicit columns keep this valid even when
# best_combinations_df is empty.
gb_best_scores_df = pd.DataFrame(
    gb_best_scores, columns=['features', 'accuracy']
).sort_values('accuracy', ascending=False)

Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'Group', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.7814836112708453
Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'num', 'Group', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.7843588269120184
Features: ['HomePlanet', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'num', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.7889591719378953
Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck', 'SurnameInitial', 'deck', 'num', 'Group', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.7878090856814262
Features: ['HomePlanet', 'Destination', 'Age', 'VIP', 'RoomServi

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC  # Import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Score an SVM classifier (SVC) on every feature set listed in
# best_combinations_df. Each row of best_combinations_df holds one set of
# column names; df is the table those columns (and the 'Transported' target)
# are read from.

# The target does not depend on the feature set, so look it up once.
y = df['Transported']

# One record per evaluated feature set, so results can be compared in code
# rather than by reading the printed log.
svc_best_scores = []

for index, row in best_combinations_df.iterrows():
    # Column names of the current feature set (empty cells are skipped).
    selected_features = row.dropna().tolist()

    X = df[selected_features]

    # Route columns by dtype: int64/float64 are scaled, object/bool are one-hot
    # encoded. Columns of any other dtype match neither list and are dropped by
    # ColumnTransformer (its default is remainder='drop').
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object', 'bool']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            # handle_unknown='ignore': a category that appears only in the test
            # split is encoded as all zeros instead of raising at predict time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    # Same seed on every iteration, so all feature sets share one train/test partition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # NOTE(review): per the scikit-learn docs, SVC's random_state only matters
    # when probability=True; it is kept here for parity with the other model cells.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', SVC(random_state=42))])

    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print('Features:', selected_features)
    print("Accuracy:", accuracy)

    svc_best_scores.append({'features': selected_features, 'accuracy': accuracy})

# Best-scoring feature sets first. Explicit columns keep this valid even when
# best_combinations_df is empty.
svc_best_scores_df = pd.DataFrame(
    svc_best_scores, columns=['features', 'accuracy']
).sort_values('accuracy', ascending=False)

Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'Group', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.7849338700402531
Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'num', 'Group', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.7889591719378953
Features: ['HomePlanet', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 'SurnameInitial', 'deck', 'num', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.780333525014376
Features: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck', 'SurnameInitial', 'deck', 'num', 'Group', 'Group_size', 'All_Zero', 'Total_Spend']
Accuracy: 0.78205865439908
Features: ['HomePlanet', 'Destination', 'Age', 'VIP', 'RoomService'