In [18]:
import pandas as pd
df = pd.read_csv('ST_train_feature_final.csv')
#df = pd.read_csv('ST_test_feature_final.csv')

**LogisticRegression**

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

X = df.drop('Transported', axis=1)
y = df['Transported']

# Defining numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(transformers=[('num', StandardScaler(), numerical_features),('cat', OneHotEncoder(), categorical_features)])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),('classifier', LogisticRegression())])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("LogisticRegression accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

LogisticRegression accuracy: 0.7832087406555491
              precision    recall  f1-score   support

       False       0.80      0.74      0.77       861
        True       0.77      0.82      0.79       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**DecisionTree**

In [4]:
from sklearn.tree import DecisionTreeClassifier
X = df.drop('Transported', axis=1)
y = df['Transported']

# Defining numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),('classifier', DecisionTreeClassifier(random_state=42))])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("DecisionTree accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

DecisionTree accuracy: 0.7521564117308798
              precision    recall  f1-score   support

       False       0.75      0.75      0.75       861
        True       0.75      0.76      0.75       878

    accuracy                           0.75      1739
   macro avg       0.75      0.75      0.75      1739
weighted avg       0.75      0.75      0.75      1739



**Random Forest**

In [5]:
from sklearn.ensemble import RandomForestClassifier
X = df.drop('Transported', axis=1)
y = df['Transported']

# Defining numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),('classifier', RandomForestClassifier(random_state=42))])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("RandomForest accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

RandomForest accuracy: 0.79700977573318
              precision    recall  f1-score   support

       False       0.78      0.82      0.80       861
        True       0.82      0.77      0.79       878

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739



**Gradient Boosting Machines (GBM)**

In [6]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.tree import DecisionTreeClassifier
X = df.drop('Transported', axis=1)
y = df['Transported']

# Defining numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),('classifier', GradientBoostingClassifier(random_state=42))])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("Gradient Boosting Machine accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Gradient Boosting Machine accuracy: 0.7947096032202415
              precision    recall  f1-score   support

       False       0.81      0.76      0.79       861
        True       0.78      0.83      0.80       878

    accuracy                           0.79      1739
   macro avg       0.80      0.79      0.79      1739
weighted avg       0.80      0.79      0.79      1739



**Support Vector Machines (SVM)**

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
X = df.drop('Transported', axis=1)
y = df['Transported']

# Defining numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with an SVM classifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', SVC(random_state=42))])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
print("Support Vector Machine accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Support Vector Machine accuracy: 0.7878090856814262
              precision    recall  f1-score   support

       False       0.79      0.77      0.78       861
        True       0.78      0.80      0.79       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739



**K-Nearest Neighbors (KNN)**

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

X = df.drop('Transported', axis=1)
y = df['Transported']

# Defining numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with a KNN classifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', KNeighborsClassifier())])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
print("K-Nearest Neighbors accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

K-Nearest Neighbors accuracy: 0.7699827487061529
              precision    recall  f1-score   support

       False       0.75      0.79      0.77       861
        True       0.79      0.75      0.77       878

    accuracy                           0.77      1739
   macro avg       0.77      0.77      0.77      1739
weighted avg       0.77      0.77      0.77      1739



**Naive Bayes**

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

X = df.drop('Transported', axis=1)
y = df['Transported']

# Defining numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with a Naive Bayes classifier
# Using GaussianNB for numerical features and CategoricalNB for categorical features
#pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                           ('classifier', GaussianNB())])  # Use CategoricalNB() if your features are primarily categorical

to_dense_transformer = FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)

# Create a pipeline with a Naive Bayes classifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('to_dense', to_dense_transformer),
                           ('classifier', GaussianNB())])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
print("Gaussian Naive Bayes accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Gaussian Naive Bayes accuracy: 0.7596319723979299
              precision    recall  f1-score   support

       False       0.77      0.73      0.75       861
        True       0.75      0.79      0.77       878

    accuracy                           0.76      1739
   macro avg       0.76      0.76      0.76      1739
weighted avg       0.76      0.76      0.76      1739



**Feedforward Neural Networks (FNNs)**

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

X = df.drop('Transported', axis=1)
y = df['Transported']

# Defining numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Preprocessing the data
X_processed = preprocessor.fit_transform(X)
y_processed = to_categorical(y)  # Use this for classification. For regression, you don't need to_categorical

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, test_size=0.2, random_state=42)

# Neural network architecture
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(y_processed.shape[1], activation='softmax'))  # Use 'softmax' for classification, 'linear' for regression

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])  # Use 'mean_squared_error' for regression

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Feedforward Neural Networks accuracy: {accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Feedforward Neural Networks accuracy: 0.7826337218284607


**Stacking Ensemble Model**

In [11]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

X = df.drop('Transported', axis=1)
y = df['Transported']

# Defining numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define base models
base_models = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
    ('svc', SVC(probability=True, random_state=42))
]

# Define meta-learner
meta_learner = LogisticRegression()

# Create the stacking model
stacked_model = StackingClassifier(estimators=base_models, final_estimator=meta_learner, cv=5)

# Create a pipeline with the stacked model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', stacked_model)])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = pipeline.predict(X_test)
print("Stacked Model Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Stacked Model Accuracy: 0.7998849913743531
              precision    recall  f1-score   support

       False       0.81      0.77      0.79       861
        True       0.79      0.83      0.81       878

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739



In [17]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# Load your DataFrame here
# df = pd.read_csv('your_data.csv')

# Define the selected features
selected_features = ['HomePlanet', 'CryoSleep', 'Age', 'VIP', 'RoomService', 
                     'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstNameInitial', 
                     'SurnameInitial', 'deck', 'num', 'Group', 'Group_size', 'All_Zero']

# Ensure that df contains the selected features
X = df[selected_features]
y = df['Transported']

# Defining numerical and categorical features within the selected features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define base models
base_models = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
    ('svc', SVC(probability=True, random_state=42))
]

# Define meta-learner
meta_learner = LogisticRegression()

# Create the stacking model
stacked_model = StackingClassifier(estimators=base_models, final_estimator=meta_learner, cv=5)

# Create a pipeline with the stacked model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', stacked_model)])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = pipeline.predict(X_test)
print("Stacked Model Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Stacked Model Accuracy: 0.7941345600920069
              precision    recall  f1-score   support

       False       0.81      0.76      0.79       861
        True       0.78      0.82      0.80       878

    accuracy                           0.79      1739
   macro avg       0.80      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739



In [19]:
# Assuming df_new is your new DataFrame and it has the same structure as df
# Ensure that df_new contains the selected features


X_new = df[selected_features]

# Preprocess and predict using the trained model
y_pred_new = pipeline.predict(X_new)

# Import ST_test.csv
ST_test = pd.read_csv('ST_test.csv')

# Check if ST_test has the same number of rows as y_pred_new
if len(ST_test) == len(y_pred_new):
    # Append the predicted values to ST_test.csv
    ST_test['Transported'] = y_pred_new
    # Save the modified DataFrame to a new CSV file

    # Keep only the PassengerId and Transported columns
    ST_test_result = ST_test[['PassengerId', 'Transported']]

    # Save the modified DataFrame to a new CSV file
    ST_test_result.to_csv('ST_test_result2.csv', index=False)
else:
    print("Error: The length of ST_test and predicted values does not match.")