In [102]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

In [88]:
# Read the Titanic passenger CSV from its remote location into a DataFrame
url = "https://pages.mtu.edu/~cai/sat5165/titanic_data.csv"
titanic_data = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows to sanity-check the load
titanic_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [89]:
# Display the loaded DataFrame; the rendered output elides the middle rows
titanic_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [90]:
# The survival flag is the prediction target; all remaining columns are
# kept as candidate features (the ColumnTransformer below selects from them)
y = titanic_data['Survived']
X = titanic_data.drop(columns=['Survived'])

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Columns handled by each preprocessing branch
numeric_features = ['Age', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# Numeric branch: fill gaps with the column mean, then standardise
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most common value, then one-hot
# encode; categories not seen during fitting are ignored at transform time
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group through its own branch
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Learn the preprocessing statistics from the training split only ...
X_train_processed = preprocessor.fit_transform(X_train)

# ... and reuse those fitted statistics on the held-out split
X_test_processed = preprocessor.transform(X_test)

In [91]:
# Initialize classifiers.
# The random forest is seeded so its reported metrics are reproducible
# from run to run, matching the already-seeded neural network.
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(3, 3), random_state=42)
}

# Train each classifier on the preprocessed training split and score it on
# the preprocessed test split. `results` maps the classifier's display name
# to a dict holding its accuracy and (binary) F1 score.
results = {}
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train_processed, y_train)

    # Predict on the test set
    y_pred = clf.predict(X_test_processed)

    # Store accuracy and F1 score for the report below
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
    }

# Print results
print("Results on the test set:")
for name, scores in results.items():
    print(f"{name}:")
    print(f"  Accuracy: {scores['Accuracy']:.4f}")
    print(f"  F1 Score: {scores['F1 Score']:.4f}")
    print()

Results on the test set:
Logistic Regression:
  Accuracy: 0.7933
  F1 Score: 0.7448

Naive Bayes:
  Accuracy: 0.7654
  F1 Score: 0.7342

Random Forest:
  Accuracy: 0.7821
  F1 Score: 0.7417

SVM:
  Accuracy: 0.8045
  F1 Score: 0.7368

K-Nearest Neighbors:
  Accuracy: 0.7877
  F1 Score: 0.7397

Neural Network:
  Accuracy: 0.8045
  F1 Score: 0.7482





In [106]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import (
    StringIndexer,
    VectorAssembler,
    Imputer,
    StandardScaler
)
from pyspark.ml import Pipeline
from pyspark.ml.classification import (
    LogisticRegression,
    NaiveBayes,
    RandomForestClassifier,
    GBTClassifier,
    DecisionTreeClassifier,
    MultilayerPerceptronClassifier
)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

# Start (or reuse) the Spark session used by the rest of this cell
spark = (
    SparkSession.builder
    .appName("Titanic Spark ML")
    .getOrCreate()
)

# Read the CSV, treating the first line as a header and inferring column types
file_path = "/content/titanic_data.csv"
titanic_df = spark.read.csv(file_path, header=True, inferSchema=True)

# Preview the loaded rows
titanic_df.show()

# Column roles: the label to predict and the raw feature columns
target_col = 'Survived'
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Columns as they enter the assembled feature vector
numeric_cols = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_cols = ['Pclass', 'Sex_index', 'Embarked_index']
input_cols = numeric_cols + categorical_cols

# Preprocessing stages.
# One StringIndexer per string column, writing to "<column>_index";
# handleInvalid="keep" keeps rows with invalid values instead of failing.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index", handleInvalid="keep")
    for name in ['Sex', 'Embarked']
]

# Fill missing numeric values, writing each result to "<column>_imputed"
imputer = Imputer(
    inputCols=numeric_cols,
    outputCols=[f"{name}_imputed" for name in numeric_cols],
)

# Pack the imputed numeric columns and the categorical columns into "features"
assembler = VectorAssembler(
    inputCols=[f"{name}_imputed" for name in numeric_cols] + categorical_cols,
    outputCol="features",
)

# NOTE: StandardScaler here is the pyspark.ml.feature class imported in this
# cell; that import rebinds the name the first cell took from scikit-learn,
# so the scikit-learn cells need their imports re-run before being re-executed.
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

# Evaluate function to calculate F1 score
def calculate_f1(predictions, label_col=None, prediction_col="prediction"):
    """Return the F1 score of the positive class (label 1) for a predictions frame.

    The confusion counts are gathered with a single grouped aggregation
    rather than one filter-and-count job per cell of the confusion matrix.

    Args:
        predictions: DataFrame holding the true label and the predicted label.
        label_col: Name of the true-label column. Defaults to the notebook's
            module-level ``target_col`` when omitted.
        prediction_col: Name of the predicted-label column.

    Returns:
        The F1 score as a float. Returns 0.0 when there are no true positives
        (for example when the model predicts no positives at all), where
        precision and/or recall would otherwise require dividing by zero.
    """
    if label_col is None:
        label_col = target_col

    true_positive = 0
    false_positive = 0
    false_negative = 0

    # One row per distinct (label, prediction) pair, with its row count
    for row in predictions.groupBy(label_col, prediction_col).count().collect():
        label = row[label_col]
        predicted = row[prediction_col]
        n_rows = row["count"]
        if label == 1 and predicted == 1:
            true_positive += n_rows
        elif label == 0 and predicted == 1:
            false_positive += n_rows
        elif label == 1 and predicted == 0:
            false_negative += n_rows

    # With zero true positives every remaining division is 0/0 or 0/x;
    # report 0.0 instead of raising ZeroDivisionError.
    if true_positive == 0:
        return 0.0

    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)

    f1 = 2 * (precision * recall) / (precision + recall)

    return f1

# Define classifiers.
# The GBTClassifier entry is labelled as gradient-boosted trees (it was
# previously reported under an "SVM" label, which it is not). The random
# forest and GBT models are seeded so repeated runs report the same metrics.
classifiers = {
    "Logistic Regression": LogisticRegression(labelCol=target_col),
    "Naive Bayes": NaiveBayes(labelCol=target_col),
    "Random Forest": RandomForestClassifier(labelCol=target_col, seed=42),
    "Gradient-Boosted Trees": GBTClassifier(labelCol=target_col, seed=42),
    "Decision Tree": DecisionTreeClassifier(labelCol=target_col),
    "Multilayer Perceptron": MultilayerPerceptronClassifier(labelCol=target_col, featuresCol="scaled_features", layers=[len(input_cols), 10, 2, 2], seed=42)
}

# Hold out 20% of the rows so the reported metrics come from data the models
# were not fitted on; the seed keeps the split reproducible.
train_df, test_df = titanic_df.randomSplit([0.8, 0.2], seed=42)

# The accuracy evaluator does not depend on the classifier, so build it once
evaluator = MulticlassClassificationEvaluator(labelCol=target_col, predictionCol="prediction", metricName="accuracy")

# Train and evaluate each classifier. `results` maps the classifier's display
# name to a dict holding its accuracy and F1 score on the held-out split.
results = {}
for name, clf in classifiers.items():
    # All preprocessing stages are fitted together with the classifier on the
    # training split only, then applied unchanged to the held-out split.
    pipeline = Pipeline(stages=indexers + [imputer, assembler, scaler, clf])
    model = pipeline.fit(train_df)
    predictions = model.transform(test_df)

    # Calculate accuracy
    accuracy = evaluator.evaluate(predictions)

    # Calculate F1 score
    f1 = calculate_f1(predictions)

    results[name] = {"Accuracy": accuracy, "F1 Score": f1}

# Print results
print("Results on the test set:")
for name, scores in results.items():
    print(f"{name}:")
    print(f"  Accuracy: {scores['Accuracy']:.4f}")
    print(f"  F1 Score: {scores['F1 Score']:.4f}")
    print()

# Stop SparkSession
spark.stop()


+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|      