In [1]:
import numpy as np 
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
#This function cleans the data. trans=True is only for the training data, False for test data.

def clean_training_set(dataframe, trans=True):
    """Load a passenger table and return the numeric feature frame used for modelling.

    Parameters
    ----------
    dataframe : str, path-like, file-like or pandas.DataFrame
        CSV source forwarded to ``pd.read_csv``. An already loaded DataFrame
        is accepted as well; it is copied and never modified.
    trans : bool, default True
        True for training data: the ``Transported`` column is encoded and
        returned as ``Transported_Label``. False for test data, which has no
        ``Transported`` column.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: Age, HomePlanet_Label, RoomService, FoodCourt,
        ShoppingMall, Spa, VRDeck, [Transported_Label when ``trans``],
        CryoSleep_Label, VIP_Label.

    Raises
    ------
    KeyError
        If one of the expected input columns is missing.

    Notes
    -----
    Missing values are replaced by the mean of the same column, computed on
    the table being cleaned. The encoded categorical columns are imputed the
    same way, so they can hold fractional values.
    """
    label_dict = {True: 1, False: 0}
    planet_dict = {"Earth": 0, "Mars": 1, "Europa": 2}
    destination_dict = {"TRAPPIST-1e": 0, "PSO J318.5-22": 1,
                        "55 Cancri e": 2}

    if isinstance(dataframe, pd.DataFrame):
        train_df = dataframe.copy()
    else:
        train_df = pd.read_csv(dataframe)

    train_df["CryoSleep_Label"] = train_df["CryoSleep"].map(label_dict)
    train_df["VIP_Label"] = train_df["VIP"].map(label_dict)
    train_df["HomePlanet_Label"] = train_df["HomePlanet"].map(planet_dict)
    # Destinations have their own code table; mapping them through planet_dict
    # turned every value into NaN. This column is not part of the returned
    # features (kept that way so the model input is unchanged).
    train_df["Destination_Label"] = train_df["Destination"].map(destination_dict)

    # Fill NaN values with the column mean. Assigning the result back (instead
    # of chained ``fillna(..., inplace=True)``) keeps this working under pandas
    # Copy-on-Write, where the in-place form no longer updates the frame.
    mean_filled_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall",
                        "Spa", "VRDeck", "CryoSleep_Label", "VIP_Label",
                        "HomePlanet_Label", "Destination_Label"]
    for col in mean_filled_cols:
        train_df[col] = train_df[col].fillna(train_df[col].mean())

    feature_cols = ["Age", "HomePlanet_Label", "RoomService", "FoodCourt",
                    "ShoppingMall", "Spa", "VRDeck"]
    if trans:
        # Only the training table carries the target column.
        train_df["Transported_Label"] = train_df["Transported"].map(label_dict)
        feature_cols.append("Transported_Label")
    feature_cols += ["CryoSleep_Label", "VIP_Label"]

    return train_df[feature_cols]

In [3]:
# Load and clean the training CSV; trans defaults to True, so the result
# includes the Transported_Label target column.
data_df = clean_training_set("train.csv")

In [4]:
# Target vector first, then the feature matrix: every column except the
# target, with any NaN still present replaced by 0.
y = data_df["Transported_Label"]
X = data_df.drop(columns=["Transported_Label"]).fillna(0)

In [67]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [83]:
# Support-vector classifier with library-default hyperparameters.
# NOTE(review): random_state presumably only has an effect when
# probability=True -- confirm against the scikit-learn SVC documentation.
model = svm.SVC(random_state=1)

In [84]:
# Train the classifier on the training split only.
model.fit(X_train, y_train)

In [85]:
# Score on the data the model was fitted on, scaled to a percentage; compare
# with the hold-out accuracy computed further down.
model.score(X_train, y_train)*100

79.16187345932622

In [71]:
# Predict labels for the held-out split.
y_prediction = model.predict(X_test)
print(y_prediction)

[0 1 1 ... 0 1 1]


In [72]:
# Accuracy of the hold-out predictions against the true labels.
results = accuracy_score(y_test, y_prediction)

In [73]:
# Hold-out accuracy as a percentage.
print(results*100)

77.87576687116564


In [77]:
# Clean the test CSV without the target column (trans=False), replace any NaN
# still present with 0, and predict with the fitted model.
final_submission = clean_training_set("test.csv", trans=False)
final_results = model.predict(final_submission.fillna(value=0))
print(final_results)

[1 0 1 ... 1 1 1]


In [16]:
# Build the submission table: the PassengerId column from the test CSV next to
# the predicted label, written as the strings "True"/"False".
final_df = pd.read_csv("test.csv")["PassengerId"]
label_dict = {0: "False", 1: "True"}
# Both columns are aligned on their default positional index, which matches
# because the predictions were produced from the same test.csv rows.
final = pd.DataFrame({
    "PassengerId": final_df,
    "Transported": pd.Series(final_results).map(label_dict),
})
final
#final.to_csv("final_submission_SVM.csv", index=False)