# Run Misceleanous Functions

This notebook was used to execute misceleanous commands such as formatting data when needed.

### Define Preprocess Data Function

In [1]:
# Load required libraries

import csv
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
def preprocess(file, csv_name):
    
    # Load the data
    df = pd.read_csv(os.path.join(r'../data', file))

    # Drop rows with null values
    df = df.dropna()

    # Split Cabin into Deck, Num and Side
    df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand=True)

    # Drop Columns for PassengerId, Cabin, Name
    df = df.drop(columns=['PassengerId', 'Cabin', 'Name'])

    # Creating labelEncoder
    le = LabelEncoder()

    # Converting string labels into numbers.
    df['HomePlanet'] = le.fit_transform(df['HomePlanet'])
    df['Destination'] = le.fit_transform(df['Destination'])
    df['Deck'] = le.fit_transform(df['Deck'])
    df['Side'] = le.fit_transform(df['Side'])

    df.to_csv(r'../data/'+str(csv_name))

    print('PREPROCESSING COMPLETE | File saved as ' + csv_name)

In [3]:
def preprocess_test(file, csv_name):
    
    # Load test data
    df = pd.read_csv(r'../data/test.csv')

    # Split Cabin into Deck, Num and Side
    df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand=True)

    # Drop Columns for PassengerId, Cabin, Name
    df = df.drop(columns=['Cabin', 'Name'])

    # creating instance of labelencoder
    le = LabelEncoder()

    # Creating labelEncoder
    le = LabelEncoder()

    # Converting string labels into numbers.
    df['HomePlanet'] = le.fit_transform(df['HomePlanet'])
    df['Destination'] = le.fit_transform(df['Destination'])
    df['Deck'] = le.fit_transform(df['Deck'])
    df['Side'] = le.fit_transform(df['Side'])

    df.to_csv(r'../data/'+str(csv_name))

    print('PREPROCESSING COMPLETE | File saved as ' + csv_name)

### Preprocess training data

In [4]:
# Load the data
train_df = pd.read_csv(r'../data/train.csv')

train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
preprocess('train.csv', 'train_clean.csv')

PREPROCESSING COMPLETE | File saved as train_clean.csv


### Preprocess test data

In [6]:
# Load the data
test_df = pd.read_csv(r'../data/train.csv')

test_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [7]:
preprocess_test('test.csv', 'test_clean.csv')

PREPROCESSING COMPLETE | File saved as test_clean.csv
