In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

from imblearn.over_sampling import RandomOverSampler

import copy

import seaborn as sns


df = pd.read_csv("dataset/train.csv")
print("Original columns:", df.columns)

dataset_cols = [
  "PassengerId",  # Ensure all columns are included
  "CryoSleep",
  "HomePlanet",
  "Cabin",
  "Destination",
  "Age",
  "VIP",
  "RoomService",
  "FoodCourt",
  "ShoppingMall",
  "Spa",
  "VRDeck",
  "Transported"
]

# Ensure the length of dataset_cols matches the number of columns in df
if len(df.columns) == len(dataset_cols):
    df.columns = dataset_cols

else:
    print(f"Length mismatch: DataFrame has {len(df.columns)} columns, but dataset_cols has {len(dataset_cols)} names.")

print("Unique Home Planets:", df['HomePlanet'].unique())
planet_map = {"Europa": 1, "Earth": 2, "Mars": 3, np.nan: -1}  # Include np.nan mapping
df["HomePlanet"] = df["HomePlanet"].map(planet_map).fillna(-1).astype(int)  # home planet to int
df.head()


Original columns: Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')
Length mismatch: DataFrame has 14 columns, but dataset_cols has 13 names.
Unique Home Planets: ['Europa' 'Earth' 'Mars' nan]


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,1,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,2,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,1,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,1,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,2,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
average_age = df['Age'].mean()

df['Age'] = df['Age'].fillna(average_age)

print("Null values in 'Age':", df['Age'].isnull().sum())

Null values in 'Age': 0


In [4]:
average_room = df['RoomService'].mean()

df['RoomService'] = df['RoomService'].fillna(average_room)

print("Null values in 'Room Service':", df['RoomService'].isnull().sum())

Null values in 'Room Service': 0


In [6]:
df = df.dropna(subset=['VIP'])

In [7]:
df.head(200)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,1,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,2,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,1,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,1,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,2,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,0220_03,2,True,G/37/P,TRAPPIST-1e,13.0,False,0.0,0.0,0.0,0.0,0.0,Branca Wilsoney,False
199,0220_04,2,False,E/10/P,TRAPPIST-1e,25.0,False,122.0,84.0,,0.0,0.0,Dont Wilsoney,False
200,0220_05,2,False,F/48/P,TRAPPIST-1e,16.0,False,0.0,3.0,0.0,0.0,1099.0,Velyne Wilsoney,False
201,0220_06,2,False,G/37/P,55 Cancri e,1.0,False,0.0,0.0,0.0,0.0,0.0,Weney Wilsoney,False
