Data Loading and Exploration

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the training set from disk into a DataFrame
train_data = pd.read_csv('/train[1].csv')

# Print three quick views of the frame, in this order:
#   1. the first few rows,
#   2. the number of missing values per column,
#   3. basic summary statistics of the numeric columns.
exploration_views = (
    train_data.head(),
    train_data.isna().sum(),
    train_data.describe(),
)
for view in exploration_views:
    print(view)


  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  
4         True  
Pa

Data Preprocessing

In [None]:
# Preprocess the training frame: impute missing values, drop `Name`, and
# label-encode the categorical columns so that every feature is numeric.
#
# Each imputation assigns the filled column back to the frame instead of
# calling `fillna(..., inplace=True)` on a column selection. The chained
# in-place form emits a FutureWarning in pandas 2.x and no longer updates the
# parent frame once Copy-on-Write is enabled.

# Fill missing values for the categorical / boolean columns with the mode
# (most frequent value)
for column in ['CryoSleep', 'VIP', 'HomePlanet', 'Cabin', 'Destination']:
    train_data[column] = train_data[column].fillna(train_data[column].mode()[0])

# Fill missing values for Age with the median
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())

# Fill missing values for luxury spending with 0 (no spending)
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train_data[spend_columns] = train_data[spend_columns].fillna(0)

# Drop the Name column as it is unlikely to be useful for prediction
train_data = train_data.drop(columns=['Name'])

# Encode categorical features.
# One LabelEncoder is fitted per column and kept in `encoders`, so the
# label -> integer mapping of every column stays available after the loop.
# Refitting a single shared encoder would retain only the last column's
# mapping. `le` is still bound (to the last encoder fitted, i.e. the one for
# 'VIP'), exactly as before.
encoders = {}
for column in ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column])
    encoders[column] = le

# Check the data types and make sure all features are numeric
print(train_data.dtypes)


PassengerId      object
HomePlanet        int64
CryoSleep         int64
Cabin             int64
Destination       int64
Age             float64
VIP               int64
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Transported        bool
dtype: object


Model Training

In [None]:
# Target is the `Transported` flag; features are every remaining column except
# the passenger identifier.
y = train_data['Transported']
X = train_data.drop(columns=['PassengerId', 'Transported'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaler learns its statistics from the training
# split only and is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# Fit a 100-tree random forest on the scaled training split
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict the validation split and report accuracy plus per-class metrics
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')
print(classification_report(y_val, y_pred))


Validation Accuracy: 0.772857964347326
              precision    recall  f1-score   support

       False       0.75      0.80      0.78       861
        True       0.79      0.74      0.77       878

    accuracy                           0.77      1739
   macro avg       0.77      0.77      0.77      1739
weighted avg       0.77      0.77      0.77      1739



Prediction on Test Set

In [None]:
# Load the test data
test_data = pd.read_csv('/test[1].csv')

# Re-read the raw training file. `train_data` has already been imputed and
# label-encoded, so the original category labels and fill statistics have to
# be recovered from the source CSV.
train_raw = pd.read_csv('/train[1].csv')

categorical_columns = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Preprocess the test data with what was learned from the TRAINING data.
# Refitting the encoder on the test set (`le.fit_transform(test_data[...])`)
# assigns integer codes from the test set's own labels, which need not match
# the codes the classifier was trained on.
for column in categorical_columns:
    train_mode = train_raw[column].mode()[0]
    test_data[column] = test_data[column].fillna(train_mode)

    # Fitting on the mode-filled training column reproduces the mapping that
    # the preprocessing cell applied to the training data.
    encoder = LabelEncoder().fit(train_raw[column].fillna(train_mode))
    code_of = {label: code for code, label in enumerate(encoder.classes_)}

    # Labels never seen in training have no code; they get the sentinel -1.
    # NOTE(review): `Cabin` has many distinct values, so a large share of test
    # cabins may be unseen - consider splitting it into coarser features
    # before training.
    test_data[column] = test_data[column].map(code_of).fillna(-1).astype(int)

# Age: median of the training data; luxury spending: 0 (no spending)
test_data['Age'] = test_data['Age'].fillna(train_raw['Age'].median())
test_data[spend_columns] = test_data[spend_columns].fillna(0)

test_data = test_data.drop(columns=['Name'])

# Define features
X_test = test_data.drop(columns=['PassengerId'])

# Standardize the features with the scaler fitted on the training split
X_test = scaler.transform(X_test)

# Make predictions
test_predictions = clf.predict(X_test)

# Prepare the submission file
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': test_predictions
})

submission.to_csv('submission.csv', index=False)


In [None]:
# Printing the submission file
# (The description above was typed as bare text in a code cell, which raised a
# SyntaxError; it is now a comment followed by the statement it describes.)
print(submission.head())

SyntaxError: invalid syntax (<ipython-input-22-2644d26f5221>, line 1)

In [None]:
# Load the test data
test_data = pd.read_csv('/test[1].csv')

# Re-read the raw training file. `train_data` has already been imputed and
# label-encoded, so the original category labels and fill statistics have to
# be recovered from the source CSV.
train_raw = pd.read_csv('/train[1].csv')

categorical_columns = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Preprocess the test data with what was learned from the TRAINING data.
# Refitting the encoder on the test set (`le.fit_transform(test_data[...])`)
# assigns integer codes from the test set's own labels, which need not match
# the codes the classifier was trained on.
for column in categorical_columns:
    train_mode = train_raw[column].mode()[0]
    test_data[column] = test_data[column].fillna(train_mode)

    # Fitting on the mode-filled training column reproduces the mapping that
    # the preprocessing cell applied to the training data.
    encoder = LabelEncoder().fit(train_raw[column].fillna(train_mode))
    code_of = {label: code for code, label in enumerate(encoder.classes_)}

    # Labels never seen in training have no code; they get the sentinel -1.
    test_data[column] = test_data[column].map(code_of).fillna(-1).astype(int)

# Age: median of the training data; luxury spending: 0 (no spending)
test_data['Age'] = test_data['Age'].fillna(train_raw['Age'].median())
test_data[spend_columns] = test_data[spend_columns].fillna(0)

test_data = test_data.drop(columns=['Name'])

# Define features
X_test = test_data.drop(columns=['PassengerId'])

# Standardize the features with the scaler fitted on the training split
X_test = scaler.transform(X_test)

# Make predictions
test_predictions = clf.predict(X_test)

# Prepare the submission file
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': test_predictions
})

submission.to_csv('submission.csv', index=False)
print(submission.head())


  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True


Downloading

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Load the test data
test_data = pd.read_csv('/test[1].csv')

# Re-read the raw training file. `train_data` has already been imputed and
# label-encoded, so the original category labels and fill statistics have to
# be recovered from the source CSV.
train_raw = pd.read_csv('/train[1].csv')

categorical_columns = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Preprocess the test data with what was learned from the TRAINING data.
# A fresh LabelEncoder fitted on the test set assigns integer codes from the
# test set's own labels, which need not match the codes the classifier was
# trained on.
for column in categorical_columns:
    train_mode = train_raw[column].mode()[0]
    test_data[column] = test_data[column].fillna(train_mode)

    # Fitting on the mode-filled training column reproduces the mapping that
    # the preprocessing cell applied to the training data.
    encoder = LabelEncoder().fit(train_raw[column].fillna(train_mode))
    code_of = {label: code for code, label in enumerate(encoder.classes_)}

    # Labels never seen in training have no code; they get the sentinel -1.
    test_data[column] = test_data[column].map(code_of).fillna(-1).astype(int)

# Age: median of the training data; luxury spending: 0 (no spending)
test_data['Age'] = test_data['Age'].fillna(train_raw['Age'].median())
test_data[spend_columns] = test_data[spend_columns].fillna(0)

test_data = test_data.drop(columns=['Name'])

# Define features
X_test = test_data.drop(columns=['PassengerId'])

# Standardize the features with the scaler fitted in the Model Training
# section. Creating a new StandardScaler and calling fit_transform on the test
# set would rescale the features with test-set statistics, so the classifier
# would receive inputs on a different scale from the one it was trained on.
X_test = scaler.transform(X_test)

# `clf` is the RandomForestClassifier fitted in the Model Training section.
# It must not be refitted here: the test frame prepared in this cell has no
# 'Transported' column to fit against.

# Make predictions
test_predictions = clf.predict(X_test)

# Prepare the submission file
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': test_predictions
})

# Create the directory if it does not exist
output_dir = '/mnt/data'
os.makedirs(output_dir, exist_ok=True)

# Save the submission file
submission_file_path = os.path.join(output_dir, 'submission.csv')
submission.to_csv(submission_file_path, index=False)

# Print the test predictions
print(submission.head())

# Show where the submission file was written
submission_file_path



  PassengerId  Transported
0     0013_01        False
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True


'/mnt/data/submission.csv'

Downloading to the Downloads folder

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Needed below to rebuild the training split; `os`, `pd`, `LabelEncoder` and
# `StandardScaler` are imported at the top of this cell.
from sklearn.model_selection import train_test_split

# Load the test data
# NOTE(review): this is the same path the earlier cells read from; adjust it
# if the notebook is run on a machine where the file lives elsewhere.
test_data = pd.read_csv('/test[1].csv')

# Re-read the raw training file. `train_data` has already been imputed and
# label-encoded, so the original category labels and fill statistics have to
# be recovered from the source CSV.
train_raw = pd.read_csv('/train[1].csv')

categorical_columns = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Preprocess the test data with what was learned from the TRAINING data.
# A fresh LabelEncoder fitted on the test set assigns integer codes from the
# test set's own labels, which need not match the codes the classifier was
# trained on.
for column in categorical_columns:
    train_mode = train_raw[column].mode()[0]
    test_data[column] = test_data[column].fillna(train_mode)

    # Fitting on the mode-filled training column reproduces the mapping that
    # the preprocessing cell applied to the training data.
    encoder = LabelEncoder().fit(train_raw[column].fillna(train_mode))
    code_of = {label: code for code, label in enumerate(encoder.classes_)}

    # Labels never seen in training have no code; they get the sentinel -1.
    test_data[column] = test_data[column].map(code_of).fillna(-1).astype(int)

# Age: median of the training data; luxury spending: 0 (no spending)
test_data['Age'] = test_data['Age'].fillna(train_raw['Age'].median())
test_data[spend_columns] = test_data[spend_columns].fillna(0)

test_data = test_data.drop(columns=['Name'])

# Define features
X_test = test_data.drop(columns=['PassengerId'])

# Standardize the features with training-set statistics.
# The name `scaler` may have been rebound by an earlier cell to a scaler fitted
# on the test set, so the training scaler is rebuilt here: `X` and `y` are the
# unscaled feature frame and target from the Model Training section, and the
# split uses the same test_size / random_state as that section, which yields
# the same training rows.
X_train_unscaled, _, _, _ = train_test_split(X, y, test_size=0.2, random_state=42)
train_scaler = StandardScaler().fit(X_train_unscaled)
X_test = train_scaler.transform(X_test)

# `clf` is the RandomForestClassifier fitted in the Model Training section.
# It must not be refitted here: the test frame prepared in this cell has no
# 'Transported' column to fit against.

# Make predictions
test_predictions = clf.predict(X_test)

# Prepare the submission file
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': test_predictions
})

# Save the submission file to the current user's Downloads folder instead of a
# path hard-coded to one Windows account. The folder is created if missing.
# NOTE(review): assumes the home directory of the original author resolves to
# C:\Users\neogi, in which case the target file is unchanged - confirm.
downloads_dir = os.path.join(os.path.expanduser('~'), 'Downloads')
os.makedirs(downloads_dir, exist_ok=True)
submission_file_path = os.path.join(downloads_dir, 'submission.csv')
submission.to_csv(submission_file_path, index=False)

# Print the test predictions
print(submission.head())


FileNotFoundError: [Errno 2] No such file or directory: '/test[1].csv'