In [1]:
! pip install imblearn

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple

In [6]:
from IPython.testing import test
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

class DatasetName:
    """
    Wraps the raw training dataframe and the preprocessing applied to it.

    Instances are meant to be created through :meth:`load`; :meth:`prepare`
    turns the raw frame into an encoded train / hold-out split.
    """

    data: pd.DataFrame

    # prefer the ``load`` factory method over calling the constructor directly
    def __init__(self, data: pd.DataFrame):
        self.data = data

    @classmethod
    def load(cls):
        """
        reads the training CSV from ``data/spaceship_titanic_train.csv``
        :return: a dataset instance (of ``cls``) wrapping the raw dataframe
        """
        data = pd.read_csv("data/spaceship_titanic_train.csv")
        # Build through ``cls`` so subclasses receive instances of themselves.
        return cls(data)

    @staticmethod
    def _encode_labels(train, test, columns):
        """
        integer-encodes categorical columns with a vocabulary learned on train only

        Values are compared as strings. Codes are the positions of the sorted
        distinct train values; a test value that never occurs in train gets -1,
        so equal categories always share the same code in both frames.
        Both frames are modified in place and returned.
        :param train: frame whose values define the vocabulary
        :param test: frame encoded with the train vocabulary
        :param columns: names of the columns to encode
        :return: (train, test)
        """
        for col in columns:
            # map(str) also turns missing values into the literal "nan" category
            train_values = train[col].map(str)
            test_values = test[col].map(str)
            classes = pd.Index(sorted(train_values.unique()))
            train[col] = classes.get_indexer(train_values)
            test[col] = classes.get_indexer(test_values)
        return train, test

    def prepare(self) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
        """
        does preprocessing with dataset, feature generation, etc.

        ``self.data`` is left untouched; all edits happen on copies.
        :return: (train_x, train_y, test_x, test_y)
        """

        # Work on a copy so repeated calls always start from the raw frame.
        data = self.data.copy()
        data["HomePlanet"] = data["HomePlanet"].fillna("Unknown")

        y = data["Transported"]
        X = data.drop(["Transported", "Name"], axis=1)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=.3, random_state=1729, stratify=y)
        # Own the split frames before assigning columns into them.
        X_train = X_train.copy()
        X_test = X_test.copy()

        # filling the null values with the median learned on the train split
        imputer_cols = ["Age", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "RoomService"]
        imputer = SimpleImputer(strategy="median")
        imputer.fit(X_train[imputer_cols])
        X_train[imputer_cols] = imputer.transform(X_train[imputer_cols])
        X_test[imputer_cols] = imputer.transform(X_test[imputer_cols])

        # label encoding with one shared vocabulary per column
        label_cols = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP"]
        X_train, X_test = self._encode_labels(X_train, X_test, label_cols)

        # Ids are encoded against the ids of the whole frame so that a code
        # refers to the same id in both splits (a train-only vocabulary would
        # mark every id that is absent from train as unseen).
        id_classes = pd.Index(sorted(X["PassengerId"].map(str).unique()))
        X_train["PassengerId"] = id_classes.get_indexer(X_train["PassengerId"].map(str))
        X_test["PassengerId"] = id_classes.get_indexer(X_test["PassengerId"].map(str))

        return X_train, y_train, X_test, y_test

In [5]:
# Load the CSV and unpack the prepared split: features/labels for training,
# then features/labels for the hold-out part.
train_x, train_y, test_x, test_y = DatasetName.load().prepare()

In [19]:
# Show the hold-out labels (fourth item unpacked from DatasetName.prepare()).
print(test_y)

4871    False
2621    False
5545     True
2619     True
4378    False
        ...  
270      True
557      True
2113     True
7452    False
4506    False
Name: Transported, Length: 1739, dtype: bool


In [8]:
from sklearn.ensemble import RandomForestClassifier

# Small, shallow forest. The fixed seed makes the fit, and therefore y_pred,
# reproducible across kernel restarts.
random_forest = RandomForestClassifier(
    n_estimators=10, criterion="gini", max_depth=7, random_state=1729
)
random_forest.fit(train_x, train_y)
# Predictions for the hold-out features unpacked from DatasetName.prepare().
y_pred = random_forest.predict(test_x)

In [9]:
# To check the accuracy, upload the submission.csv file to Kaggle:
# https://www.kaggle.com/competitions/spaceship-titanic/data

X_test_id = pd.read_csv("spaceship_titanic/sample_submission.csv")

# y_pred is predicted on the hold-out split of the training file, so it only
# lines up with the submission ids if both have the same number of rows.
# Fail with an actionable message instead of an opaque pandas length error.
# NOTE(review): equal lengths do not prove the rows refer to the same
# passengers; predictions should come from the Kaggle test file. Confirm.
if len(y_pred) != len(X_test_id):
    raise ValueError(
        f"y_pred has {len(y_pred)} rows but sample_submission.csv lists "
        f"{len(X_test_id)} passengers; predict on the Kaggle test set "
        "before writing a submission"
    )

submission = pd.DataFrame(
    {"PassengerId": X_test_id["PassengerId"], "Transported": y_pred},
    columns=["PassengerId", "Transported"],
)

submission.to_csv("submission.csv", index=False)