In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the raw training and test splits from the current working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Derive two numeric features from PassengerId on both splits: the first four
# characters become 'Group' and the last two become 'PeopleId'.
for frame in (train_data, test_data):
    passenger_id = frame['PassengerId']
    frame['Group'] = passenger_id.str[:4].astype(float)
    frame['PeopleId'] = passenger_id.str[-2:].astype(float)

# The raw identifier is no longer needed once the parts are extracted.
train_data = train_data.drop(columns='PassengerId')
test_data = test_data.drop(columns='PassengerId')

In [None]:
# Tag every row with its origin so the combined frame can be split again later.
train_data['Data'] = "train"
test_data['Data'] = "test"

# ignore_index=True gives the stacked frame a fresh, unique 0..n-1 index.
# Without it each split keeps its own labels, so labels shared by both splits
# appear twice and index-aligned operations (fillna with a Series, concat on
# axis=1, .loc by label) become ambiguous or raise.
all_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

# Encode boolean values as 0/1 across the whole frame (returns a new frame
# instead of mutating in place, so re-running the cell is safe).
all_data = all_data.replace({False: 0, True: 1})

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Only the five spend columns are scaled. Passing the whole frame to the
# scaler fails because all_data also holds string columns (e.g. 'Data'), and
# the result would not match the five column names given below anyway.
spend_columns = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

scaler = MinMaxScaler()
# Per the scikit-learn docs MinMaxScaler disregards NaN when fitting and keeps
# it in the transformed output, so missing spend values stay missing here.
x = pd.DataFrame(
    scaler.fit_transform(all_data[spend_columns]),
    columns=spend_columns,
    index=all_data.index,  # keep rows addressable the same way as in all_data
)

In [None]:
def my_analysis(dataset):
    """Build a one-row-per-column summary of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to profile.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``dataset`` with these columns:
        'Datatype', 'NA values %', 'Unique values', 'frequent value'
        (most common non-null value), 'contribution %' (share of all rows
        taken by that value), 'min value' and 'max value' (the string
        'none' for object-dtype columns).

    Notes
    -----
    A column with no non-null values (or a frame with zero rows) reports
    ``None`` as its frequent value and 0.0 as its contribution instead of
    raising ``IndexError``.
    """
    summary_columns = ['Datatype', 'NA values %', 'Unique values',
                       'frequent value', 'contribution %',
                       'min value', 'max value']
    length = len(dataset.index)
    records = []
    for col in dataset.columns:
        series = dataset[col]
        counts = series.value_counts()  # sorted by frequency, NaN excluded
        is_object = series.dtype == 'object'

        if counts.empty or length == 0:
            # Nothing to rank: the column is entirely missing or has no rows.
            top_value, top_share = None, 0.0
        else:
            top_value = counts.index[0]
            top_share = round(counts.iloc[0] * 100 / length, 2)

        records.append({
            'Datatype': series.dtype,
            'NA values %': round(series.isna().sum() * 100 / length, 2) if length else 0.0,
            'Unique values': series.nunique(),
            'frequent value': top_value,
            'contribution %': top_share,
            'min value': 'none' if is_object else series.min(),
            'max value': 'none' if is_object else series.max(),
        })

    # Build the frame once from the collected records rather than growing it
    # with concat on every iteration.
    return pd.DataFrame(records, index=list(dataset.columns), columns=summary_columns)

In [None]:
# Profile every column of the combined frame: dtype, share of missing values,
# cardinality, most frequent value and min/max.
my_analysis(all_data)

In [None]:
# Age distribution plus mean spend per age bucket for each amenity.
spend_columns = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

# .copy() so the AgeGroup column is added to an independent frame rather than
# to a slice of all_data (avoids SettingWithCopyWarning).
billing_data = all_data[['Age','HomePlanet','VIP'] + spend_columns].copy()

bin_edges = [0.0,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0]
bin_names = ['0-10','10-20','20-30','30-40','40-50','50-60','60-70','70-80']
# pd.cut builds right-closed intervals by default, so the first bucket is
# (0, 10] and an Age of exactly 0 would fall outside every bucket.
# include_lowest=True closes the first interval on the left as well.
billing_data['AgeGroup'] = pd.cut(
    billing_data['Age'], bin_edges, labels=bin_names, include_lowest=True
)

# Explicit axes instead of the pyplot state machine: panel 1 is the age
# histogram, panels 2-6 are one bar plot per spend column.
fig, axes = plt.subplots(2, 3, figsize=(15,8))
panels = axes.ravel()
sns.histplot(billing_data['Age'], kde=True, ax=panels[0])
for panel, spend_column in zip(panels[1:], spend_columns):
    sns.barplot(x='AgeGroup', y=spend_column, data=billing_data, ax=panel)
fig.tight_layout()

In [None]:
# 2x3 grid: VIP head-count per home planet, then mean spend per home planet
# (split by VIP flag) for each of the five amenities.
plt.figure(figsize=(12,6))

vip_axis = plt.subplot(2,3,1)
vip_axis.set_title('Number of VIPs')
vip_counts = pd.crosstab(all_data['HomePlanet'], all_data['VIP'])
vip_counts.plot(kind='bar', ax=vip_axis, rot=0)

amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for position, amenity in enumerate(amenities, start=2):
    # plt.subplot makes the new panel current, so seaborn draws into it.
    panel = plt.subplot(2,3,position)
    panel.set_title(amenity)
    sns.barplot(x="HomePlanet", y=amenity, hue="VIP", data=all_data)

plt.tight_layout()

In [None]:
# Feature frame used to predict missing spend values: one row per row of
# all_data, with Age / HomePlanet / VIP gaps filled so the model always gets
# a complete feature vector.
# .copy() detaches the frame from all_data, and the fills are assigned back
# explicitly: a chained `test[col].fillna(..., inplace=True)` acts on a
# temporary object and leaves `test` unchanged under pandas Copy-on-Write.
test = all_data[['Age','HomePlanet','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']].copy()

test['Age'] = test['Age'].fillna(all_data['Age'].median())
test['HomePlanet'] = test['HomePlanet'].fillna(all_data['HomePlanet'].mode()[0])
test['VIP'] = test['VIP'].fillna(all_data['VIP'].mode()[0])

# One indicator column per home planet, appended next to the original columns.
df = pd.get_dummies(test['HomePlanet'])
test = pd.concat([test, df], axis=1)

In [None]:
# Imports are done once for the cell instead of on every loop iteration.
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# For each spend column, fit a regressor on the rows where the column and all
# predictors are known, then fill the column's missing entries with the
# model's predictions.
fillcolumns = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
feature_columns = ['Age','VIP','Earth','Europa','Mars']

# Prediction features are the same for every target, so build them once.
# astype(float) gives the model plain numeric input regardless of whether
# VIP / the planet indicators are stored as object or bool.
x_test = test[feature_columns].astype(float)

for column in fillcolumns:
    columnlist = ['Age','HomePlanet','VIP', column]
    # dropna() returns a new frame; no in-place mutation of a slice.
    train = all_data[columnlist].dropna()

    df = pd.get_dummies(train['HomePlanet'])
    train = pd.concat([train, df], axis=1)

    y_data = train[column]
    # Select the features by name so training and prediction columns are
    # guaranteed to match in content and order.
    x_data = train[feature_columns].astype(float)

    x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.1, random_state=42)

    xgb = XGBRegressor(n_estimators=500, early_stopping_rounds=5)
    # verbose=False suppresses the per-round evaluation log.
    xgb.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=False)

    result = xgb.predict(x_test)

    # Write predictions back positionally into the missing slots only.
    # Assumes `test` still has exactly one row per row of all_data, in the
    # same order (it is built from all_data above) -- TODO confirm if the
    # cells in between are ever changed.
    # A chained `all_data[column].fillna(..., inplace=True)` would not update
    # all_data under pandas Copy-on-Write.
    missing = all_data[column].isna().to_numpy()
    all_data.loc[missing, column] = result[missing]

In [None]:
# Count the remaining missing values per column after the spend imputation.
all_data.isna().sum()

In [None]:
# Break the Cabin string into three features by position: first character ->
# Deck, last character -> Side, and everything between the third character
# and the second-to-last one -> Num (as a float). Name is dropped afterwards.
cabin = all_data['Cabin']
all_data = all_data.assign(
    Deck=cabin.str[0],
    Num=cabin.str[2:-2].astype(float),
    Side=cabin.str[-1],
).drop(columns='Name')

In [None]:
# Fill the remaining gaps in these columns with each column's most frequent
# value (mode() returns the modes sorted, so [0] is a deterministic pick).
# The result is assigned back to the column: a chained
# `all_data[col].fillna(..., inplace=True)` operates on a temporary Series and
# does not modify all_data under pandas Copy-on-Write.
# NOTE(review): Age is filled with the mode here but with the median when the
# imputation features are built earlier -- confirm which one is intended.
for col in ['HomePlanet','CryoSleep','Destination','Age','VIP']:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

In [None]:
# Columns to one-hot encode. 'Deck' and 'Side' are currently not included.
categorical_columns = [
    'HomePlanet',
    'Destination',
]

# Each listed column is replaced by one indicator column per category.
all_data = pd.get_dummies(data=all_data, columns=categorical_columns)

In [None]:
# Separate the combined frame back into its two splits using the 'Data' tag,
# then remove the tag. The test split also loses the 'Transported' column.
is_train = all_data['Data'].eq("train")
is_test = all_data['Data'].eq("test")

train_data = all_data.loc[is_train].drop(columns='Data')
test_data = all_data.loc[is_test].drop(columns=['Data', 'Transported'])

In [None]:
# Training rows without a Cabin value are discarded; the raw Cabin column is
# then removed from both splits (test rows are all kept).
train_data = train_data.dropna(subset=['Cabin']).drop(columns='Cabin')
test_data = test_data.drop(columns='Cabin')

In [None]:
# Persist both cleaned splits as CSV without the index column.
outputs = (
    (train_data, 'cleaned_train.csv'),
    (test_data, 'cleaned_test.csv'),
)
for cleaned_frame, csv_path in outputs:
    cleaned_frame.to_csv(csv_path, index=False)