In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.

CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

Destination - The planet the passenger will be debarking to.

Age - The age of the passenger.

VIP - Whether the passenger has paid for special VIP service during the voyage.

RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

Name - The first and last names of the passenger.

Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [None]:
import re
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Paths to the competition's training and test CSV files.
# (test_file previously pointed at train.csv by mistake.)
train_file='/kaggle/input/spaceship-titanic/train.csv'
test_file='/kaggle/input/spaceship-titanic/test.csv'

In [None]:
# Reading train file
df=pd.read_csv(train_file)
df.head()

In [None]:
#df columns and types
df.dtypes

In [None]:
df.isna().sum()

In [None]:
df.Destination.value_counts()

In [None]:
df.CryoSleep.value_counts()

In [None]:
def process_df(df):
    """Return a feature-engineered copy of a raw passenger frame, indexed by PassengerId.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with the columns PassengerId, HomePlanet, CryoSleep, Cabin,
        Destination, VIP, RoomService, FoodCourt, ShoppingMall, Spa and VRDeck.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by PassengerId in which:
        * missing values in the five spending columns are replaced by 0;
        * Deck / Cabin_num / Port hold the first character, the first run of
          digits (kept as a string) and the last character of Cabin, with
          non-string cabins (e.g. NaN) passed through unchanged;
        * HomePlanet and Destination are cast to the 'category' dtype;
        * CryoSleep and VIP are mapped from True/False to 1/0;
        * group holds the first four characters of PassengerId;
        * Total_Spending is the sum of the five spending columns.

    The input frame is left unmodified.
    """
    # Work on a copy so the caller's raw frame stays intact and the calling
    # cell can be re-run without operating on already-transformed data.
    df=df.copy()

    spend_cols=['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
    df[spend_cols]=df[spend_cols].fillna(0)

    def first_number(cabin):
        # A cabin string without any digits yields NaN instead of raising IndexError.
        found=re.search('[0-9]+', cabin)
        return found.group(0) if found else float('nan')

    cabins=df['Cabin']
    df['Deck']=[c[:1] if isinstance(c,str) else c for c in cabins]
    df['Cabin_num']=[first_number(c) if isinstance(c,str) else c for c in cabins]
    df['Port']=[c[-1:] if isinstance(c,str) else c for c in cabins]

    df['HomePlanet']=df['HomePlanet'].astype('category')
    df['Destination']=df['Destination'].astype('category')
    df['CryoSleep']=df['CryoSleep'].map({True:1,False:0})
    df['VIP']=df['VIP'].map({True:1,False:0})

    df['group']=[pid[0:4] for pid in df['PassengerId']]
    df['Total_Spending']=df['RoomService']+df['FoodCourt']+df['ShoppingMall']+ df['Spa']+ df['VRDeck']

    return df.set_index('PassengerId')

In [None]:
df=process_df(df)
df.head()

In [None]:
df.Cabin_num.dtype

In [None]:
df.Deck.value_counts()

In [None]:
fig,ax=plt.subplots(3,2,figsize=(12,9))
sns.countplot(data=df,x='HomePlanet', hue='Transported', ax=ax[0,0])
sns.countplot(data=df,x='Destination', hue='Transported', ax=ax[0,1])
sns.countplot(data=df,x='CryoSleep', hue='Transported', ax=ax[1,0])
sns.countplot(data=df,x='VIP', hue='Transported', ax=ax[1,1])
sns.countplot(data=df,x='Deck', hue='Transported', ax=ax[2,0])
sns.countplot(data=df,x='Port', hue='Transported', ax=ax[2,1])

    

In [None]:
sns.kdeplot(data=df,x='Total_Spending', hue="Transported")

In [None]:
sns.kdeplot(data=df,x="Age", hue='Transported')

In [None]:
sns.scatterplot(data=df,x='RoomService', y='FoodCourt', hue='Transported')

In [None]:

sns.scatterplot(data=df,x='RoomService', y='VRDeck', hue='Transported')

In [None]:
sns.scatterplot(data=df,x='Spa', y='VRDeck', hue='Transported')

In [None]:
sns.pairplot(df[df.CryoSleep==0][['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','Transported','Total_Spending']],hue='Transported')

In [None]:
df_no_cryo=df[df.CryoSleep==0]

In [None]:
df.groupby(['CryoSleep','VIP','Transported']).aggregate('count')

Passengers who were both in CryoSleep and VIP were transported 100% of the time.

In [None]:
df.groupby(['Deck','VIP','Transported']).aggregate('count')

In [None]:
df.pivot_table(index=['CryoSleep','VIP','HomePlanet','Transported'])

In [None]:
df.CryoSleep.value_counts()

In [None]:
important_cols=['HomePlanet','CryoSleep','Spa','VRDeck','Destination', 'Age','VIP','FoodCourt',
               'Total_Spending','Transported','group']

In [None]:

# Keep only the modelling columns. Take an explicit copy so that the later
# column assignments and in-place drops act on an independent frame rather
# than on a selection of the processed frame (avoids SettingWithCopyWarning).
df=df[important_cols].copy()

df.isna().sum()

In [None]:
df.describe()

In [None]:
df.pivot_table(index=['group','HomePlanet'], aggfunc='count')

In [None]:
df.pivot_table(index=['group','Destination'], aggfunc='count')

In [None]:
destination_mode=df['Destination'].mode()[0]
homeplanet_mode=df['HomePlanet'].mode()[0]

In [None]:
#function to fill nulls by groups 
def fill_homeplanet(col_group,col):
    """Return `col` if it is present, otherwise a HomePlanet borrowed from the same group.

    col_group: the passenger's group id, compared against the global df's `group` column.
    col: the passenger's current HomePlanet value (NaN when missing).

    When `col` is missing, the first non-missing HomePlanet found among rows of
    the global `df` with the same group is returned; if the group has none, the
    global `homeplanet_mode` is returned.
    """
    # NaN is the only value that does not compare equal to itself.
    if col==col:
        return col

    group_planets=df.loc[df.group==col_group,'HomePlanet'].values
    known_planets=[planet for planet in group_planets if planet==planet]
    return known_planets[0] if len(known_planets)>0 else homeplanet_mode
            

In [None]:
df[df.group=='0119']

In [None]:
df['HomePlanet']=list(map(fill_homeplanet, df['group'],df['HomePlanet']))

In [None]:
#function to fill null destinations by groups 
def fill_destination(col_group,col):
    """Return `col` if it is present, otherwise a Destination borrowed from the same group.

    col_group: the passenger's group id, compared against the global df's `group` column.
    col: the passenger's current Destination value (NaN when missing).

    When `col` is missing, the first non-missing Destination found among rows of
    the global `df` with the same group is returned; if the group has none, the
    global `destination_mode` is returned.
    """
    # NaN is the only value that does not compare equal to itself.
    if col==col:
        return col

    group_destinations=df.loc[df.group==col_group,'Destination'].values
    known_destinations=[dest for dest in group_destinations if dest==dest]
    return known_destinations[0] if len(known_destinations)>0 else destination_mode

In [None]:
df['Destination']=list(map(fill_destination, df['group'],df['Destination']))

In [None]:
df.isna().sum()

In [None]:
df.dtypes

In [None]:
age_mean=df.Age.mean()

In [None]:
# Assign the filled column back rather than calling fillna(inplace=True) on the
# df['Age'] selection: the in-place call acts on an intermediate object and may
# leave df unchanged (chained assignment under pandas copy-on-write).
df['Age']=df['Age'].fillna(age_mean)

In [None]:
df.dtypes

In [None]:
df.groupby('HomePlanet').CryoSleep.value_counts(1)

In [None]:
df.groupby(['HomePlanet','Destination']).CryoSleep.value_counts(1)

In [None]:
cryosleep_mode=df.CryoSleep.mode()[0]
df.CryoSleep=df.CryoSleep.fillna(cryosleep_mode)

In [None]:
df.isna().sum()

In [None]:
df.dtypes

In [None]:
df.groupby(['HomePlanet','Destination']).VIP.value_counts(1)

In [None]:
# Majority of travellers aren't VIP
df.VIP=df.VIP.fillna(0)

In [None]:
df.drop(columns=['group'], inplace=True)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [None]:
df.isna().sum()

In [None]:
df.dtypes

In [None]:
# Expand the categorical columns of df into indicator columns.
df=pd.get_dummies(df)
# Scale every feature (all columns except the target) with a MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed in a later cell, so held-out rows influence the fitted min/max —
# consider fitting on the training split only.
scaler=MinMaxScaler()
X=scaler.fit_transform(df.drop('Transported', axis=1))
# Target vector: the Transported column.
Y=df.Transported

In [None]:
X

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,Y, random_state=22, test_size=0.2)

In [None]:
from sklearn.model_selection import cross_val_score


In [None]:
def Val_score(n_neighbors):
    '''
    Cross-validate a KNeighborsClassifier for each candidate neighbour count.

    n_neighbors: iterable of values to try for the classifier's n_neighbors.

    Returns a pair of lists (avg, std) holding, for each candidate in order,
    the mean and the standard deviation of its 10-fold cross-validation
    scores on the global X_train / y_train.
    '''
    # One array of 10 fold scores per candidate value.
    fold_scores = [
        cross_val_score(KNeighborsClassifier(n_neighbors = k), X = X_train, y = y_train, cv = 10)
        for k in n_neighbors
    ]

    avg = [scores.mean() for scores in fold_scores]
    std = [scores.std() for scores in fold_scores]
    return avg, std

In [None]:
n_neighbors = range(1,50)
mean, std = Val_score(n_neighbors)

In [None]:
plt.plot(n_neighbors, mean, color = 'green', label = 'mean' )
#plt.plot(n_neighbors, std, color = 'blue', label = 'mean' )
plt.xlabel('n_neighbors')
plt.ylabel('Mean Score')
plt.title('Mean Validation score')

In [None]:
knn=KNeighborsClassifier(n_neighbors=10)

In [None]:
knn.fit(X_train,y_train)

In [None]:
knn.score(X_test,y_test)

In [None]:
y_pred=knn.predict(X_test)


In [None]:
print(classification_report(y_test,y_pred))

In [None]:
test_file='/kaggle/input/spaceship-titanic/test.csv'
df_test=pd.read_csv(test_file)

In [None]:
df_test.isna().sum()

In [None]:
# The test set has no target column. Rebuild the list instead of calling
# .remove(), which raises ValueError when this cell is run a second time.
important_cols=[c for c in important_cols if c!='Transported']

In [None]:
df_test=process_df(df_test)


In [None]:
# Keep only the modelling columns; copy so the later column assignments and the
# in-place drop act on an independent frame rather than on a selection.
df_test=df_test[important_cols].copy()

In [None]:
df_test.isna().sum()

In [None]:
df_test

In [None]:
#function to fill null destinations by groups 
def fill_destination2(col_group,col):
    """Return `col` if it is present, otherwise a Destination borrowed from the same group.

    Same logic as fill_destination, but looks the group up in the global
    `df_test` frame. Falls back to the global `destination_mode` when no row of
    the group has a known Destination.
    """
    # NaN is the only value that does not compare equal to itself.
    if col==col:
        return col

    group_destinations=df_test.loc[df_test.group==col_group,'Destination'].values
    known_destinations=[dest for dest in group_destinations if dest==dest]
    return known_destinations[0] if len(known_destinations)>0 else destination_mode

In [None]:
#function to fill nulls by groups 
def fill_homeplanet2(col_group,col):
    """Return `col` if it is present, otherwise a HomePlanet borrowed from the same group.

    Same logic as fill_homeplanet, but looks the group up in the global
    `df_test` frame. Falls back to the global `homeplanet_mode` when no row of
    the group has a known HomePlanet.
    """
    # NaN is the only value that does not compare equal to itself.
    if col==col:
        return col

    group_planets=df_test.loc[df_test.group==col_group,'HomePlanet'].values
    known_planets=[planet for planet in group_planets if planet==planet]
    return known_planets[0] if len(known_planets)>0 else homeplanet_mode
            

In [None]:
df_test['HomePlanet']=list(map(fill_homeplanet2, df_test['group'],df_test['HomePlanet']))
df_test['Destination']=list(map(fill_destination2, df_test['group'],df_test['Destination']))
df_test.CryoSleep=df_test.CryoSleep.fillna(cryosleep_mode)
df_test.VIP=df_test.VIP.fillna(0)
df_test.Age=df_test.Age.fillna(age_mean)
df_test.isna().sum()
df_test.drop(columns=['group'], inplace=True)

In [None]:
df_test

In [None]:
df_test=pd.get_dummies(df_test)
# Align the indicator columns with the training feature matrix: a category
# absent from the test set would otherwise shift or drop a column. Missing
# indicators are filled with 0, and the column order matches what the scaler
# and the KNN model were fitted on.
train_features=df.drop('Transported', axis=1).columns
df_test=df_test.reindex(columns=train_features, fill_value=0)
# Reuse the scaler fitted on the training features. Fitting a fresh
# MinMaxScaler on the test set would put the test features on a different
# scale from the one the model was trained with.
X=scaler.transform(df_test)


In [None]:
X.shape

In [None]:
y_pred=knn.predict(X)

In [None]:
df_test

In [None]:
df_test.shape, len(y_pred)

In [None]:
df_submission=df_test.copy()

In [None]:
df_submission['Transported']=y_pred

In [None]:
df_submission=df_submission[['Transported']]

In [None]:
df_submission

In [None]:
df_submission.to_csv('submission1.csv')