Conclusions after data exploration through graphs and checking the dataframe:

- VIP matters
    * ~50% of Non-VIPs are transported
    * ~38% of VIPs are transported
    * Earth implies Non-VIP
    * But its not a high priority parameter to be transported
        * Europe -> ~65% Transported
        * Earth -> ~41% Transported -> HomePlanet with the highest amount of habitantes (3566)
        * Mars -> ~53% Transported
    *~2.4% of passengers are VIPs
    * age <= 17 implies Non-Vip
- Using cabins to infer HomePlanet
    * A B C T -> Europa
    * G -> Earth
    * F -> 60% Earth, 40% Mars
    * E -> 45% Earth, 40% Mars, 15% Europa
    * D -> 60% Mars, 40% Europa
- Cryosleep True or age <= 12 implies no expenses
- Any expense implies Cryosleep False
- Same group -> Same HomePlanet
- Same family -> Same HomePlanet
- Each HomePlanet has a favorite destination:
    * Earth -> PSO
    * Europa -> Cancri
    * Mars -> TRAPPIST
- Cabin T implies Non-Vip 
- Cabin T's destination is TRAPPIST
- Cabin T implies Cryosleep False
- If a person belongs to Mars and is in Cryosleep, then Non-VIP
- If a person is in Cryosleep and is a VIP, then belongs to Europa
    
    

In [46]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

pd.options.mode.chained_assignment = None

In [47]:
cabinDict = {"A": "Europa", "B": "Europa", "C": "Europa", "T": "Europa", "G": "Earth", "F": "Earth", "D": "Mars", "E": "Earth"}
planetDict = {"Earth" : "PSO J318.5-22", "Europa" : "55 Cancri e", "Mars" : "TRAPPIST-1e"} #Preferred destination for each HomePlanet
cabsideDict = {"Earth": "P", "Mars": "P", "Europa": "S"}

class Dataframe:
    def __init__(self, df):
        self.df = df

        self.groupDict = None # maps a group to a set of homeplanets
        self.nameDict = None # maps a surname to a set of homeplanets
        self.CabSideDict = None # maps a cabinside to a set of homeplanets
        
        self.X = None
        


    def setExpensesToZero(self, index):
        self.df['RoomService'][index] = 0 
        self.df['FoodCourt'][index] = 0 
        self.df['ShoppingMall'][index] = 0 
        self.df['VRDeck'][index] = 0 

    def CryoOr12Expenses(self):
        for i in range(len(self.df)):
            if (not pd.isna(self.df['CryoSleep'][i]) and self.df['CryoSleep'][i] == True) or (not pd.isna(self.df['Age'][i]) and self.df['Age'][i] <= 12):
                self.setExpensesToZero(i)


    def checkForExpenses(self, i):
        if (not pd.isna(self.df['RoomService'][i]) and self.df['RoomService'][i] > 0) or (not pd.isna(self.df['FoodCourt'][i]) and self.df['FoodCourt'][i] > 0) or (not pd.isna(self.df['ShoppingMall'][i]) and self.df['ShoppingMall'][i] > 0) or (not pd.isna(self.df['VRDeck'][i]) and self.df['VRDeck'][i] > 0):
            return 0.0
        return 1.0

    def buildCabinColumns(self):
        cabins = []
        cabside = []
        for i in range(len(self.df)):
            if (pd.isna(self.df['Cabin'][i])):
                cabins.append(None)
                cabside.append(None)
            else:
                cabins.append(str(self.df['Cabin'][i][0]))
                cabside.append(str(self.df['Cabin'][i][-1]))
        self.df['Cabin0'] = cabins
        self.df['CabinSide'] = cabside

    def buildDict(self, keyColumn, valueColumn, char, pos):
        dictionary = {}
        for i in range(len(self.df)):
            try:
                current = self.df[keyColumn][i].split(char)[pos]
                if current not in dictionary:
                    dictionary[current] = set()
                if pd.isnull(self.df[valueColumn][i]) == False:
                    dictionary[current].add(self.df[valueColumn][i])
            except AttributeError:
                continue
        return dictionary

    def removeEmptySets(self, dict):
        deletions = []
        for key in dict.keys():
            if len(dict[key])==0:
                deletions.append(key)
        for key in deletions:
            del dict[key]
        return dict

    def formatDictionary(self, dict):
        for el in dict.keys():
            for k in dict[el]:
                dict[el] = k
        return dict

    def buildFormattedDict(self, keyColumn, valueColumn, char, pos):
        dictionary = self.buildDict(keyColumn, valueColumn, char, pos)
        dictionary = self.removeEmptySets(dictionary)
        return self.formatDictionary(dictionary)

    def setGroupDict(self):
        self.groupDict = self.buildFormattedDict('PassengerId', 'HomePlanet', '_', 0)

    def setNameDict(self):
        self.nameDict = self.buildFormattedDict('Name', 'HomePlanet', ' ', 1)

    def setCabSideDict(self):
        self.CabSideDict = self.buildFormattedDict('PassengerId','CabinSide', '_', 0)

    def fillCabSideWFavorite(self):
        for i in range(len(self.df)):
            if (pd.isna(self.df['CabinSide'][i])):
                self.df['CabinSide'][i] = cabsideDict[self.df['HomePlanet'][i]]        

    def getGroup(self, index):
        return self.df['PassengerId'][index].split('_')[0]


    def getSurname(self, index):
        if pd.isna(self.df['Name'][index]):
            return None
        return self.df['Name'][index].split(' ')[1]


    def getCabin(self, index):
        if pd.isna(self.df['Cabin0'][index]):
            return None
        return self.df['Name'][index].split(' ')[1]

    def buildGroupsNSurnames(self):
        groups = []
        surnames = []
        for i in range(len(self.df)):
            groups.append(self.getGroup(i))
            surnames.append(self.getSurname(i))
        self.df['Groups'] = groups
        self.df['Surname'] = surnames

    def fillHomePlanet(self, groupDict, nameDict):
        self.df['HomePlanet'] = self.df['HomePlanet'].fillna(self.df['Groups'].map(self.groupDict)) 
        self.df['HomePlanet'] = self.df['HomePlanet'].fillna(self.df['Surname'].map(self.nameDict))
        self.df['HomePlanet'] = self.df['HomePlanet'].fillna(self.df['Cabin0'].map(cabinDict))

    def fillDestWFavorite(self):
        self.df['Destination'] = self.df['Destination'].fillna(self.df['HomePlanet'].map(planetDict))

    def fillCostsWMedian(self):
        self.df['RoomService'].fillna(self.df['RoomService'].median(), inplace=True)
        self.df['Spa'].fillna(self.df['Spa'].median(), inplace=True)
        self.df['VRDeck'].fillna(self.df['VRDeck'].median(), inplace=True)
        self.df['FoodCourt'].fillna(self.df['FoodCourt'].median(), inplace=True)
        self.df['ShoppingMall'].fillna(self.df['ShoppingMall'].median(), inplace=True)

    def fillWAvgAge(self):
        sumEarth = 0
        indexEarth = 0
        sumEuropa = 0
        indexEuropa = 0
        sumMars = 0
        indexMars= 0

        for i in range(len(self.df)):
            if not pd.isna(self.df['HomePlanet'][i]) and not pd.isna(self.df['Age'][i]):
                if self.df['HomePlanet'][i] == 'Earth':
                    indexEarth += 1
                    sumEarth += self.df['Age'][i]
                elif self.df['HomePlanet'][i] == 'Europa':
                    indexEuropa += 1
                    sumEuropa += self.df['Age'][i]
                elif self.df['HomePlanet'][i] == 'Mars':
                    indexMars += 1
                    sumMars += self.df['Age'][i]

        EarthAgeAvg = sumEarth/indexEarth
        EuropaAgeAvg = sumEuropa/indexEuropa
        MarsAgeAvg = sumMars/indexMars

        for i in range(len(self.df)):
            if pd.isna(self.df['Age'][i]):
                if pd.isna(self.df['HomePlanet'][i]):
                    continue
                elif self.df['HomePlanet'][i] == "Earth":
                    self.df['Age'][i] = EarthAgeAvg
                elif self.df['HomePlanet'][i] == "Europa":
                    self.df['Age'][i] = EuropaAgeAvg
                else:
                    self.df['Age'][i] = MarsAgeAvg

    def formatTransported(self):
        self.df['Transported'] = self.df['Transported'].map({False: 0, True: 1})

    def formatHomePlanet(self):   
        self.df['Earth'] = np.select([self.df['HomePlanet'] == 'Earth'], [1], default = 0)
        self.df['Europa'] = np.select([self.df['HomePlanet'] == 'Europa'], [1], default = 0)
        self.df['Mars'] = np.select([self.df['HomePlanet'] == 'Mars'], [1], default = 0)

    def buildCryoColumn(self):
        cryocolumn = []
        for i in range (len(self.df)):
            if pd.isna(self.df['CryoSleep'][i]) and (not pd.isna(self.df['Age'][i]) and self.df['Age'][i] > 12):
                cryocolumn.append(self.checkForExpenses(i))
            elif self.df['CryoSleep'][i] == True:
                cryocolumn.append(1)
            else:
                cryocolumn.append(0)
        self.df['CryoSleep'] = cryocolumn

    def formatDestination(self):
        self.df['TRAP'] = np.select([self.df['Destination'] == 'TRAPPIST-1e'], [1], default = 0)
        self.df['PSO'] = np.select([self.df['Destination'] == 'PSO J318.5-22'], [1], default = 0)
        self.df['Cancri'] = np.select([self.df['Destination'] == '55 Cancri e'], [1], default = 0)

    def formatVip(self):
        self.df['VIP'] = np.select([self.df['VIP'] == True], [1], default = 0)

    def formatCabinSide(self):
        self.df['CabinSide'] = self.df['CabinSide'].map({'S': 0, 'P': 1})

    def buildX(self):
        self.X = np.array(self.df.drop(columns=['Name', 'Surname', 'Cabin', 'Cabin0', 'HomePlanet', 'Destination', 'Groups', 'PassengerId','Transported'], axis=1))

    def buildY(self):
        self.y = np.array(self.df['Transported'])

    
    def DataTreatment(self):
        self.buildCabinColumns()

        self.setGroupDict()
        self.setNameDict()
        self.setCabSideDict()
        
        self.buildGroupsNSurnames()
        self.CryoOr12Expenses()

        self.fillHomePlanet(self.groupDict, self.nameDict)
        self.fillDestWFavorite()

        self.fillCostsWMedian()

        self.fillWAvgAge()

        self.formatHomePlanet()
        self.buildCryoColumn()


        self.formatVip()
        self.formatDestination()
        self.fillCabSideWFavorite()
        self.formatCabinSide()
        
class Model(Dataframe):
    def __init__(self, df):
        super().__init__(df)
        self.y = None
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None
        self.predictions = None

    def buildTestAndTrainArrays(self, testSize = 0.4):
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.X, self.y, random_state = 13, test_size = testSize)

    def processRandomForest(self):
        self.predictions = cross_val_predict(RandomForestClassifier(random_state = 1), self.x_train, self.y_train, cv = 10, n_jobs=-1)

    def processMachineLearning(self):
        self.buildX()
        self.buildY()
        self.buildTestAndTrainArrays()
        self.processRandomForest()

    def computeAccuracy(self):
        accuracy = round(accuracy_score(self.y_train, self.predictions)*100, 2)
        return accuracy

    

class TestDF(Dataframe):
    def __init__(self, df):
        super().__init__(df)

    def buildX(self):
        self.X = np.array(self.df.drop(columns=['Name', 'Surname', 'Cabin', 'Cabin0', 'HomePlanet', 'Destination', 'Groups', 'PassengerId'], axis=1))
        
        


In [48]:
model = Model(pd.read_csv("train.csv"))
model.DataTreatment()
model.processMachineLearning()
print(model.computeAccuracy())

78.12


In [49]:
#Submission block
full_data_model = RandomForestClassifier(random_state = 13)
full_data_model.fit(model.X, model.y) #using entire train dataset


testDF = TestDF(pd.read_csv('test.csv'))
testDF.DataTreatment()
testDF.buildX()

passId = testDF.df['PassengerId'] 
test_predictions = full_data_model.predict(testDF.X)
output = pd.DataFrame({'PassengerId': passId, 'Transported': test_predictions})
output['Transported'] = output['Transported'].astype(bool)
output
output.to_csv('submission.csv', index=False)