In [None]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Load the labelled training set from the mounted Drive folder.
train_csv_path = '/content/drive/MyDrive/12th Grade/Machine Learning/Data/Spaceship Titanic/spaceshiptrain.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# Column dtypes of the raw training frame.
df.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [None]:
# Preview the first rows of the raw training frame.
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:
# Share of passengers with zero RoomService AND zero FoodCourt spending.
# Rows where either amount is missing compare as False and so count as "not zero".
# .mean() of the boolean mask divides by the frame's actual length instead of a
# hard-coded row count, so the cell stays correct if the data changes.
((df['RoomService'] == 0) & (df['FoodCourt'] == 0)).mean()


0.46037041297595765

In [None]:
# Number of rows with no recorded RoomService amount.
df['RoomService'].isna().sum()

181

In [None]:
# Missing-value count for every column.
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [None]:
# (rows, columns) of the raw training frame.
df.shape

(8693, 14)

In [None]:
# Overall share of transported passengers (class balance of the target).
# .mean() on the boolean column replaces the division by a hard-coded row count.
df['Transported'].mean()

0.5036236051995858

In [None]:
# Transported rate among VIP passengers.
# eq(True) yields a clean boolean mask (missing VIP values compare as False), and
# .mean() divides by the number of selected rows instead of a hard-coded VIP count.
df.loc[df['VIP'].eq(True), 'Transported'].mean()

0.38190954773869346

In [None]:
# Number of passengers recorded as being in CryoSleep.
df['CryoSleep'].sum()

3037

In [None]:
# Transported rate among passengers recorded as being in CryoSleep.
# eq(True) treats missing CryoSleep values as False, and .mean() divides by the
# number of selected rows instead of a hard-coded CryoSleep count.
df.loc[df['CryoSleep'].eq(True), 'Transported'].mean()

0.8175831412578202

In [None]:
# Transported rate per HomePlanet, with unknown planets grouped under 'missing'.
# Only the grouping key is filled, so numeric columns are not overwritten with a
# string, and the grouped mean replaces the manual sum / value_counts division.
df.groupby(df['HomePlanet'].fillna('missing'))['Transported'].mean()

HomePlanet
Earth      0.423946
Europa     0.658846
Mars       0.523024
missing    0.512438
dtype: float64

In [None]:
# Passenger count per HomePlanet (missing values are not counted).
df['HomePlanet'].value_counts()

Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64

In [None]:
# Re-check the column dtypes before feature engineering (same view as the earlier dtypes cell).
df.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [None]:
# Column names available for feature engineering.
df.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [None]:
# Work on a copy so the raw frame `df` is not modified by the feature engineering below.
df2 = df.copy()

In [None]:
# PassengerId looks like "0003_02": split once on '_' and keep both parts as integers.
id_parts = df2['PassengerId'].str.split('_')
df2['Group'] = id_parts.str[0].astype(int)
df2['Num'] = id_parts.str[1].astype(int)

In [None]:
# Cabin looks like "B/0/P": split once on '/' into deck, cabin number and side.
# Rows without a Cabin get the label 'missing' for Deck and Side.
cabin_parts = df2['Cabin'].str.split('/')
df2['Deck'] = cabin_parts.str[0].fillna('missing')
# NOTE(review): missing cabin numbers are replaced by the constant 2 - confirm this placeholder is intended.
df2['CabNum'] = cabin_parts.str[1].fillna(2).astype(int)
df2['Side'] = cabin_parts.str[2].fillna('missing')

In [None]:
# Fill unknown home planets with 'Mars'.
# NOTE(review): Mars is not the most frequent planet (Earth is); presumably chosen because the
# transported rate of the 'missing' group (~0.51) is closest to Mars (~0.52) - confirm.
df2['HomePlanet'] = df2['HomePlanet'].fillna('Mars')

In [None]:
def fill_condition(row):
    """Return the CryoSleep value for one passenger row, inferring it when missing.

    A recorded CryoSleep value is returned unchanged. When it is missing, the
    passenger is treated as being in cryosleep only if every *recorded*
    spending amount is zero. Missing amounts are skipped instead of being
    added: adding NaN makes the total NaN, and ``NaN == 0`` is False, which
    would label any row with a single unknown amount as awake.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide 'CryoSleep', 'RoomService', 'FoodCourt', 'ShoppingMall',
        'Spa' and 'VRDeck'.

    Returns
    -------
    The recorded CryoSleep value, or a bool inferred from the spending columns.
    """
    if not pd.isnull(row['CryoSleep']):
        return row['CryoSleep']

    spend_columns = ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck')
    known_spend = [row[col] for col in spend_columns if not pd.isnull(row[col])]
    if not known_spend:
        # No spending information at all: keep the previous fallback of False.
        return False
    # bool() converts a possible numpy.bool_ into a plain Python bool.
    return bool(sum(known_spend) == 0)
# Fill missing CryoSleep values row by row with fill_condition; recorded values are kept as they are.
df2['CryoSleep'] = df2.apply(fill_condition, axis=1)

In [None]:
# Treat passengers with unknown VIP status as non-VIP.
df2['VIP'] = df2['VIP'].fillna(False)

In [None]:
# Collapse the five spending columns into two totals.
# A missing amount in any component makes that row's total NaN; the constant
# imputer in the column transformer later replaces those NaN totals with 0.
df2['Luxuries'] = df2['RoomService'].add(df2['Spa']).add(df2['VRDeck'])
df2['Necessities'] = df2['FoodCourt'].add(df2['ShoppingMall'])

In [None]:
# Preview the engineered columns (Group, Num, Deck, CabNum, Side, Luxuries, Necessities).
df2.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,VRDeck,Name,Transported,Group,Num,Deck,CabNum,Side,Luxuries,Necessities
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,0.0,Maham Ofracculy,False,1,1,B,0,P,0.0,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,44.0,Juanna Vines,True,2,1,F,0,S,702.0,34.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,49.0,Altark Susent,False,3,1,A,0,S,6807.0,3576.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,193.0,Solam Susent,False,3,2,A,0,S,3522.0,1654.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,2.0,Willy Santantines,True,4,1,F,1,S,870.0,221.0


In [None]:
# Model inputs, in the column order handed to the column transformer.
features = [
    'HomePlanet',
    'CryoSleep',
    'Age',
    'VIP',
    'Luxuries',
    'Necessities',
    'Group',
    'Num',
    'Deck',
    'CabNum',
    'Side',
]

In [None]:
# Feature matrix and boolean target taken from the engineered training frame.
X, y = df2[features], df2['Transported']

In [None]:
# Missing spending totals are replaced by 0.
imp_money = SimpleImputer(fill_value=0, strategy='constant')

In [None]:
# Missing ages are replaced by the median age of the data the imputer is fitted on.
imp_age = SimpleImputer(strategy='median')

In [None]:
# One-hot encoder for the categorical columns.
# handle_unknown='ignore' encodes a category that was absent from the fitted data
# as all zeros instead of raising at predict time (the default is 'error'), which
# matters when a rare value only appears in the validation or test rows.
ohe = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Per-column preprocessing: one-hot encode the categoricals, zero-fill the spend
# totals, median-fill Age, and pass every remaining feature through unchanged.
categorical_cols = ['HomePlanet', 'Deck', 'Side']
spend_cols = ['Luxuries', 'Necessities']
age_cols = ['Age']
ct = make_column_transformer(
    (ohe, categorical_cols),
    (imp_money, spend_cols),
    (imp_age, age_cols),
    remainder='passthrough',
)

In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# The features live on very different scales (spend totals in the thousands next to
# 0/1 indicator columns), and on the unscaled data lbfgs stopped at its iteration
# limit. Scale the transformed features and give the solver more iterations.
# with_mean=False keeps the scaler valid if the column transformer emits a sparse matrix.
lr = LogisticRegression(max_iter=1000)
# clone(ct) gives this pipeline its own transformer, so fitting it does not
# overwrite the state of the shared `ct` object used by other pipelines.
pipe = make_pipeline(clone(ct), StandardScaler(with_mean=False), lr)

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.7703220858895705


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Random-forest baseline evaluated on the same train/validation split.
# RandomForestClassifier is imported with the other imports at the top of the notebook.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# clone(ct) gives this pipeline its own transformer, so fitting it does not
# overwrite the state of the shared `ct` object used by other pipelines.
pipe2 = make_pipeline(clone(ct), rf_model)
pipe2.fit(X_train, y_train)

y_pred2 = pipe2.predict(X_test)
print(accuracy_score(y_test, y_pred2))

0.7940950920245399


In [None]:
# Load the unlabelled test set from the mounted Drive folder.
test_csv_path = '/content/drive/MyDrive/12th Grade/Machine Learning/Data/Spaceship Titanic/spaceshiptest.csv'
df_new = pd.read_csv(test_csv_path)

In [None]:
# Work on a copy so the raw test frame `df_new` is not modified by the feature engineering below.
df3 = df_new.copy()

In [None]:
# Same PassengerId split as for the training frame: two integer parts around '_'.
test_id_parts = df3['PassengerId'].str.split('_')
df3['Group'] = test_id_parts.str[0].astype(int)
df3['Num'] = test_id_parts.str[1].astype(int)

In [None]:
# Same Cabin split as for the training frame: deck / cabin number / side.
# Rows without a Cabin get the label 'missing' for Deck and Side.
test_cabin_parts = df3['Cabin'].str.split('/')
df3['Deck'] = test_cabin_parts.str[0].fillna('missing')
# NOTE(review): missing cabin numbers are replaced by the constant 2 - confirm this placeholder is intended.
df3['CabNum'] = test_cabin_parts.str[1].fillna(2).astype(int)
df3['Side'] = test_cabin_parts.str[2].fillna('missing')

In [None]:
# Fill unknown home planets with 'Mars', mirroring the training-frame preprocessing.
df3['HomePlanet'] = df3['HomePlanet'].fillna('Mars')

In [None]:
# Fill missing CryoSleep values in the test frame with the same fill_condition rule used for training.
df3['CryoSleep'] = df3.apply(fill_condition, axis=1)

In [None]:
# Treat test passengers with unknown VIP status as non-VIP.
df3['VIP'] = df3['VIP'].fillna(False)

In [None]:
# Same two spend totals as for the training frame.
# A missing amount in any component makes that row's total NaN; the constant
# imputer in the column transformer later replaces those NaN totals with 0.
df3['Luxuries'] = df3['RoomService'].add(df3['Spa']).add(df3['VRDeck'])
df3['Necessities'] = df3['FoodCourt'].add(df3['ShoppingMall'])

In [None]:
# Test feature matrix with the same columns, in the same order, as the training matrix X.
X_new = df3[features]

In [None]:
# Final model: refit a random forest on ALL labelled rows and predict the test set.
# This rebinds rf_model, pipe and y_pred, which earlier cells used for the
# validation models and their predictions.
# NOTE(review): random_state is 4 here but 42 in the validation run - confirm this is intentional.
rf_model = RandomForestClassifier(n_estimators=100, random_state=4)
# clone(ct) gives the final pipeline its own transformer; fitting the shared `ct`
# directly would also refit the transformer held by the earlier validation pipelines.
pipe = make_pipeline(clone(ct), rf_model)
pipe.fit(X, y)
y_pred = pipe.predict(X_new)

In [None]:
# Submission table: one predicted Transported value per test PassengerId.
submission_columns = {
    'PassengerId': df3['PassengerId'],
    'Transported': y_pred,
}
res = pd.DataFrame(submission_columns)
res.shape

(4277, 2)

In [None]:
# Write the submission file; index=False leaves the row index out of the CSV.
res.to_csv('/content/drive/MyDrive/12th Grade/Machine Learning/Data/Spaceship Titanic/result3.csv', index=False)