# Imports

In [86]:
import numpy as np
import pandas as pd 
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score

from IPython.display import clear_output

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split

from lightgbm import LGBMClassifier
import lazypredict
from lazypredict.Supervised import LazyClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


# Data readings

In [2]:
train=pd.read_csv("C:/Users/burka/OneDrive/Masaüstü/22-23 Güz/veri bilimi/reading_data/train_titanic.csv")
test=pd.read_csv("C:/Users/burka/OneDrive/Masaüstü/22-23 Güz/veri bilimi/reading_data/test_titanic.csv")



In [3]:
print(train.isnull().sum())  #Nan values in train 

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64


# Before start modeling we neeed to fill Nan values in training with correct strategies
    -HomePlanet = Weighted average
    -CryoSleep  = Weighted average
    -Destination  = Weighted average
     -Age          =  average
    -VIP          =  average
    -RoomService  =  average
    -FoodCourt    =  average
    -ShoppingMall =  average
    -Spa          =  average
    -VRDeck       =  average 
    -Name         = Unknown_Name(it is not that important)    
- Cabin value consist 3 informations we can purify these datas and drop this column

# Fill CryoSleep

In [4]:
train['CryoSleep'].value_counts()

False    5439
True     3037
Name: CryoSleep, dtype: int64

In [5]:
train['CryoSleep'].isna().sum()

217

In [6]:
print(round(((5439)/(5439+3037))*217),",",217-139) #The first value equals to number of False values filled and the second is True

139 , 78


In [7]:
values = {"CryoSleep": True}
train=train.fillna(value=values, limit=139)
values2 = {"CryoSleep": False}
train=train.fillna(value=values, limit=78)
train['CryoSleep'].isna().sum()

0

## Fill HomePlanet 

In [8]:
train['HomePlanet'].value_counts()
b=(4602+2131+1759)
list=[round((4602/b)*201) , round((2131/b)*201) , round((1759/b)*201)]
values3 = {"HomePlanet": "Earth"}
values4 = {"HomePlanet": "Europa" }
values5 = {"HomePlanet": "Mars"}
train=train.fillna(value=values3, limit=109)
train=train.fillna(value=values4, limit=50)
train=train.fillna(value=values5, limit=42)
train['HomePlanet'].isna().sum()

0

# Fill Destination

In [9]:
train['Destination'].value_counts()

TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: Destination, dtype: int64

In [10]:
c=(5788+1750+778)
list=[round((5788/c)*178) , round((1750/c)*178) , round((778/c)*178)]
values6 = {"Destination": "TRAPPIST-1e"}
values7 = {"Destination": "55 Cancri e" }
values8 = {"Destination": "PSO J318.5-22"}
train=train.fillna(value=values6, limit=124)
train=train.fillna(value=values7, limit=37)
train=train.fillna(value=values8, limit=21)
train['Destination'].isna().sum()

0

In [11]:
train=train.dropna(subset=['Cabin'])


In [12]:
train["Passenger_Deck"]=train["Cabin"].str[0:1]
train["Passenger_num"]=train["Cabin"].str[2:3]
train["Passenger_side"]=train["Cabin"].str[4:5]


In [13]:
train=train.drop(['PassengerId',"Cabin","Name"], axis=1)

In [14]:
print(train.isnull().sum())  #Nan values in train 

HomePlanet          0
CryoSleep           0
Destination         0
Age               175
VIP               197
RoomService       177
FoodCourt         178
ShoppingMall      206
Spa               181
VRDeck            184
Transported         0
Passenger_Deck      0
Passenger_num       0
Passenger_side      0
dtype: int64


# Fill age


In [15]:
round(train["Age"].mean())

29

In [16]:
train["Age"] = train['Age'].fillna(29)

# Fill VIP

In [17]:
train['VIP'].value_counts()

False    8104
True      193
Name: VIP, dtype: int64

In [18]:
round(8104/(8104+193)*197)

192

In [19]:
values14 = {"VIP": "False"}
values15 = {"VIP": "False" }
train=train.fillna(value=values14, limit=192)
train=train.fillna(value=values15, limit=5)

In [20]:
display(train.loc[(train['CryoSleep']==True) & (train['RoomService'].isna()),
                    ["RoomService"]])

Unnamed: 0,RoomService
25,
83,
233,
400,
889,
...,...
8312,
8361,
8380,
8412,


# if 'CryoSleep'==True then RoomService and ShoppingMall and Spa and VRDeck should be equal 0 because if you are sleeping you can not spend money

In [21]:
train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Passenger_Deck,Passenger_num,Passenger_side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S


In [22]:
Expenses_columns = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
train.loc[:,Expenses_columns]=train.apply(lambda x: 0 if x.CryoSleep == True else x,axis =1)


In [23]:
display(train.loc[(train['CryoSleep']==True) & (train['RoomService'].isna()),
                    ["RoomService"]])

Unnamed: 0,RoomService


# Let's fill missing purchase values with mean values

In [24]:
round(train["RoomService"].mean())
train["RoomService"] = train['RoomService'].fillna(215)

round(train["FoodCourt"].mean())
train["FoodCourt"] = train['FoodCourt'].fillna(443)

round(train["ShoppingMall"].mean())
train["ShoppingMall"] = train['ShoppingMall'].fillna(169)

round(train["Spa"].mean())
train["Spa"] = train['Spa'].fillna(297)

round(train["VRDeck"].mean())
train["VRDeck"] = train['VRDeck'].fillna(296)

In [25]:
print(train.isnull().sum())

HomePlanet        0
CryoSleep         0
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Transported       0
Passenger_Deck    0
Passenger_num     0
Passenger_side    0
dtype: int64


# Finally, we have 0 NaN,lets chechk their dtypes


In [26]:
train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Passenger_Deck,Passenger_num,Passenger_side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S


In [27]:
train.dtypes

HomePlanet         object
CryoSleep            bool
Destination        object
Age               float64
VIP                object
RoomService       float64
FoodCourt         float64
ShoppingMall      float64
Spa               float64
VRDeck            float64
Transported          bool
Passenger_Deck     object
Passenger_num      object
Passenger_side     object
dtype: object

- Object types might cause some problems in future

# Before starting encoding categorical values we need to apply same manipulations to test data

In [28]:
print(test.isnull().sum())  #Nan values in test 

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64


In [29]:
test=test.dropna(subset=['Cabin'])

In [30]:
test["Passenger_Deck"]=test["Cabin"].str[0:1]
test["Passenger_num"]=test["Cabin"].str[2:3]
test["Passenger_side"]=test["Cabin"].str[4:5]


# Fill age


In [31]:
round(test["Age"].mean())

29

In [32]:
test["Age"] = test['Age'].fillna(29)

In [33]:

test["RoomService"] = test['RoomService'].fillna(round(test["RoomService"].mean()))

test["FoodCourt"] = test['FoodCourt'].fillna(round(test["FoodCourt"].mean()))


test["ShoppingMall"] = test['ShoppingMall'].fillna(round(test["ShoppingMall"].mean()))


test["Spa"] = test['Spa'].fillna(round(test["Spa"].mean()))


test["VRDeck"] = test['VRDeck'].fillna(round(test["VRDeck"].mean()))

In [34]:
test=test.drop(['PassengerId',"Cabin","Name"], axis=1)

In [35]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4177 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   HomePlanet      4091 non-null   object 
 1   CryoSleep       4087 non-null   object 
 2   Destination     4085 non-null   object 
 3   Age             4177 non-null   float64
 4   VIP             4089 non-null   object 
 5   RoomService     4177 non-null   float64
 6   FoodCourt       4177 non-null   float64
 7   ShoppingMall    4177 non-null   float64
 8   Spa             4177 non-null   float64
 9   VRDeck          4177 non-null   float64
 10  Passenger_Deck  4177 non-null   object 
 11  Passenger_num   4177 non-null   object 
 12  Passenger_side  4177 non-null   object 
dtypes: float64(6), object(7)
memory usage: 456.9+ KB


In [36]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8494 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   HomePlanet      8494 non-null   object 
 1   CryoSleep       8494 non-null   bool   
 2   Destination     8494 non-null   object 
 3   Age             8494 non-null   float64
 4   VIP             8494 non-null   object 
 5   RoomService     8494 non-null   float64
 6   FoodCourt       8494 non-null   float64
 7   ShoppingMall    8494 non-null   float64
 8   Spa             8494 non-null   float64
 9   VRDeck          8494 non-null   float64
 10  Transported     8494 non-null   bool   
 11  Passenger_Deck  8494 non-null   object 
 12  Passenger_num   8494 non-null   object 
 13  Passenger_side  8494 non-null   object 
dtypes: bool(2), float64(6), object(6)
memory usage: 879.3+ KB


In [70]:
train['Passenger_num']=train['Passenger_num'].astype(str).astype(int)
test['Passenger_num']=test['Passenger_num'].astype(str).astype(int)


In [67]:
train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Passenger_Deck,Passenger_num,Passenger_side
0,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0.0,False,1,0,11
1,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44.0,True,5,0,12
2,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False,0,0,12
3,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False,0,0,12
4,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2.0,True,5,1,12


# Encoding datas


In [41]:
label_cols = ["HomePlanet", "CryoSleep", "Destination" ,"VIP","Passenger_Deck","Passenger_side"]
def label_encoder(train,test,columns):
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        train[col] = LabelEncoder().fit_transform(train[col])
        test[col] =  LabelEncoder().fit_transform(test[col])
    return train, test

train ,test = label_encoder(train,test ,label_cols)

In [77]:
train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Passenger_Deck,Passenger_num,Passenger_side
0,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0.0,False,1,0,11
1,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44.0,True,5,0,12
2,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False,0,0,12
3,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False,0,0,12
4,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2.0,True,5,1,12


In [78]:
train['Passenger_num'] = train['Passenger_num'].astype(str).astype(int)
test['Passenger_num'] = test['Passenger_num'].astype(str).astype(int)

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8494 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   HomePlanet      8494 non-null   int32  
 1   CryoSleep       8494 non-null   int32  
 2   Destination     8494 non-null   int32  
 3   Age             8494 non-null   float64
 4   VIP             8494 non-null   int32  
 5   RoomService     8494 non-null   float64
 6   FoodCourt       8494 non-null   float64
 7   ShoppingMall    8494 non-null   float64
 8   Spa             8494 non-null   float64
 9   VRDeck          8494 non-null   float64
 10  Transported     8494 non-null   bool   
 11  Passenger_Deck  8494 non-null   int32  
 12  Passenger_num   8494 non-null   int32  
 13  Passenger_side  8494 non-null   int32  
dtypes: bool(1), float64(6), int32(7)
memory usage: 705.1 KB


In [79]:
TARGET = 'Transported'

X = train.drop(TARGET , axis =1 )
y = train[TARGET]
X_train , X_test , y_train , y_test = train_test_split(X , 
                                                       y, 
                                                       random_state = 12 ,
                                                       test_size =0.33)

In [80]:
clf = LazyClassifier(verbose=0,
                     ignore_warnings=True,
                     custom_metric=None,
                     predictions=False,
                     random_state=12,
                     classifiers='all')

models, predictions = clf.fit(X_train , X_test , y_train , y_test)
clear_output()

In [53]:
models[:10]

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.8,0.8,0.8,0.8,0.18
AdaBoostClassifier,0.79,0.79,0.79,0.79,0.25
SVC,0.79,0.79,0.79,0.79,1.88
NuSVC,0.79,0.79,0.79,0.79,2.19
XGBClassifier,0.79,0.79,0.79,0.79,0.25
RandomForestClassifier,0.78,0.78,0.78,0.78,0.58
CalibratedClassifierCV,0.78,0.78,0.78,0.78,1.25
LinearSVC,0.78,0.78,0.78,0.78,0.3
LogisticRegression,0.78,0.78,0.78,0.78,0.05
SGDClassifier,0.78,0.78,0.78,0.78,0.05


In [81]:
FEATURES = [col for col in train.columns if col != TARGET]


In [84]:
#!conda install -c conda-forge lightgbm


In [82]:
from lightgbm import LGBMClassifier


In [83]:
lgbm_model = LGBMClassifier().fit(X_train, y_train)

In [87]:
y_pred = lgbm_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7956490727532097

In [88]:
lgbm_params = {
        'n_estimators': [100, 500, 1000, 2000],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5,6],
        'learning_rate': [0.1,0.01,0.02,0.05],
        "min_child_samples": [5,10,20]}

In [91]:
lgbm = LGBMClassifier()

lgbm_cv_model = GridSearchCV(lgbm, lgbm_params, 
                             cv = 3, 
                             n_jobs = -1, 
                             verbose = 2)



In [92]:
lgbm_cv_model.fit(X_train, y_train)

Fitting 3 folds for each of 576 candidates, totalling 1728 fits


GridSearchCV(cv=3, estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.01, 0.02, 0.05],
                         'max_depth': [3, 4, 5, 6],
                         'min_child_samples': [5, 10, 20],
                         'n_estimators': [100, 500, 1000, 2000],
                         'subsample': [0.6, 0.8, 1.0]},
             verbose=2)

In [94]:
lgbm_cv_model.best_params_

{'learning_rate': 0.02,
 'max_depth': 5,
 'min_child_samples': 10,
 'n_estimators': 500,
 'subsample': 0.6}

In [96]:
lgbm = LGBMClassifier(learning_rate =0.02, 
                       max_depth = 5,
                       subsample = 0.6,
                       n_estimators = 500,
                       min_child_samples = 10)

In [97]:
lgbm_tuned = lgbm.fit(X_train,y_train)

In [98]:
y_pred = lgbm_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

0.7999286733238231

In [150]:
tahmin

array([False, False,  True, ...,  True,  True,  True])

In [106]:
tahmin=lgbm_tuned.predict(test)

In [132]:
tahmin.size

4177

In [163]:
submission=pd.read_csv("C:/Users/burka/OneDrive/Masaüstü/22-23 Güz/veri bilimi/reading_data/sample_submission_titan.csv")

In [164]:
submission.tail()

Unnamed: 0,PassengerId,Transported
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False
4276,9277_01,False


In [147]:
submission["Transported"] = tahmin


In [148]:
submission.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [151]:
submission.to_csv("submission.csv",index=False)


In [152]:
submission.tail()

Unnamed: 0,PassengerId,Transported
4172,9053_02,True
4173,9054_01,True
4174,9055_01,True
4175,9056_01,True
4176,9058_01,True
