In [72]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [73]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [74]:
data=pd.read_csv('/content/drive/My Drive/dataset/Titanic.csv')

In [75]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


How to check whether dataset has the missing values or not?

In [76]:
data.info() #1

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [77]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

Only **Age**,**Fare** and **Cabin** Columns has the missing values.  

In [78]:
#drop useless columns
cols=["Ticket","Cabin","Embarked","Name"]
data=data.drop(cols,axis=1)

In [79]:
#label encoding
data=pd.get_dummies(data)

In [80]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male
0,892,0,3,34.5,0,0,7.8292,0,1
1,893,1,3,47.0,1,0,7.0,1,0
2,894,0,2,62.0,0,0,9.6875,0,1
3,895,0,3,27.0,0,0,8.6625,0,1
4,896,1,3,22.0,1,1,12.2875,1,0


In [81]:
x=data.drop(["Survived"],axis=1)
y=data["Survived"]# target column

In [82]:

X_train, X_test,y_train,y_test = train_test_split(x,y,test_size=0.3)

lr = LogisticRegression()
lr.fit(X_train,y_train) #columns with null values can't fit

ValueError: ignored

How to handle missing values?

1. By dropping the rows having missing values usig dropna()

In [83]:
first=data.dropna()
first

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male
0,892,0,3,34.5,0,0,7.8292,0,1
1,893,1,3,47.0,1,0,7.0000,1,0
2,894,0,2,62.0,0,0,9.6875,0,1
3,895,0,3,27.0,0,0,8.6625,0,1
4,896,1,3,22.0,1,1,12.2875,1,0
...,...,...,...,...,...,...,...,...,...
409,1301,1,3,3.0,1,1,13.7750,1,0
411,1303,1,1,37.0,1,0,90.0000,1,0
412,1304,1,3,28.0,0,0,7.7750,1,0
414,1306,1,1,39.0,0,0,108.9000,1,0


In [84]:
print("before dropping null values",len(data),"length after dropping null values",len(first))

before dropping null values 418 length after dropping null values 331


2. You can simply delete the colums if that column is not required for predictions

In [85]:
df=data.drop(["Age","Fare"],axis=1)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,SibSp,Parch,Sex_female,Sex_male
0,892,0,3,0,0,0,1
1,893,1,3,1,0,1,0
2,894,0,2,0,0,0,1
3,895,0,3,0,0,0,1
4,896,1,3,1,1,1,0


First and Second reduces the size of the dataset.

3. Filling the Missing Values – Imputation 
   1. Filling the null values using **mean** ,meadian if its a numerical column
   2. Filling the null values with mod if its a categorical column
   3.Filling it with 0 or any number which is not present in dataset so that machine can recognize the value is not real 

In [86]:
df=data
df["Age"]=df["Age"].fillna(df["Age"].mean()) #For median you can use median()
df["Fare"]=df["Fare"].fillna(df["Fare"].mean())
# df["Cabin"]=df["Cabin"].fillna(df["Cabin"].mean())

Filling null values using **SimpleImputer()**

In [87]:
df = data
df['Age'] = df['Age'].isnull() 
df["Fare"]=df["Fare"].isnull()
from sklearn.impute import SimpleImputer # it can either be mean,mode or median
my_imputer = SimpleImputer(strategy = 'mean')
n_df = my_imputer.fit_transform(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
 2   Pclass       418 non-null    int64
 3   Age          418 non-null    bool 
 4   SibSp        418 non-null    int64
 5   Parch        418 non-null    int64
 6   Fare         418 non-null    bool 
 7   Sex_female   418 non-null    uint8
 8   Sex_male     418 non-null    uint8
dtypes: bool(2), int64(5), uint8(2)
memory usage: 18.1 KB
