# Testing & Training Accuracy

In [376]:
import pandas as pd
import seaborn as sns

In [377]:
# sns.get_dataset_names()

In [378]:
data = sns.load_dataset('titanic')
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### Data Preprocessing

In [380]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [381]:
data.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [382]:
data.duplicated().sum()

107

In [383]:
#drop duplicates
data = data.drop_duplicates()

In [384]:
data.duplicated().sum()

0

In [385]:
#check null values
data.isna().sum()

survived         0
pclass           0
sex              0
age            106
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           582
embark_town      2
alive            0
alone            0
dtype: int64

In [386]:
#Handling Missing values
data['age']=data['age'].fillna(data['age'].mean())  
#drop coloumn with too many missing values
data.drop(columns=['deck'], inplace=True)
#Fill with the most frequent value or a placeholder:
data['embarked']=data['embarked'].fillna(data['embarked'].mode()[0])
data['embark_town']=data['embark_town'].fillna(data['embark_town'].mode()[0])

In [387]:
data.isna().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [388]:
#Do some analysis on the target variable
data['survived'].value_counts()

survived
0    461
1    323
Name: count, dtype: int64

In [389]:
#class vs survived analysis
data.groupby('pclass')['survived'].value_counts()

pclass  survived
1       1           135
        0            79
2       1            84
        0            81
3       0           301
        1           104
Name: count, dtype: int64

In [390]:
#pclass vs fare
data.groupby('pclass')['fare'].mean()

pclass
1    84.487812
2    21.835404
3    13.656223
Name: fare, dtype: float64

In [391]:
data.select_dtypes(include=['number']).corr()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
survived,1.0,-0.332658,-0.080678,-0.036589,0.070307,0.246769
pclass,-0.332658,1.0,-0.340414,0.088014,0.040296,-0.549216
age,-0.080678,-0.340414,1.0,-0.280794,-0.188442,0.089249
sibsp,-0.036589,0.088014,-0.280794,1.0,0.381433,0.135147
parch,0.070307,0.040296,-0.188442,0.381433,1.0,0.191942
fare,0.246769,-0.549216,0.089249,0.135147,0.191942,1.0


In [392]:
final_data=pd.get_dummies(data,drop_first=True)
final_data.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,sex_male,embarked_Q,embarked_S,class_Second,class_Third,who_man,who_woman,embark_town_Queenstown,embark_town_Southampton,alive_yes
0,0,3,22.0,1,0,7.25,True,False,True,False,True,False,True,True,False,False,True,False
1,1,1,38.0,1,0,71.2833,False,False,False,False,False,False,False,False,True,False,False,True
2,1,3,26.0,0,0,7.925,False,True,False,False,True,False,True,False,True,False,True,True
3,1,1,35.0,1,0,53.1,False,False,False,False,True,False,False,False,True,False,True,True
4,0,3,35.0,0,0,8.05,True,True,True,False,True,False,True,True,False,False,True,False


### Spliting X & Y

In [394]:
y=final_data['survived']
X=final_data.drop('survived',axis=1)

In [395]:
X.isna().sum()

pclass                     0
age                        0
sibsp                      0
parch                      0
fare                       0
adult_male                 0
alone                      0
sex_male                   0
embarked_Q                 0
embarked_S                 0
class_Second               0
class_Third                0
who_man                    0
who_woman                  0
embark_town_Queenstown     0
embark_town_Southampton    0
alive_yes                  0
dtype: int64

### Spliting test dataset and train dataset

In [397]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20,random_state=42,stratify=y) 

### Logisitic regression

In [399]:
from sklearn.linear_model import LogisticRegression
classifier=LogisticRegression(multi_class='ovr') #One-VS-Rest
classifier.fit(X_train,y_train) #command for training / fitting the model
y_pred=classifier.predict(X_test)
y_pred

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0], dtype=int64)

### Confusuion Matrix

In [401]:
#confusison matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[92,  0],
       [ 0, 65]], dtype=int64)

### Testing Accuracy

In [431]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)*100

100.0

###  Testing Accuracy  

In [435]:
y_pred_train=classifier.predict(X_train)
accuracy_score(y_train,y_pred_train)*100

100.0

In [404]:
#END