# Benchmark Classification

## Importing and loading data

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

print(pd.__version__)
print(np.__version__)

2.1.3
1.26.1


In [3]:
# Load the data
data = pd.read_csv('datasets/train.csv')

# Check the data
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Shape and columns of the data
print('Shape:', data.shape)
print('Columns:', data.columns)

Shape: (891, 12)
Columns: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [None]:
# Check for any NULL values
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Creating training and testing data

In [5]:
from sklearn.utils import shuffle

# Shuffling the Dataset
data = shuffle(data, random_state = 42)

# Creating 4 divisions
div = int(data.shape[0]/4)

# 3 parts to training set and 1 part to testing set
train = data.loc[:3*div+1,:]
test = data.loc[3*div+1:]

print(train.shape, test.shape)

(621, 12) (271, 12)


In [6]:
# Check the training data
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
709,710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C
439,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S
840,841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.925,,S
720,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S
39,40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C


In [7]:
# Check the testing data
test.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
667,668,0,3,"Rommetvedt, Mr. Knud Paust",male,,0,0,312993,7.775,,S
571,572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2,0,11769,51.4792,C101,S
636,637,0,3,"Leinonen, Mr. Antti Gustaf",male,32.0,0,0,STON/O 2. 3101292,7.925,,S
714,715,0,2,"Greenberg, Mr. Samuel",male,52.0,0,0,250647,13.0,,S
262,263,0,1,"Taussig, Mr. Emil",male,52.0,1,1,110413,79.65,E67,S


## Accuracy based on mode on different features

### Survived

In [8]:
# Creating a new column called 'mode' for the mode from the training set
test['mode'] = train['Survived'].mode()[0]

# Check the data
test.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,mode
667,668,0,3,"Rommetvedt, Mr. Knud Paust",male,,0,0,312993,7.775,,S,0
571,572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2,0,11769,51.4792,C101,S,0
636,637,0,3,"Leinonen, Mr. Antti Gustaf",male,32.0,0,0,STON/O 2. 3101292,7.925,,S,0
714,715,0,2,"Greenberg, Mr. Samuel",male,52.0,0,0,250647,13.0,,S,0
262,263,0,1,"Taussig, Mr. Emil",male,52.0,1,1,110413,79.65,E67,S,0


In [10]:
# Calculate the accuracy
accuracy_mode = accuracy_score(test['Survived'], test['mode'])
print('Accuracy:', accuracy_mode.round(2))

Accuracy: 0.63


### Gender

In [11]:
# Creating a frequency table based on the gender
gender_mode = pd.crosstab(train['Survived'], train['Sex'])
gender_mode

Sex,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,57,321
1,167,76


In [13]:
test['gender_mode'] = test['Survived']

# For every unique value in column
for i in test['Sex'].unique():
  # Calculate and Assign mode to new column, corresponding to unique values in 'Sex'
  test['gender_mode'][test['Sex'] == str(i)] = train['Survived'][train['Sex'] == str(i)].mode()[0]

# Calculate the accuracy
accuracy_gender = accuracy_score(test['Survived'], test['gender_mode'])
print('Accuracy:', accuracy_gender.round(2))

Accuracy: 0.79
