### Data Exploration with pandas 

In [10]:
# import required libraries
import pandas as pd
# Display configuration: render at most 13 rows and 11 columns of a DataFrame.
pd.set_option("display.max_rows", 13)
pd.set_option("display.max_columns", 11)

# LaTeX repr of DataFrames: set to True only for the PDF version.
# "display.latex.repr" is not registered in every pandas release; newer
# releases appear to expose the same switch as "styler.render.repr"
# (NOTE(review): confirm against the pandas version in use).
# pandas' OptionError subclasses KeyError, so an unknown option key is caught
# here instead of aborting the whole configuration cell on a fresh kernel.
try:
    pd.set_option("display.latex.repr", True)
except KeyError:
    pd.set_option("styler.render.repr", "latex")



In [None]:
# Load the Titanic CSV into df, then preview it: head() returns the first 5 rows.
df = pd.read_csv(filepath_or_buffer="../data/titanic.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,...,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,...,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,...,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,...,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,...,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,...,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,...,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,...,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,...,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,...,0,111369,30.0000,C148,C


In [6]:
# Overview of df: index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [14]:
# Basic statistics of the numerical columns: count, mean, standard deviation,
# min, quartiles (25%, 50%, 75%) and max.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [22]:
# Indexing by position: an iloc slice selects the first two rows (positions 0 and 1).
df.iloc[:2]



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,...,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,...,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,...,0,PC 17599,71.2833,C85,C


In [23]:
# Select two columns by label; passing a list of column names yields a DataFrame.
df.loc[:, ['Embarked', 'Ticket']]


Unnamed: 0,Embarked,Ticket
0,S,A/5 21171
1,C,PC 17599
2,S,STON/O2. 3101282
3,S,113803
4,S,373450
...,...,...
886,S,211536
887,S,112053
888,S,W./C. 6607
889,C,111369


In [None]:
# Select the rows that fulfil a condition (here: Age greater than 70).
# Only the last expression of a cell is rendered, so both results are bound to
# names and compared instead of leaving the first one evaluated and discarded.

# first way: boolean mask
over_70_mask = df[df['Age'] > 70]

# second way: query string
over_70_query = df.query('Age> 70')

# both ways must select exactly the same rows
assert over_70_mask.equals(over_70_query)
over_70_query

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,...,Parch,Ticket,Fare,Cabin,Embarked
96,97,0,1,"Goldschmidt, Mr. George B",male,...,0,PC 17754,34.6542,A5,C
116,117,0,3,"Connors, Mr. Patrick",male,...,0,370369,7.75,,Q
493,494,0,1,"Artagaveytia, Mr. Ramon",male,...,0,PC 17609,49.5042,,C
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,...,0,27042,30.0,A23,S
851,852,0,3,"Svensson, Mr. Johan",male,...,0,347060,7.775,,S


In [30]:
# Select the rows that fulfil a combination of conditions. Conditions are
# combined element-wise with & (and) or | (or); each condition needs its own
# parentheses because & and | bind more tightly than comparisons.
# This example uses |: rows where Age equals 11 or SibSp equals 5.
# Only the last expression of a cell is rendered, so both results are bound to
# names and compared instead of leaving the first one evaluated and discarded.

# first way: boolean masks
age_or_sibsp_mask = df[(df['Age'] == 11) | (df['SibSp'] == 5)]

# second way: query string
age_or_sibsp_query = df.query('(Age == 11) | (SibSp == 5)')

# both ways must select exactly the same rows
assert age_or_sibsp_mask.equals(age_or_sibsp_query)
age_or_sibsp_query


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,...,Parch,Ticket,Fare,Cabin,Embarked
59,60,0,3,"Goodwin, Master. William Frederick",male,...,2,CA 2144,46.9,,S
71,72,0,3,"Goodwin, Miss. Lillian Amy",female,...,2,CA 2144,46.9,,S
386,387,0,3,"Goodwin, Master. Sidney Leonard",male,...,2,CA 2144,46.9,,S
480,481,0,3,"Goodwin, Master. Harold Victor",male,...,2,CA 2144,46.9,,S
542,543,0,3,"Andersson, Miss. Sigrid Elisabeth",female,...,2,347082,31.275,,S
683,684,0,3,"Goodwin, Mr. Charles Edward",male,...,2,CA 2144,46.9,,S
731,732,0,3,"Hassan, Mr. Houssein G N",male,...,0,2699,18.7875,,C
802,803,1,1,"Carter, Master. William Thornton II",male,...,2,113760,120.0,B96 B98,S


In [32]:
# Distinct values of a single column; unique() is called on the Series, not on the DataFrame.
df.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [None]:
# Sort by Age in descending order and show the first 5 rows of the result.
df.sort_values(by='Age', ascending=False).head(n=5)