# Pandas Series

In [30]:
import numpy as np
import pandas as pd

In [31]:
#name of the CSV file to load (relative path, resolved against the working directory)
file = 'titanic.csv'

In [32]:
#load the tab-delimited file into a DataFrame
df = pd.read_csv(filepath_or_buffer=file, sep='\t')

In [33]:
#preview the first 5 rows of the DataFrame
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [34]:
#select the column 'Age'; selecting a single column by label returns a pandas Series
age = df['Age']

In [35]:
#check the type of the selected column (expected: pandas Series)
type(age)

pandas.core.series.Series

### Methods and Attributes of the Pandas Series

In [36]:
#get the first 5 rows
age.head(n=5)

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

In [37]:
#get the last 5 rows
age.tail(n=5)

151    22.0
152    55.5
153    40.5
154     NaN
155    51.0
Name: Age, dtype: float64

In [38]:
#get the number of rows (rows with missing values are counted too)
len(age)

156

In [39]:
#inspect the index (row labels) of the Series
age.index

RangeIndex(start=0, stop=156, step=1)

In [40]:
#get a statistical summary (count, mean, std, min, quartiles, max); missing values are excluded
age.describe()

count    126.000000
mean      28.141508
std       14.613880
min        0.830000
25%       19.000000
50%       26.000000
75%       35.000000
max       71.000000
Name: Age, dtype: float64

### Analyzing Numerical Series

In [41]:
#get the data type of the values stored in the Series
age.dtype

dtype('float64')

In [42]:
#count only the non-missing values (NaN entries are excluded)
age.count()

126

In [43]:
#number of rows with missing values (NaN)
#isna() flags every missing entry and sum() counts the True flags;
#int() keeps the result a plain Python int, as len() returned before
int(age.isna().sum())

30

In [44]:
#get the total number of elements, including missing values
age.size

156

In [45]:
#get the number of rows, including missing values (same result as the 'size' attribute)
len(age)

156

In [46]:
#calculate the sum of all values (missing values are skipped)
age.sum()

3545.83

In [47]:
#sample standard deviation (ddof=1 is the pandas default)
age.std(ddof=1)

14.613879926560795

In [48]:
#getting the min value (missing values are skipped)
age.min()

0.83

In [50]:
#get the maximum value (missing values are skipped)
age.max()

71.0

In [51]:
#getting the difference between the max value and the min value ("peak to peak")
#Series.ptp() was removed in pandas 1.0, so the range is computed explicitly;
#max() and min() both skip missing values by default
age.max() - age.min()

70.17

In [52]:
#get the median of the values (missing values are skipped)
age.median()

26.0

In [53]:
#return the distinct values of the Series as a NumPy array, in order of first appearance
#(the missing value NaN is included)
age.unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 ])

In [54]:
#count the distinct values; with dropna=True (the default) missing values are left out
age.nunique(dropna=True)

56

In [55]:
#to count NaN as one additional distinct value, pass the parameter 'dropna=False'
age.nunique(dropna=False)

57

In [56]:
#count how many times each distinct value occurs, most frequent first
#(missing values are excluded by default)
age.value_counts()

21.00    8
29.00    6
19.00    6
22.00    6
24.00    5
28.00    5
38.00    4
26.00    4
20.00    4
16.00    3
34.00    3
14.00    3
33.00    3
23.00    3
2.00     3
27.00    3
18.00    3
35.00    3
17.00    3
45.00    2
51.00    2
42.00    2
4.00     2
40.00    2
32.00    2
25.00    2
54.00    2
37.00    2
47.00    2
32.50    2
58.00    1
8.00     1
39.00    1
55.00    1
31.00    1
15.00    1
36.50    1
7.00     1
55.50    1
66.00    1
12.00    1
70.50    1
14.50    1
71.00    1
59.00    1
46.00    1
30.00    1
11.00    1
5.00     1
28.50    1
65.00    1
40.50    1
49.00    1
9.00     1
3.00     1
0.83     1
Name: Age, dtype: int64

In [57]:
#to get relative frequencies (proportions) instead of absolute counts, use the parameter 'normalize=True'
age.value_counts(normalize=True)

21.00    0.063492
29.00    0.047619
19.00    0.047619
22.00    0.047619
24.00    0.039683
28.00    0.039683
38.00    0.031746
26.00    0.031746
20.00    0.031746
16.00    0.023810
34.00    0.023810
14.00    0.023810
33.00    0.023810
23.00    0.023810
2.00     0.023810
27.00    0.023810
18.00    0.023810
35.00    0.023810
17.00    0.023810
45.00    0.015873
51.00    0.015873
42.00    0.015873
4.00     0.015873
40.00    0.015873
32.00    0.015873
25.00    0.015873
54.00    0.015873
37.00    0.015873
47.00    0.015873
32.50    0.015873
58.00    0.007937
8.00     0.007937
39.00    0.007937
55.00    0.007937
31.00    0.007937
15.00    0.007937
36.50    0.007937
7.00     0.007937
55.50    0.007937
66.00    0.007937
12.00    0.007937
70.50    0.007937
14.50    0.007937
71.00    0.007937
59.00    0.007937
46.00    0.007937
30.00    0.007937
11.00    0.007937
5.00     0.007937
28.50    0.007937
65.00    0.007937
40.50    0.007937
49.00    0.007937
9.00     0.007937
3.00     0.007937
0.83     0.007937
Name: Age, dtype: float64

In [58]:
#group the values into 5 intervals of (roughly) equal width and count the rows in each one;
#the remaining arguments spell out the defaults: drop NaN, sort by count, largest first
age.value_counts(bins=5, sort=True, ascending=False, dropna=True)

(14.864, 28.898]    57
(28.898, 42.932]    34
(0.759, 14.864]     17
(42.932, 56.966]    12
(56.966, 71.0]       6
Name: Age, dtype: int64