Pandas
======

* fast, 
* powerful, 
* flexible 
* "easy to use" 
data analysis and manipulation tool

Tutorials
--------
* <https://pandas.pydata.org/docs/getting_started/index.html#getting-started>


In Pandas, a data table is called a DataFrame.

<img src="https://pandas.pydata.org/docs/_images/01_table_dataframe.svg"/>

In [2]:
import pandas as pd






In [10]:
# A basic data frame with several data
data = pd.DataFrame({
    "Name": ["Braund, Mr. Owen Harris",
             "Allen, Mr. William Henry",
             "Bonnell, Miss. Elizabeth"],
    "Age": [22, 35, 58],
    "Sex": ["male", "male", "female"]}
)

data

Unnamed: 0,Name,Age,Sex
0,"Braund, Mr. Owen Harris",22,male
1,"Allen, Mr. William Henry",35,male
2,"Bonnell, Miss. Elizabeth",58,female


In [8]:
# Extract an array
age = data["Age"]
print(age)
print(type(age))

0    22
1    35
2    58
Name: Age, dtype: int64
<class 'pandas.core.series.Series'>


In [11]:
age.max()

58

In [12]:
age.min()

22

In [15]:
data.describe()

Unnamed: 0,Age
count,3.0
mean,38.333333
std,18.230012
min,22.0
25%,28.5
50%,35.0
75%,46.5
max,58.0


In [38]:
# pclass => passenger class (1st, 2nd, 3rd), and is a proxy for socio-economic class. 
# survived => 1 yes
# age (fractional for babies)
# sibsp => ? (sibgling, spouse?)
# parch => ?
# embarked => port
# boat => id of boat where person was safe
titanic = pd.read_csv("datasets/titanic.csv")

In [17]:
titanic

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,?,C,?,328,?
1305,3,0,"Zabour, Miss. Thamine",female,?,1,0,2665,14.4542,?,C,?,?,?
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,?,C,?,304,?
1307,3,0,"Zakarian, Mr. Ortin",male,27,0,0,2670,7.225,?,C,?,?,?


In [18]:
titanic["age"]

0           29
1       0.9167
2            2
3           30
4           25
         ...  
1304      14.5
1305         ?
1306      26.5
1307        27
1308        29
Name: age, Length: 1309, dtype: object

In [24]:
# how each column is interpreted?
titanic.dtypes

pclass        int64
survived      int64
name         object
sex          object
age          object
sibsp         int64
parch         int64
ticket       object
fare         object
cabin        object
embarked     object
boat         object
body         object
home.dest    object
dtype: object

In [25]:
titanic.describe()

Unnamed: 0,pclass,survived,sibsp,parch
count,1309.0,1309.0,1309.0,1309.0
mean,2.294882,0.381971,0.498854,0.385027
std,0.837836,0.486055,1.041658,0.86556
min,1.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0
50%,3.0,0.0,0.0,0.0
75%,3.0,1.0,1.0,0.0
max,3.0,1.0,8.0,9.0


In [28]:
# It is indeed a DataFrame.
# There are 1309.
# Each row has a row label (aka the index) with values ranging from 0 to 1308.
# The table has 14 columns. All columns have a value for each of the rows
# column type => strings = object
# whole numbers
# The approximate amount of RAM used to hold the DataFrame is provided as well.

titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB


In [48]:
firstclass = titanic[ titanic["pclass"]==1 ]
firstclass

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,1,0,"Williams-Lambert, Mr. Fletcher Fellows",male,?,0,0,113510,35,C128,S,?,?,"London, England"
319,1,1,"Wilson, Miss. Helen Alice",female,31,0,0,16966,134.5,E39 E41,C,3,?,?
320,1,1,"Woolner, Mr. Hugh",male,?,0,0,19947,35.5,C52,S,D,?,"London, England"
321,1,0,"Wright, Mr. George",male,62,0,0,113807,26.55,?,S,?,?,"Halifax, NS"


In [52]:
firstclass.age.unique()

array(['29', '0.9167', '2', '30', '25', '48', '63', '39', '53', '71',
       '47', '18', '24', '26', '80', '?', '50', '32', '36', '37', '42',
       '19', '35', '28', '45', '40', '58', '22', '41', '44', '59', '60',
       '33', '17', '11', '14', '49', '76', '46', '27', '64', '55', '70',
       '38', '51', '31', '4', '54', '23', '43', '52', '16', '32.5', '21',
       '15', '65', '28.5', '45.5', '56', '13', '61', '34', '6', '57',
       '62', '67'], dtype=object)

In [56]:
firstclass = firstclass[ firstclass["age"]!="?" ]

In [58]:
firstclass.sort_values("age")

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
54,1,1,"Carter, Master. William Thornton II",male,11,1,2,113760,120,B96 B98,S,4,?,"Bryn Mawr, PA"
249,1,1,"Ryerson, Master. John Borie",male,13,2,2,PC 17608,262.375,B57 B59 B63 B66,C,4,?,"Haverford, PA / Cooperstown, NY"
55,1,1,"Carter, Miss. Lucile Polk",female,14,1,2,113760,120,B96 B98,S,4,?,"Bryn Mawr, PA"
193,1,1,"Madill, Miss. Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S,2,?,"St Louis, MO"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,1,0,"Crosby, Capt. Edward Gifford",male,70,1,1,WE/P 5735,71,B22,S,?,269,"Milwaukee, WI"
135,1,0,"Goldschmidt, Mr. George B",male,71,0,0,PC 17754,34.6542,A5,C,?,?,"New York, NY"
9,1,0,"Artagaveytia, Mr. Ramon",male,71,0,0,PC 17609,49.5042,?,C,?,22,"Montevideo, Uruguay"
61,1,1,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76,1,0,19877,78.85,C46,S,6,?,"Little Onn Hall, Staffs"


In [59]:
import pandas as pd

In [70]:
firstclass.sort_values("age").info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 284 entries, 1 to 14
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     284 non-null    int64 
 1   survived   284 non-null    int64 
 2   name       284 non-null    object
 3   sex        284 non-null    object
 4   age        284 non-null    object
 5   sibsp      284 non-null    int64 
 6   parch      284 non-null    int64 
 7   ticket     284 non-null    object
 8   fare       284 non-null    object
 9   cabin      284 non-null    object
 10  embarked   284 non-null    object
 11  boat       284 non-null    object
 12  body       284 non-null    object
 13  home.dest  284 non-null    object
dtypes: int64(4), object(10)
memory usage: 33.3+ KB


In [87]:
ageIx = pd.Index(['age'])
firstclass[ageIx] = firstclass[ageIx].apply(pd.to_numeric)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [91]:
numage = firstclass["age"].apply(pd.to_numeric)
firstclass["age"]=numage

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
