Pandas
======

A

* fast,
* powerful,
* flexible,
* "easy to use"

data analysis and manipulation tool.

Tutorials
--------
* <https://pandas.pydata.org/docs/getting_started/index.html#getting-started>


In Pandas, a data table is called a DataFrame.

<img src="https://pandas.pydata.org/docs/_images/01_table_dataframe.svg"/>

In [2]:
# Pandas has to be imported. Conventionally it is imported as "pd".
import pandas as pd

## Create a Data Frame

In [7]:
# Build a data frame column by column.
# -----------------------------------------------------
# Every keyword below names one column (a "series" in Pandas terms)
# and carries the complete list of values for that column,
# much like filling a spreadsheet one column at a time.
data = pd.DataFrame(
    dict(
        # Column "Name": one entry per row.
        Name=[
            "Braund, Mr. Owen Harris",
            "Rossi, Sig.ra Miriana",
            "Bergerac, M.me Justine",
            "Wolf, Herr Gustav",
            "Consonni, Sig.ra Susanna",
            "Miser X",
        ],
        # Column "Age": same number of entries as "Name".
        Age=[22, 35, 58, 34, 56, 22],
        # Column "Sex".
        Sex=["male", "female", "female", "male", "female", "?"],
        # Column "Citizenship".
        Citizenship=["English", "Italian", "French", "German", "Italian", "?"],
    )
)

# Jupyter renders the value of a cell's last expression,
# as long as the object knows how to display itself.
data

Unnamed: 0,Name,Age,Sex,Citizenship
0,"Braund, Mr. Owen Harris",22,male,English
1,"Rossi, Sig.ra Miriana",35,female,Italian
2,"Bergerac, M.me Justine",58,female,French
3,"Wolf, Herr Gustav",34,male,German
4,"Consonni, Sig.ra Susanna",56,female,Italian
5,Miser X,22,?,?


In [3]:
# Build a data frame row by row.
# -----------------------------------------------------
# The values are given as a sequence of rows; the column headers
# are passed separately through the `columns` argument.
#
# NOTE: this rebinds `data`, replacing the six-row frame that was
# created column by column.
data = pd.DataFrame(
    columns=["Name", "Age", "Sex", "Citizenship"],
    data=[
        # Each row lists its values in the same order as `columns`.
        ("Braund, Mr. Owen Harris", 22, "male", "English"),    # 1st row
        ("Rossi, Sig.ra Miriana", 35, "female", "Italian"),    # 2nd row
        ("Wolf, Herr Gustav", 34, "male", "German"),           # 3rd row
        ("Bergerac, M.me Justine", 58, "female", "French"),    # 4th row
        ("Consonni, Sig.ra Susanna", 56, "female", "Italian"), # 5th row
    ],
)

data

Unnamed: 0,Name,Age,Sex,Citizenship
0,"Braund, Mr. Owen Harris",22,male,English
1,"Rossi, Sig.ra Miriana",35,female,Italian
2,"Wolf, Herr Gustav",34,male,German
3,"Bergerac, M.me Justine",58,female,French
4,"Consonni, Sig.ra Susanna",56,female,Italian


## Inspect a Data Frame

In [11]:
# What a spreadsheet calls a "column" is a "series" in Pandas jargon.
# A series is looked up on the data frame by its name, with the same
# square-bracket syntax used for dictionary keys.
age = data["Age"]

# Show the series, an empty line, and then the type of the object.
print(age, "", type(age), sep="\n")

0    22
1    35
2    58
3    34
4    56
5    22
Name: Age, dtype: int64

<class 'pandas.core.series.Series'>


In [34]:
# A few summary values of the series, printed one per line.
print(
    age.max(),     # largest value
    age.min(),     # smallest value
    age.mean(),    # mean
    age.median(),  # median
    age.std(),     # standard deviation
    age.idxmax(),  # index label of the largest value
    sep="\n",
)

# A series offers many more methods; the full list is at
# https://pandas.pydata.org/pandas-docs/stable/reference/series.html

58
22
41.0
35.0
15.491933384829668
3


In [35]:
# Summary statistics of the whole data frame (not of a single series).
# By default only the numeric columns are described; here that is just "Age".
data.describe()

Unnamed: 0,Age
count,5.0
mean,41.0
std,15.491933
min,22.0
25%,34.0
50%,35.0
75%,56.0
max,58.0


## Removal

### Drop

In [21]:
# Every data frame carries an index: a label that identifies each row.
# When none is given, the rows are simply labelled 0, 1, 2, ...
tmp = pd.DataFrame(
    columns=["a", "b", "c", "d"],
    data=[
        (1, 2, 3, "Hello"),
        (10, 20, 30, "Data"),
        (100, 200, 300, "Science"),
    ],
)

tmp

Unnamed: 0,a,b,c,d
0,1,2,3,Hello
1,10,20,30,Data
2,100,200,300,Science


In [22]:
# Drop the row whose index label is 1.
# `drop` returns a new data frame; `tmp` itself is left unchanged.
tmp.drop([1])

Unnamed: 0,a,b,c,d
0,1,2,3,Hello
2,100,200,300,Science


In [23]:
# Show `tmp` again: the previous `drop` did not modify it, all three rows are still there.
tmp

Unnamed: 0,a,b,c,d
0,1,2,3,Hello
1,10,20,30,Data
2,100,200,300,Science


In [24]:
# Drop several rows at once: those whose index labels are 0 and 2.
tmp.drop([0, 2])

Unnamed: 0,a,b,c,d
1,10,20,30,Data


In [25]:
# Drop whole columns instead of rows: with axis=1 the labels ("b" and "c")
# are looked up among the column names rather than in the row index.
tmp.drop(["b", "c"], axis=1)

Unnamed: 0,a,d
0,1,Hello
1,10,Data
2,100,Science


In [26]:
# The axis can also be given by name: axis='columns' is equivalent to axis=1.
tmp.drop(["b", "c"], axis='columns')

Unnamed: 0,a,d
0,1,Hello
1,10,Data
2,100,Science


### Dropna

In [43]:
# A data frame in which some values are missing (given as None).
# This rebinds `tmp` to a new frame.
tmp = pd.DataFrame(
    columns=["a", "b", "c", "d"],
    data=[
        (1, 2, 3, None),
        (10, 20, 30, "Data"),
        (100, None, 300, "Science"),
    ],
)
tmp


Unnamed: 0,a,b,c,d
0,1,2.0,3,
1,10,20.0,30,Data
2,100,,300,Science


In [14]:
# Drop every row that contains at least one missing value (None, NaN, ...).
# Here only the row with index label 1 has no missing values, so it is the only one kept.
tmp.dropna()

Unnamed: 0,a,b,c,d
1,10,20.0,30,Data


## Selection

In [5]:
# Show the data frame that the selection examples below operate on.
data

Unnamed: 0,Name,Age,Sex,Citizenship
0,"Braund, Mr. Owen Harris",22,male,English
1,"Rossi, Sig.ra Miriana",35,female,Italian
2,"Wolf, Herr Gustav",34,male,German
3,"Bergerac, M.me Justine",58,female,French
4,"Consonni, Sig.ra Susanna",56,female,Italian


In [6]:
# `iloc` selects by position; a single position yields that row as a series.

# 1st row (position 0), followed by the type of the selected object.
sel = data.iloc[0]
print("row at index 0", sel, type(sel), sep="\n")
print()

# 2nd row (position 1), followed by an empty line.
print("row at index 1", data.iloc[1], sep="\n", end="\n\n")

# 3rd row (position 2).
print("row at index 2", data.iloc[2], sep="\n")


row at index 0
Name           Braund, Mr. Owen Harris
Age                                 22
Sex                               male
Citizenship                    English
Name: 0, dtype: object
<class 'pandas.core.series.Series'>

row at index 1
Name           Rossi, Sig.ra Miriana
Age                               35
Sex                           female
Citizenship                  Italian
Name: 1, dtype: object

row at index 2
Name           Wolf, Herr Gustav
Age                           34
Sex                         male
Citizenship               German
Name: 2, dtype: object


In [11]:
# Three equivalent ways of reaching the last row.

# `shape` is the (rows, columns) pair; print it followed by an empty line.
print(data.shape, end="\n\n")

# 1. Everything in a single expression: the last position is (number of rows - 1).
print("all in one row", data.iloc[data.shape[0]-1], sep="\n")

# 2. The same lookup, one named step at a time.
print("all expanded")
dimensions = data.shape
number_of_rows = dimensions[0]
last_index = number_of_rows - 1
last_row = data.iloc[last_index]
print(last_row)

# 3. Negative positions count from the end, so -1 is the last row.
print("but there's a easier way", data.iloc[-1], sep="\n")


(5, 4)

all in one row
Name           Consonni, Sig.ra Susanna
Age                                  56
Sex                              female
Citizenship                     Italian
Name: 4, dtype: object
all expanded
Name           Consonni, Sig.ra Susanna
Age                                  56
Sex                              female
Citizenship                     Italian
Name: 4, dtype: object
but there's a easier way
Name           Consonni, Sig.ra Susanna
Age                                  56
Sex                              female
Citizenship                     Italian
Name: 4, dtype: object


In [12]:
# Rows at the positions in the half-open interval [1, 3), i.e. positions 1 and 2.
# The selection is left as the cell's last expression (not wrapped in print)
# so that Jupyter renders it as a table, like the other data frames in this notebook.
data.iloc[1:3]


                    Name  Age     Sex Citizenship
1  Rossi, Sig.ra Miriana   35  female     Italian
2      Wolf, Herr Gustav   34    male      German


In [15]:
# Rows at an explicit list of positions: 0, 2 and 4.
# The selection is left as the cell's last expression (not wrapped in print)
# so that Jupyter renders it as a table, like the other data frames in this notebook.
data.iloc[
    [0, 2, 4]
]

                       Name  Age     Sex Citizenship
0   Braund, Mr. Owen Harris   22    male     English
2         Wolf, Herr Gustav   34    male      German
4  Consonni, Sig.ra Susanna   56  female     Italian


In [16]:
# Show the data frame again as a reference for the row/column selections below.
data

Unnamed: 0,Name,Age,Sex,Citizenship
0,"Braund, Mr. Owen Harris",22,male,English
1,"Rossi, Sig.ra Miriana",35,female,Italian
2,"Wolf, Herr Gustav",34,male,German
3,"Bergerac, M.me Justine",58,female,French
4,"Consonni, Sig.ra Susanna",56,female,Italian


In [18]:
# Select a single value by position: row 2, column 0 (the "Name" column).
data.iloc[ 2, 0 ]

'Wolf, Herr Gustav'

In [19]:
# Select a single value by position: row 2, column 3 (the "Citizenship" column).
data.iloc[ 2, 3 ]

'German'

In [20]:
# Negative positions count from the end: row 2, last column ("Citizenship").
data.iloc[ 2, -1 ]

'German'