# Intro to Pandas
Andiamo a vedere come caricare le librerie Pandas e NumPy e come creare serie di dati grazie alle funzioni di NumPy, per poi inserirle in un DataFrame Pandas.

Andiamo a vedere come selezionare e filtrare i dati all'interno di un DataFrame e come effettuare semplici operazioni con gli scalari.

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Six consecutive integers (0..5), labelled 'row 1' through 'row 6'.
series_obj = Series(
    np.arange(6),
    index=['row {}'.format(n) for n in range(1, 7)],
)
series_obj

row 1    0
row 2    1
row 3    2
row 4    3
row 5    4
row 6    5
dtype: int32

In [3]:
# Look up a single element by its index label.
series_obj.loc['row 6']

5

In [4]:
# Pick the first and last elements by integer position. .iloc makes the
# positional intent explicit; plain [] with integers on a string-labelled
# index relies on a deprecated positional fallback.
series_obj.iloc[[0, 5]]

row 1    0
row 6    5
dtype: int32

## Create a DataFrame

In [5]:
# Fix the seed of NumPy's global random generator so the table is reproducible.
np.random.seed(25)
# 36 random samples arranged as a 6x6 table with labelled rows and columns.
DF = DataFrame(
    np.random.rand(36).reshape(6, 6),
    index=['row {}'.format(n) for n in range(1, 7)],
    columns=['column {}'.format(n) for n in range(1, 7)],
)

In [6]:
# Display the 6x6 frame built in the previous cell.
DF

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 1,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
row 2,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
row 3,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
row 4,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
row 5,0.514244,0.559053,0.03445,0.71993,0.421004,0.436935
row 6,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


## Select Column & Rows

In [7]:
# Label-based selection: the first list picks rows, the second picks columns.
# The result follows the order given in the lists ('column 3' before 'column 1').
DF.loc[
    ['row 2', 'row 5'],
    ['column 3', 'column 1'],
]

Unnamed: 0,column 3,column 1
row 2,0.556229,0.684969
row 5,0.03445,0.514244


## Slice Rows & Columns

In [8]:
# Slice by label: unlike positional slices, both endpoints are included.
series_obj.loc['row 2':'row 5']

row 2    1
row 3    2
row 4    3
row 5    4
dtype: int32

## Comparison with scalars

In [9]:
# Element-wise comparison against a scalar; yields a boolean frame of the same shape.
DF.lt(0.2)

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 1,False,False,False,True,False,True
row 2,False,False,False,False,False,True
row 3,False,False,True,False,False,False
row 4,False,False,False,False,False,False
row 5,False,False,True,False,False,False
row 6,False,False,False,False,False,False


## Filtering with scalars


In [10]:
# Boolean mask filter: keep only the elements whose value is below 4.
series_obj.loc[series_obj.lt(4)]

row 1    0
row 2    1
row 3    2
row 4    3
dtype: int32

In [11]:
# Keep values greater than 0.2; elements that do not satisfy the condition
# are returned as NaN, so the frame keeps its original shape.
DF.where(DF > 0.2)

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 1,0.870124,0.582277,0.278839,,0.4111,
row 2,0.684969,0.437611,0.556229,0.36708,0.402366,
row 3,0.447031,0.585445,,0.520719,0.326051,0.699186
row 4,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
row 5,0.514244,0.559053,,0.71993,0.421004,0.436935
row 6,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


## Setting data with scalars

In [12]:
# Assign the scalar 1000 to two rows at once. The labels must be passed as a
# list to .loc: a bare `'row 1', 'row 5'` is a tuple key, which pandas treats
# as a multi-level key and current releases reject on a flat index.
series_obj.loc[['row 1', 'row 5']] = 1000
series_obj

row 1    1000
row 2       1
row 3       2
row 4       3
row 5    1000
row 6       5
dtype: int32

# Missing values
I valori mancanti sono rappresentati dal valore "NaN" (Not a Number).
Ci sono vari modi per gestire i valori mancanti; uno di questi è quello di sostituirli con la media dei valori a nostra disposizione.

In [13]:
# np.nan is the marker used here for a missing entry.
missing = np.nan
# Re-bind series_obj to a Series of strings with two holes (positions 2 and 3);
# no index is given, so the default integer index is used.
series_obj = Series(
    ['row 1', 'row 2', missing, missing, 'row 5', 'row 6']
)
series_obj

0    row 1
1    row 2
2      NaN
3      NaN
4    row 5
5    row 6
dtype: object

In [14]:
# Boolean mask: True where the element is missing.
series_obj.isna()

0    False
1    False
2     True
3     True
4    False
5    False
dtype: bool

In [15]:
# Use the missing-value mask as a filter to show only the missing entries.
series_obj.loc[series_obj.isna()]

2    NaN
3    NaN
dtype: object

## Filling missing values

In [16]:
# Seed the global NumPy generator so the random values are reproducible.
np.random.seed(25)
# 6x6 frame of random values; no labels are given, so rows and columns
# get the default integer index 0..5.
DF = DataFrame(np.random.rand(36).reshape((6, 6)))
DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
3,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
4,0.514244,0.559053,0.03445,0.71993,0.421004,0.436935
5,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [17]:
# Punch holes into DF in place. .loc slices are label-based and inclusive,
# so rows 3, 4 and 5 of column 0 become missing ...
DF.loc[3:5, 0] = missing
# ... and rows 1 through 4 of column 5 become missing.
DF.loc[1:4, 5] = missing
DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [28]:
# fillna() returns a copy in which every missing value is replaced by the
# given scalar (0 here); DF itself is left unchanged.
DF_filled = DF.fillna(value=0)
DF_filled

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.0
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.0
3,0.0,0.836375,0.481343,0.516502,0.383048,0.0
4,0.0,0.559053,0.03445,0.71993,0.421004,0.0
5,0.0,0.900274,0.669612,0.456069,0.289804,0.525819


In [30]:
# A dict maps column label -> fill value: missing entries in column 0 become
# 0.1 and missing entries in column 5 become 2.
DF_filled = DF.fillna(value={0: 0.1, 5: 2})
DF_filled

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,2.0
2,0.447031,0.585445,0.161985,0.520719,0.326051,2.0
3,0.1,0.836375,0.481343,0.516502,0.383048,2.0
4,0.1,0.559053,0.03445,0.71993,0.421004,2.0
5,0.1,0.900274,0.669612,0.456069,0.289804,0.525819


In [33]:
# Forward fill: each NaN takes the last non-missing value above it in the same
# column. DataFrame.ffill() replaces fillna(method='ffill'), whose `method`
# argument is deprecated in recent pandas releases.
DF_filled = DF.ffill()
DF_filled

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.117376
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.117376
3,0.447031,0.836375,0.481343,0.516502,0.383048,0.117376
4,0.447031,0.559053,0.03445,0.71993,0.421004,0.117376
5,0.447031,0.900274,0.669612,0.456069,0.289804,0.525819


## Counting Missing Values


In [19]:
# Show DF again: the fillna calls above returned new frames, so DF still has its NaN entries.
DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [20]:
# Count the missing values in each column (True counts as 1 when summed).
DF.isna().sum()

0    3
1    0
2    0
3    0
4    0
5    4
dtype: int64

## Filtering out missing values


In [21]:
# Drop every ROW that contains at least one NaN; DF itself is not modified.
DF_no_nan = DF.dropna(axis='index')
DF_no_nan


Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376


In [22]:
# Drop every COLUMN that contains at least one NaN; DF itself is not modified.
DF_no_nan = DF.dropna(axis='columns')
DF_no_nan


Unnamed: 0,1,2,3,4
0,0.582277,0.278839,0.185911,0.4111
1,0.437611,0.556229,0.36708,0.402366
2,0.585445,0.161985,0.520719,0.326051
3,0.836375,0.481343,0.516502,0.383048
4,0.559053,0.03445,0.71993,0.421004
5,0.900274,0.669612,0.456069,0.289804
