# Python Lab 04a: Introduction to Scikit-Learn for PCA and to Pandas

## Francesco Della Santa, Computational Linear Algebra for Large Scale Problems, Politecnico di Torino

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


## Initialize PCA objects

In [2]:
pca_general = PCA()
pca_perc = PCA(n_components=0.5, svd_solver='full')
pca_ncomp = PCA(n_components=7)

### And let's use them...

In [3]:
N, n = 1000, 100
S = np.random.rand(N, n)

# Start using the pca object
pca_ncomp.fit(S)
pca_perc.fit(S)

# Representation of S in the m-dim space of PCs
Qm_ncomp = pca_ncomp.transform(S)
Qm_perc = pca_perc.transform(S)

# Approximation of S obtained using m PCs
Stilde_ncomp = pca_ncomp.inverse_transform(Qm_ncomp)
Stilde_perc = pca_perc.inverse_transform(Qm_perc)

print('*********************** DATASET S ***********************')
display(S)
print('*********************************************************')
print('')
print(f'*********************** DATASET PROJECTED (ncomp: {pca_ncomp.n_components_} PCs) ***********************')
display(Qm_ncomp)
print('*********************************************************************************************************')
print('')
print(f'*********************** DATASET PROJECTED (perc: {pca_perc.n_components_} PCs) ************************')
display(Qm_perc)
print('********************************************************************************************************')
print('')
print('*********************** RECOVERED DATASET S (ncomp) ***********************')
display(Stilde_ncomp)
print('***************************************************************************')
print('')
print('*********************** RECOVERED DATASET S (perc) ***********************')
display(Stilde_perc)
print('**************************************************************************')

*********************** DATASET S ***********************


array([[0.85809441, 0.29277921, 0.81070075, ..., 0.04510227, 0.32611873,
        0.98986617],
       [0.4577833 , 0.35092916, 0.17744484, ..., 0.52462942, 0.4007693 ,
        0.66233825],
       [0.1370072 , 0.44838079, 0.66316967, ..., 0.66220168, 0.70011146,
        0.72228258],
       ...,
       [0.58699797, 0.62826389, 0.53965897, ..., 0.56241403, 0.87945153,
        0.60278457],
       [0.84057911, 0.42404184, 0.76727957, ..., 0.94723738, 0.67717277,
        0.48429779],
       [0.64681912, 0.03745471, 0.44755494, ..., 0.32658635, 0.68773124,
        0.68856412]])

*********************************************************

*********************** DATASET PROJECTED (ncomp: 7 PCs) ***********************


array([[ 2.65928171e-01, -1.44732815e-01,  1.51421480e-01, ...,
        -1.47888357e-01,  3.67791140e-01,  1.15005783e-01],
       [-3.16080841e-01,  2.61905737e-01,  2.09665780e-01, ...,
         4.09941873e-01, -2.11921054e-01,  4.54273164e-01],
       [-1.40319414e-01, -1.37542975e-01, -2.36040284e-02, ...,
         8.91027968e-02,  1.54659171e-01,  1.88480596e-01],
       ...,
       [-2.28227908e-02,  2.02522306e-01, -1.96661328e-01, ...,
         5.88406713e-01,  2.96838432e-01,  3.26354200e-01],
       [ 2.82223281e-01, -2.86465399e-01, -3.27209706e-01, ...,
        -1.35140540e-01,  2.55563305e-01,  6.43054311e-01],
       [-2.43505144e-01,  6.86293863e-01, -4.37565328e-01, ...,
        -1.04078307e-01,  1.51458165e-01, -4.12168711e-04]])

*********************************************************************************************************

*********************** DATASET PROJECTED (perc: 37 PCs) ************************


array([[-0.29534987, -0.00555092,  0.42203299, ...,  0.18828701,
         0.04097637,  0.11267839],
       [ 0.42782648, -0.07378971, -0.48807851, ..., -0.16272138,
        -0.06120081, -0.7659214 ],
       [ 0.13142342,  0.2947787 , -0.25991409, ..., -0.45108209,
        -0.24661012, -0.03847446],
       ...,
       [ 0.22908146, -0.29499499, -0.34452635, ..., -0.08030577,
        -0.38395264, -0.23687638],
       [-0.20206025,  0.005517  ,  0.46689423, ..., -0.11105439,
        -0.4995442 , -0.25985491],
       [ 0.38060879, -0.78699746, -0.14100274, ..., -0.04773391,
        -0.48220101,  0.02771857]])

********************************************************************************************************

*********************** RECOVERED DATASET S (ncomp) ***********************


array([[0.52770017, 0.54749451, 0.52527373, ..., 0.43617471, 0.55187303,
        0.55105437],
       [0.3114558 , 0.58718947, 0.49836008, ..., 0.45874231, 0.50936158,
        0.32741989],
       [0.46762463, 0.60808491, 0.48255789, ..., 0.48738703, 0.57826911,
        0.47680936],
       ...,
       [0.36618718, 0.5860258 , 0.46079741, ..., 0.52394766, 0.50856102,
        0.39980348],
       [0.49611655, 0.52155204, 0.55730602, ..., 0.45135847, 0.52728134,
        0.62187839],
       [0.44709873, 0.36665502, 0.41522536, ..., 0.66486064, 0.24630748,
        0.50097018]])

***************************************************************************

*********************** RECOVERED DATASET S (perc) ***********************


array([[0.43833797, 0.36458525, 0.71061696, ..., 0.32220209, 0.43089115,
        0.65188121],
       [0.27206581, 0.86908404, 0.45943224, ..., 0.67440166, 0.49563078,
        0.41550802],
       [0.43133976, 0.54460753, 0.6521841 , ..., 0.70940181, 0.61154457,
        0.33103925],
       ...,
       [0.52785227, 0.72782606, 0.62245748, ..., 0.66974301, 0.68913819,
        0.41768572],
       [0.59623051, 0.4405837 , 0.79692703, ..., 0.69115946, 0.64629092,
        0.43154473],
       [0.69143019, 0.25707426, 0.55050929, ..., 0.6605099 , 0.37527811,
        0.54059107]])

**************************************************************************


In [71]:
display(pca_ncomp.explained_variance_ratio_)

array([0.01708534, 0.01636685, 0.01618788, 0.01592719, 0.01532786,
       0.01516325, 0.01486827])

## Initialize the Standard Scaler

In [4]:
scaler_recent = StandardScaler(with_std=False)
scaler_znorm = StandardScaler()
# Start using the scaler objects
scaler_recent.fit(S)
scaler_znorm.fit(S)
# recentered S
Sbar = scaler_recent.transform(S)
# standardized S
Shat = scaler_znorm.transform(S)

print(f'*********************** DATASET RECENTERED ***********************')
print('SAMPLE MEAN OF RECENTERED DATA:')
display(Sbar.mean(axis=1)[:10])
print('SAMPLE ST.DEV. OF RECENTERED DATA:')
display(Sbar.std(axis=1)[:10])
print('')
display(Sbar)
print('*******************************************************************')
print('')
print(f'*********************** DATASET STANDARDIZED ***********************')
print('SAMPLE MEAN OF RECENTERED DATA:')
display(Shat.mean(axis=1)[:10])
print('SAMPLE ST.DEV. OF RECENTERED DATA:')
display(Shat.std(axis=1)[:10])
print('')
display(Shat)
print('**********************************************************************')

*********************** DATASET RECENTERED ***********************
SAMPLE MEAN OF RECENTERED DATA:


array([ 0.01170872,  0.05850343,  0.03111332, -0.03144899,  0.03364994,
        0.01851832,  0.00791555,  0.00904172, -0.04390442,  0.00235454])

SAMPLE ST.DEV. OF RECENTERED DATA:


array([0.27313582, 0.26954186, 0.28207625, 0.280501  , 0.2874484 ,
       0.3098937 , 0.28704849, 0.30015937, 0.29789275, 0.3061029 ])




array([[ 0.17578878,  0.18538562, -0.37824088, ...,  0.04130884,
        -0.10713979, -0.05208782],
       [ 0.23629115,  0.25762843, -0.30486113, ...,  0.39725662,
        -0.03989253,  0.17021194],
       [ 0.04304572,  0.37416166,  0.06797933, ...,  0.12664709,
         0.10338274,  0.19669295],
       ...,
       [-0.42478536, -0.10341936, -0.01094182, ..., -0.36948173,
        -0.18988633, -0.20020775],
       [-0.21853491,  0.05003855,  0.04172524, ...,  0.04493836,
         0.33216623,  0.30673197],
       [-0.00969485, -0.43361281,  0.50940657, ...,  0.36358156,
         0.15934043,  0.16939679]])

*******************************************************************

*********************** DATASET STANDARDIZED ***********************
SAMPLE MEAN OF RECENTERED DATA:


array([ 0.04087388,  0.20454822,  0.10733077, -0.10942995,  0.11687399,
        0.06877579,  0.02813741,  0.03281553, -0.15206011,  0.0089698 ])

SAMPLE ST.DEV. OF RECENTERED DATA:


array([0.94514087, 0.93375529, 0.97633256, 0.97138905, 0.9968786 ,
       1.07311918, 0.99681782, 1.04189091, 1.03187927, 1.06130374])




array([[ 0.60586493,  0.64415913, -1.31648637, ...,  0.14313773,
        -0.37645577, -0.18553281],
       [ 0.81438942,  0.89518114, -1.06108445, ...,  1.37651938,
        -0.1401699 ,  0.6062819 ],
       [ 0.14835926,  1.30009898,  0.23660548, ...,  0.43884019,
         0.36325467,  0.70060522],
       ...,
       [-1.46404425, -0.35935111, -0.03808354, ..., -1.28027762,
        -0.66720125, -0.71312465],
       [-0.75319164,  0.17386889,  0.14522678, ...,  0.15571426,
         1.16712836,  1.09255578],
       [-0.03341381, -1.50667377,  1.77301515, ...,  1.25983315,
         0.5598725 ,  0.60337839]])

**********************************************************************


### Let's apply the PCA to standardized data

In [5]:
pca = PCA(n_components=7)

# Start with PCA
pca.fit(Shat)
Qm = pca.transform(Shat)

# Recovering of Shat_tilde
Shat_tilde = pca.inverse_transform(Qm)

# Recovering of S_tilde
S_tilde = scaler_znorm.inverse_transform(Shat_tilde)

print('*********************** RECOVERED DATASET Shat ***********************')
display(Shat_tilde)
print('**********************************************************************')
print('*********************** RECOVERED DATASET S ***********************')
display(Shat)
print('*******************************************************************')

*********************** RECOVERED DATASET Shat ***********************


array([[ 0.58837274,  0.16164579, -0.02034769, ..., -0.05399437,
         0.00241911, -0.0496098 ],
       [ 0.08952442,  0.24194561,  0.19835543, ..., -0.04311201,
         0.1027376 , -0.1481024 ],
       [-0.24560123,  0.69939695,  0.04186872, ...,  0.25564159,
        -0.57438343,  0.04642639],
       ...,
       [-0.53294804, -0.1421036 , -0.18477067, ...,  0.18088698,
         0.1120286 , -0.03600711],
       [-0.18462964, -0.03666853, -0.09632127, ..., -0.08174284,
         0.46008021,  0.01640364],
       [ 0.70251408, -0.12067218,  0.5071987 , ..., -0.33795804,
         0.00198915,  0.45071665]])

**********************************************************************
*********************** RECOVERED DATASET S ***********************


array([[ 0.60586493,  0.64415913, -1.31648637, ...,  0.14313773,
        -0.37645577, -0.18553281],
       [ 0.81438942,  0.89518114, -1.06108445, ...,  1.37651938,
        -0.1401699 ,  0.6062819 ],
       [ 0.14835926,  1.30009898,  0.23660548, ...,  0.43884019,
         0.36325467,  0.70060522],
       ...,
       [-1.46404425, -0.35935111, -0.03808354, ..., -1.28027762,
        -0.66720125, -0.71312465],
       [-0.75319164,  0.17386889,  0.14522678, ...,  0.15571426,
         1.16712836,  1.09255578],
       [-0.03341381, -1.50667377,  1.77301515, ...,  1.25983315,
         0.5598725 ,  0.60337839]])

*******************************************************************


## Initialize $k$-Means

In [6]:
kmeans_default = KMeans()
kmeans_3c = KMeans(n_clusters=3, init='random', algorithm='full')

W0 = np.random.rand(3, 10)
kmeans_3cW0 = KMeans(n_clusters=3, init=W0, algorithm='full')

### And let's use one of them...

In [7]:
Snew = np.random.rand(N, n)

km = kmeans_3c  # change the KMeans object here if you want to try another one

# Start using the km object, fitting it on the data S
km.fit(S)

# Prediction of cluster belonging w.r.t. S
S_labels = km.labels_

# Prediction of cluster belonging w.r.t. Snew
Snew_labels = km.predict(Snew)

print(f'*********************** S labels ***********************')
display(S_labels[:10])
print('*********************************************************')
print('')
print(f'*********************** Snew labels ***********************')
display(Snew_labels[:10])
print('************************************************************')

*********************** S labels ***********************


array([1, 2, 1, 1, 1, 1, 2, 2, 1, 2], dtype=int32)

*********************************************************

*********************** Snew labels ***********************


array([1, 0, 0, 1, 1, 2, 0, 0, 2, 2], dtype=int32)

************************************************************


## Initialize Serieses

### Using Arrays

In [8]:
x = np.random.rand(10)
x

array([0.02832367, 0.02962356, 0.65625521, 0.35190408, 0.15745373,
       0.14302605, 0.65906786, 0.86213059, 0.96791573, 0.6271037 ])

In [9]:
s1 = pd.Series(x, index=[f'index{i}' for i in range(1,11)], name='my_series1')
s2 = pd.Series(x, name='my_series2')

In [10]:
s1 

index1     0.028324
index2     0.029624
index3     0.656255
index4     0.351904
index5     0.157454
index6     0.143026
index7     0.659068
index8     0.862131
index9     0.967916
index10    0.627104
Name: my_series1, dtype: float64

In [11]:
s2

0    0.028324
1    0.029624
2    0.656255
3    0.351904
4    0.157454
5    0.143026
6    0.659068
7    0.862131
8    0.967916
9    0.627104
Name: my_series2, dtype: float64

### Using Dictionaries

In [12]:
d = {'Age':30, 'Height':185, 'Weight':90}
d

{'Age': 30, 'Height': 185, 'Weight': 90}

In [13]:
s1d = pd.Series(d)

In [14]:
s1d

Age        30
Height    185
Weight     90
dtype: int64

## Initialize DataFrames

### Using Dictionaries

In [15]:
D = {'Float_random':np.random.rand(10), 'Integer_random':np.random.permutation(10)}
D

{'Float_random': array([0.78223019, 0.52686007, 0.85025065, 0.98668884, 0.02352652,
        0.81452545, 0.87772939, 0.73645107, 0.79337945, 0.3672442 ]),
 'Integer_random': array([6, 4, 9, 1, 5, 0, 3, 2, 7, 8])}

In [16]:
df1d = pd.DataFrame(D)
df1d

Unnamed: 0,Float_random,Integer_random
0,0.78223,6
1,0.52686,4
2,0.850251,9
3,0.986689,1
4,0.023527,5
5,0.814525,0
6,0.877729,3
7,0.736451,2
8,0.793379,7
9,0.367244,8


In [17]:
df1d.dtypes

Float_random      float64
Integer_random      int64
dtype: object

### Using Arrays

In [18]:
X = np.random.rand(10,5)
X

array([[0.14597851, 0.09045195, 0.5058304 , 0.05247141, 0.15437579],
       [0.84177635, 0.79980934, 0.67519508, 0.96845974, 0.58363265],
       [0.57264859, 0.18172515, 0.85280911, 0.78048991, 0.69369112],
       [0.3760878 , 0.10052236, 0.33150676, 0.81303607, 0.04413874],
       [0.1127637 , 0.39615579, 0.89685564, 0.44370458, 0.43898521],
       [0.90186196, 0.97418667, 0.91409096, 0.37177914, 0.05114453],
       [0.07066621, 0.9830648 , 0.55248426, 0.15238377, 0.9673845 ],
       [0.67495988, 0.05696272, 0.83345273, 0.22550469, 0.21454451],
       [0.99927346, 0.52365495, 0.31550193, 0.414654  , 0.33149865],
       [0.68426006, 0.55811697, 0.03125246, 0.01398424, 0.96576066]])

In [19]:
df1 = pd.DataFrame(X, index=range(1, X.shape[0] + 1), columns=[f'column_{i}' for i in range(1, X.shape[1] + 1)])
df1

Unnamed: 0,column_1,column_2,column_3,column_4,column_5
1,0.145979,0.090452,0.50583,0.052471,0.154376
2,0.841776,0.799809,0.675195,0.96846,0.583633
3,0.572649,0.181725,0.852809,0.78049,0.693691
4,0.376088,0.100522,0.331507,0.813036,0.044139
5,0.112764,0.396156,0.896856,0.443705,0.438985
6,0.901862,0.974187,0.914091,0.371779,0.051145
7,0.070666,0.983065,0.552484,0.152384,0.967384
8,0.67496,0.056963,0.833453,0.225505,0.214545
9,0.999273,0.523655,0.315502,0.414654,0.331499
10,0.68426,0.558117,0.031252,0.013984,0.965761


## Extract/Add Column

In [20]:
df1['column_2']

1     0.090452
2     0.799809
3     0.181725
4     0.100522
5     0.396156
6     0.974187
7     0.983065
8     0.056963
9     0.523655
10    0.558117
Name: column_2, dtype: float64

In [21]:
df1['column_6'] = np.random.rand(10)
df1

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,0.145979,0.090452,0.50583,0.052471,0.154376,0.531635
2,0.841776,0.799809,0.675195,0.96846,0.583633,0.99149
3,0.572649,0.181725,0.852809,0.78049,0.693691,0.468759
4,0.376088,0.100522,0.331507,0.813036,0.044139,0.925989
5,0.112764,0.396156,0.896856,0.443705,0.438985,0.981936
6,0.901862,0.974187,0.914091,0.371779,0.051145,0.335256
7,0.070666,0.983065,0.552484,0.152384,0.967384,0.678013
8,0.67496,0.056963,0.833453,0.225505,0.214545,0.620919
9,0.999273,0.523655,0.315502,0.414654,0.331499,0.653849
10,0.68426,0.558117,0.031252,0.013984,0.965761,0.359802


## DataFrame Attributes

We use the DataFrame df1 defined above.

In [22]:
df1.at[6, 'column_2']

0.9741866697242131

In [23]:
df1.iat[5, 1]

0.9741866697242131

In [24]:
df1.index

RangeIndex(start=1, stop=11, step=1)

In [25]:
df1.columns

Index(['column_1', 'column_2', 'column_3', 'column_4', 'column_5', 'column_6'], dtype='object')

In [26]:
df1.axes

[RangeIndex(start=1, stop=11, step=1),
 Index(['column_1', 'column_2', 'column_3', 'column_4', 'column_5', 'column_6'], dtype='object')]

In [27]:
df1.loc[[1,7,10], :]

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,0.145979,0.090452,0.50583,0.052471,0.154376,0.531635
7,0.070666,0.983065,0.552484,0.152384,0.967384,0.678013
10,0.68426,0.558117,0.031252,0.013984,0.965761,0.359802


In [28]:
df1.loc[[1,7,10], ['column_1', 'column_3']]

Unnamed: 0,column_1,column_3
1,0.145979,0.50583
7,0.070666,0.552484
10,0.68426,0.031252


In [29]:
df1.iloc[[0,6,9],[0,2]]

Unnamed: 0,column_1,column_3
1,0.145979,0.50583
7,0.070666,0.552484
10,0.68426,0.031252


In [30]:
df1.loc[(df1.index > 3) & (df1.index <= 7), df1.columns != 'column_3']

Unnamed: 0,column_1,column_2,column_4,column_5,column_6
4,0.376088,0.100522,0.813036,0.044139,0.925989
5,0.112764,0.396156,0.443705,0.438985,0.981936
6,0.901862,0.974187,0.371779,0.051145,0.335256
7,0.070666,0.983065,0.152384,0.967384,0.678013


In [31]:
df1.shape

(10, 6)

In [32]:
df1.ndim

2

In [33]:
df1.size

60

In [34]:
df1.values

array([[0.14597851, 0.09045195, 0.5058304 , 0.05247141, 0.15437579,
        0.5316346 ],
       [0.84177635, 0.79980934, 0.67519508, 0.96845974, 0.58363265,
        0.99148973],
       [0.57264859, 0.18172515, 0.85280911, 0.78048991, 0.69369112,
        0.46875892],
       [0.3760878 , 0.10052236, 0.33150676, 0.81303607, 0.04413874,
        0.92598933],
       [0.1127637 , 0.39615579, 0.89685564, 0.44370458, 0.43898521,
        0.98193589],
       [0.90186196, 0.97418667, 0.91409096, 0.37177914, 0.05114453,
        0.33525563],
       [0.07066621, 0.9830648 , 0.55248426, 0.15238377, 0.9673845 ,
        0.67801299],
       [0.67495988, 0.05696272, 0.83345273, 0.22550469, 0.21454451,
        0.62091897],
       [0.99927346, 0.52365495, 0.31550193, 0.414654  , 0.33149865,
        0.65384898],
       [0.68426006, 0.55811697, 0.03125246, 0.01398424, 0.96576066,
        0.35980155]])

## DataFrame Methods

We use the DataFrame df1 defined above.

### Exploration Methods

In [35]:
df1.head(3)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,0.145979,0.090452,0.50583,0.052471,0.154376,0.531635
2,0.841776,0.799809,0.675195,0.96846,0.583633,0.99149
3,0.572649,0.181725,0.852809,0.78049,0.693691,0.468759


In [36]:
df1.tail(2)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
9,0.999273,0.523655,0.315502,0.414654,0.331499,0.653849
10,0.68426,0.558117,0.031252,0.013984,0.965761,0.359802


In [37]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 1 to 10
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   column_1  10 non-null     float64
 1   column_2  10 non-null     float64
 2   column_3  10 non-null     float64
 3   column_4  10 non-null     float64
 4   column_5  10 non-null     float64
 5   column_6  10 non-null     float64
dtypes: float64(6)
memory usage: 612.0 bytes


In [38]:
df1.nunique()

column_1    10
column_2    10
column_3    10
column_4    10
column_5    10
column_6    10
dtype: int64

In [39]:
df1.nunique(axis=1)

1     6
2     6
3     6
4     6
5     6
6     6
7     6
8     6
9     6
10    6
dtype: int64

In [40]:
df1.isna()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False
10,False,False,False,False,False,False


In [41]:
df1.count()

column_1    10
column_2    10
column_3    10
column_4    10
column_5    10
column_6    10
dtype: int64

In [42]:
df1.value_counts()

column_1  column_2  column_3  column_4  column_5  column_6
0.070666  0.983065  0.552484  0.152384  0.967384  0.678013    1
0.112764  0.396156  0.896856  0.443705  0.438985  0.981936    1
0.145979  0.090452  0.505830  0.052471  0.154376  0.531635    1
0.376088  0.100522  0.331507  0.813036  0.044139  0.925989    1
0.572649  0.181725  0.852809  0.780490  0.693691  0.468759    1
0.674960  0.056963  0.833453  0.225505  0.214545  0.620919    1
0.684260  0.558117  0.031252  0.013984  0.965761  0.359802    1
0.841776  0.799809  0.675195  0.968460  0.583633  0.991490    1
0.901862  0.974187  0.914091  0.371779  0.051145  0.335256    1
0.999273  0.523655  0.315502  0.414654  0.331499  0.653849    1
dtype: int64

In [43]:
df1.value_counts(normalize=True)

column_1  column_2  column_3  column_4  column_5  column_6
0.070666  0.983065  0.552484  0.152384  0.967384  0.678013    0.1
0.112764  0.396156  0.896856  0.443705  0.438985  0.981936    0.1
0.145979  0.090452  0.505830  0.052471  0.154376  0.531635    0.1
0.376088  0.100522  0.331507  0.813036  0.044139  0.925989    0.1
0.572649  0.181725  0.852809  0.780490  0.693691  0.468759    0.1
0.674960  0.056963  0.833453  0.225505  0.214545  0.620919    0.1
0.684260  0.558117  0.031252  0.013984  0.965761  0.359802    0.1
0.841776  0.799809  0.675195  0.968460  0.583633  0.991490    0.1
0.901862  0.974187  0.914091  0.371779  0.051145  0.335256    0.1
0.999273  0.523655  0.315502  0.414654  0.331499  0.653849    0.1
dtype: float64

### Analisi Statistica di Base e Operazioni su Valori

In [44]:
df1.describe()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.538028,0.466465,0.590898,0.423647,0.444516,0.654765
std,0.342851,0.361549,0.297703,0.332899,0.348241,0.243704
min,0.070666,0.056963,0.031252,0.013984,0.044139,0.335256
25%,0.203506,0.120823,0.375088,0.170664,0.169418,0.484478
50%,0.623804,0.459905,0.61384,0.393217,0.385242,0.637384
75%,0.802397,0.739386,0.84797,0.696294,0.666177,0.863995
max,0.999273,0.983065,0.914091,0.96846,0.967384,0.99149


In [45]:
df1.describe(percentiles=[0.13, 0.87, 0.99])

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.538028,0.466465,0.590898,0.423647,0.444516,0.654765
std,0.342851,0.361549,0.297703,0.332899,0.348241,0.243704
min,0.070666,0.056963,0.031252,0.013984,0.044139,0.335256
13%,0.11841,0.092164,0.318223,0.069457,0.068694,0.378324
50%,0.623804,0.459905,0.61384,0.393217,0.385242,0.637384
87%,0.891647,0.944543,0.889368,0.807503,0.919509,0.972425
99%,0.990506,0.982266,0.91254,0.954472,0.967238,0.99063
max,0.999273,0.983065,0.914091,0.96846,0.967384,0.99149


In [46]:
df1.mean()

column_1    0.538028
column_2    0.466465
column_3    0.590898
column_4    0.423647
column_5    0.444516
column_6    0.654765
dtype: float64

In [47]:
df1.mean(axis=1)

1     0.246790
2     0.810060
3     0.591687
4     0.431880
5     0.545067
6     0.591386
7     0.567333
8     0.437724
9     0.539739
10    0.435529
dtype: float64

In [48]:
df1.corr()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
column_1,1.0,0.228543,-0.065525,0.26138,-0.151752,-0.284908
column_2,0.228543,1.0,0.037988,-0.044058,0.385912,-0.093552
column_3,-0.065525,0.037988,1.0,0.284137,-0.278395,0.11157
column_4,0.26138,-0.044058,0.284137,1.0,-0.164847,0.565316
column_5,-0.151752,0.385912,-0.278395,-0.164847,1.0,-0.107619
column_6,-0.284908,-0.093552,0.11157,0.565316,-0.107619,1.0


In [49]:
df1.cov()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
column_1,0.117547,0.02833,-0.006688,0.029833,-0.018118,-0.023805
column_2,0.02833,0.130718,0.004089,-0.005303,0.048589,-0.008243
column_3,-0.006688,0.004089,0.088627,0.028159,-0.028862,0.008095
column_4,0.029833,-0.005303,0.028159,0.110822,-0.01911,0.045863
column_5,-0.018118,0.048589,-0.028862,-0.01911,0.121272,-0.009133
column_6,-0.023805,-0.008243,0.008095,0.045863,-0.009133,0.059391


In [50]:
df1.sample(3, random_state=10)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
9,0.999273,0.523655,0.315502,0.414654,0.331499,0.653849
3,0.572649,0.181725,0.852809,0.78049,0.693691,0.468759
6,0.901862,0.974187,0.914091,0.371779,0.051145,0.335256


### Transformation Methods

In [51]:
df1_copy = df1.copy()
df1_fakecopy = df1

In [52]:
df1_fakecopy.at[1, 'column_1'] = 10

In [53]:
df1_copy.at[1, 'column_1'] = np.nan

In [54]:
df1_copy

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,,0.090452,0.50583,0.052471,0.154376,0.531635
2,0.841776,0.799809,0.675195,0.96846,0.583633,0.99149
3,0.572649,0.181725,0.852809,0.78049,0.693691,0.468759
4,0.376088,0.100522,0.331507,0.813036,0.044139,0.925989
5,0.112764,0.396156,0.896856,0.443705,0.438985,0.981936
6,0.901862,0.974187,0.914091,0.371779,0.051145,0.335256
7,0.070666,0.983065,0.552484,0.152384,0.967384,0.678013
8,0.67496,0.056963,0.833453,0.225505,0.214545,0.620919
9,0.999273,0.523655,0.315502,0.414654,0.331499,0.653849
10,0.68426,0.558117,0.031252,0.013984,0.965761,0.359802


In [55]:
df1

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,10.0,0.090452,0.50583,0.052471,0.154376,0.531635
2,0.841776,0.799809,0.675195,0.96846,0.583633,0.99149
3,0.572649,0.181725,0.852809,0.78049,0.693691,0.468759
4,0.376088,0.100522,0.331507,0.813036,0.044139,0.925989
5,0.112764,0.396156,0.896856,0.443705,0.438985,0.981936
6,0.901862,0.974187,0.914091,0.371779,0.051145,0.335256
7,0.070666,0.983065,0.552484,0.152384,0.967384,0.678013
8,0.67496,0.056963,0.833453,0.225505,0.214545,0.620919
9,0.999273,0.523655,0.315502,0.414654,0.331499,0.653849
10,0.68426,0.558117,0.031252,0.013984,0.965761,0.359802


(Notare che la modifica fatta a df1_fakecopy ha modificato anche df1!)

In [56]:
df1.append(df1_copy)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,10.0,0.090452,0.50583,0.052471,0.154376,0.531635
2,0.841776,0.799809,0.675195,0.96846,0.583633,0.99149
3,0.572649,0.181725,0.852809,0.78049,0.693691,0.468759
4,0.376088,0.100522,0.331507,0.813036,0.044139,0.925989
5,0.112764,0.396156,0.896856,0.443705,0.438985,0.981936
6,0.901862,0.974187,0.914091,0.371779,0.051145,0.335256
7,0.070666,0.983065,0.552484,0.152384,0.967384,0.678013
8,0.67496,0.056963,0.833453,0.225505,0.214545,0.620919
9,0.999273,0.523655,0.315502,0.414654,0.331499,0.653849
10,0.68426,0.558117,0.031252,0.013984,0.965761,0.359802


In [57]:
df1.drop([1, 3], axis=0)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
2,0.841776,0.799809,0.675195,0.96846,0.583633,0.99149
4,0.376088,0.100522,0.331507,0.813036,0.044139,0.925989
5,0.112764,0.396156,0.896856,0.443705,0.438985,0.981936
6,0.901862,0.974187,0.914091,0.371779,0.051145,0.335256
7,0.070666,0.983065,0.552484,0.152384,0.967384,0.678013
8,0.67496,0.056963,0.833453,0.225505,0.214545,0.620919
9,0.999273,0.523655,0.315502,0.414654,0.331499,0.653849
10,0.68426,0.558117,0.031252,0.013984,0.965761,0.359802


In [58]:
df1_copy.dropna()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
2,0.841776,0.799809,0.675195,0.96846,0.583633,0.99149
3,0.572649,0.181725,0.852809,0.78049,0.693691,0.468759
4,0.376088,0.100522,0.331507,0.813036,0.044139,0.925989
5,0.112764,0.396156,0.896856,0.443705,0.438985,0.981936
6,0.901862,0.974187,0.914091,0.371779,0.051145,0.335256
7,0.070666,0.983065,0.552484,0.152384,0.967384,0.678013
8,0.67496,0.056963,0.833453,0.225505,0.214545,0.620919
9,0.999273,0.523655,0.315502,0.414654,0.331499,0.653849
10,0.68426,0.558117,0.031252,0.013984,0.965761,0.359802


In [59]:
df1_copy.fillna(1000)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,1000.0,0.090452,0.50583,0.052471,0.154376,0.531635
2,0.841776,0.799809,0.675195,0.96846,0.583633,0.99149
3,0.572649,0.181725,0.852809,0.78049,0.693691,0.468759
4,0.376088,0.100522,0.331507,0.813036,0.044139,0.925989
5,0.112764,0.396156,0.896856,0.443705,0.438985,0.981936
6,0.901862,0.974187,0.914091,0.371779,0.051145,0.335256
7,0.070666,0.983065,0.552484,0.152384,0.967384,0.678013
8,0.67496,0.056963,0.833453,0.225505,0.214545,0.620919
9,0.999273,0.523655,0.315502,0.414654,0.331499,0.653849
10,0.68426,0.558117,0.031252,0.013984,0.965761,0.359802


In [60]:
df1_copy.rename(index={1:'nuovo_index'}, columns={'column_1':'COLONNA_1'})

Unnamed: 0,COLONNA_1,column_2,column_3,column_4,column_5,column_6
nuovo_index,,0.090452,0.50583,0.052471,0.154376,0.531635
2,0.841776,0.799809,0.675195,0.96846,0.583633,0.99149
3,0.572649,0.181725,0.852809,0.78049,0.693691,0.468759
4,0.376088,0.100522,0.331507,0.813036,0.044139,0.925989
5,0.112764,0.396156,0.896856,0.443705,0.438985,0.981936
6,0.901862,0.974187,0.914091,0.371779,0.051145,0.335256
7,0.070666,0.983065,0.552484,0.152384,0.967384,0.678013
8,0.67496,0.056963,0.833453,0.225505,0.214545,0.620919
9,0.999273,0.523655,0.315502,0.414654,0.331499,0.653849
10,0.68426,0.558117,0.031252,0.013984,0.965761,0.359802


In [61]:
df1_copy.reset_index()

Unnamed: 0,index,column_1,column_2,column_3,column_4,column_5,column_6
0,1,,0.090452,0.50583,0.052471,0.154376,0.531635
1,2,0.841776,0.799809,0.675195,0.96846,0.583633,0.99149
2,3,0.572649,0.181725,0.852809,0.78049,0.693691,0.468759
3,4,0.376088,0.100522,0.331507,0.813036,0.044139,0.925989
4,5,0.112764,0.396156,0.896856,0.443705,0.438985,0.981936
5,6,0.901862,0.974187,0.914091,0.371779,0.051145,0.335256
6,7,0.070666,0.983065,0.552484,0.152384,0.967384,0.678013
7,8,0.67496,0.056963,0.833453,0.225505,0.214545,0.620919
8,9,0.999273,0.523655,0.315502,0.414654,0.331499,0.653849
9,10,0.68426,0.558117,0.031252,0.013984,0.965761,0.359802


In [62]:
df1_copy.sort_values('column_1')

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
7,0.070666,0.983065,0.552484,0.152384,0.967384,0.678013
5,0.112764,0.396156,0.896856,0.443705,0.438985,0.981936
4,0.376088,0.100522,0.331507,0.813036,0.044139,0.925989
3,0.572649,0.181725,0.852809,0.78049,0.693691,0.468759
8,0.67496,0.056963,0.833453,0.225505,0.214545,0.620919
10,0.68426,0.558117,0.031252,0.013984,0.965761,0.359802
2,0.841776,0.799809,0.675195,0.96846,0.583633,0.99149
6,0.901862,0.974187,0.914091,0.371779,0.051145,0.335256
9,0.999273,0.523655,0.315502,0.414654,0.331499,0.653849
1,,0.090452,0.50583,0.052471,0.154376,0.531635


(Notare il NaN messo in fondo; per maggiori informazioni, guardare la documentazione ufficiale)

### Exportation Methods

In [63]:
df1.to_csv('df1.csv', columns=['column_1', 'column_5', 'column_2'], index_label='ID')

In [64]:
df1.to_pickle('df1.pkl')

## Loading a DataFrame

In [65]:
pd.read_csv('df1.csv')

Unnamed: 0,ID,column_1,column_5,column_2
0,1,10.0,0.154376,0.090452
1,2,0.841776,0.583633,0.799809
2,3,0.572649,0.693691,0.181725
3,4,0.376088,0.044139,0.100522
4,5,0.112764,0.438985,0.396156
5,6,0.901862,0.051145,0.974187
6,7,0.070666,0.967384,0.983065
7,8,0.67496,0.214545,0.056963
8,9,0.999273,0.331499,0.523655
9,10,0.68426,0.965761,0.558117


In [66]:
pd.read_csv('df1.csv', usecols=['ID', 'column_1', 'column_2'], index_col='ID')

Unnamed: 0_level_0,column_1,column_2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10.0,0.090452
2,0.841776,0.799809
3,0.572649,0.181725
4,0.376088,0.100522
5,0.112764,0.396156
6,0.901862,0.974187
7,0.070666,0.983065
8,0.67496,0.056963
9,0.999273,0.523655
10,0.68426,0.558117


In [67]:
pd.read_pickle('df1.pkl')

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,10.0,0.090452,0.50583,0.052471,0.154376,0.531635
2,0.841776,0.799809,0.675195,0.96846,0.583633,0.99149
3,0.572649,0.181725,0.852809,0.78049,0.693691,0.468759
4,0.376088,0.100522,0.331507,0.813036,0.044139,0.925989
5,0.112764,0.396156,0.896856,0.443705,0.438985,0.981936
6,0.901862,0.974187,0.914091,0.371779,0.051145,0.335256
7,0.070666,0.983065,0.552484,0.152384,0.967384,0.678013
8,0.67496,0.056963,0.833453,0.225505,0.214545,0.620919
9,0.999273,0.523655,0.315502,0.414654,0.331499,0.653849
10,0.68426,0.558117,0.031252,0.013984,0.965761,0.359802


## Concatenation of DataFrames

In [68]:
pd.concat([df1, df1.reset_index()], axis=1)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6,index,column_1.1,column_2.1,column_3.1,column_4.1,column_5.1,column_6.1
0,,,,,,,1.0,10.0,0.090452,0.50583,0.052471,0.154376,0.531635
1,10.0,0.090452,0.50583,0.052471,0.154376,0.531635,2.0,0.841776,0.799809,0.675195,0.96846,0.583633,0.99149
2,0.841776,0.799809,0.675195,0.96846,0.583633,0.99149,3.0,0.572649,0.181725,0.852809,0.78049,0.693691,0.468759
3,0.572649,0.181725,0.852809,0.78049,0.693691,0.468759,4.0,0.376088,0.100522,0.331507,0.813036,0.044139,0.925989
4,0.376088,0.100522,0.331507,0.813036,0.044139,0.925989,5.0,0.112764,0.396156,0.896856,0.443705,0.438985,0.981936
5,0.112764,0.396156,0.896856,0.443705,0.438985,0.981936,6.0,0.901862,0.974187,0.914091,0.371779,0.051145,0.335256
6,0.901862,0.974187,0.914091,0.371779,0.051145,0.335256,7.0,0.070666,0.983065,0.552484,0.152384,0.967384,0.678013
7,0.070666,0.983065,0.552484,0.152384,0.967384,0.678013,8.0,0.67496,0.056963,0.833453,0.225505,0.214545,0.620919
8,0.67496,0.056963,0.833453,0.225505,0.214545,0.620919,9.0,0.999273,0.523655,0.315502,0.414654,0.331499,0.653849
9,0.999273,0.523655,0.315502,0.414654,0.331499,0.653849,10.0,0.68426,0.558117,0.031252,0.013984,0.965761,0.359802
