# Pandas

**pandas** is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.

## SERIES
Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.). The axis labels are collectively referred to as the index.

In [165]:
import pandas as pd

In [166]:
simple_list = ["one", "two", "three"]

In [167]:
pd.Series(simple_list)
pd.Series(data=simple_list)

0      one
1      two
2    three
dtype: object

In [168]:
simple_list_integers = [100,200,300]
pd.Series(simple_list_integers)

0    100
1    200
2    300
dtype: int64

In [169]:
simple_list_of_mix_values = [1, True, "Hello", None, [0, 9, 8], {"name": "Pandas"}]

In [170]:
pd.Series(simple_list_of_mix_values)

0                       1
1                    True
2                   Hello
3                    None
4               [0, 9, 8]
5    {u'name': u'Pandas'}
dtype: object

### Create Series From Python Dictionary

In [171]:
# NOTE: "age" is listed twice. A dict keeps a single entry per key, so only
# the last value (22) survives, as the echoed dict in the output confirms.
simple_dict = {"first_name": "Steve", 
               "last_name": "Flurry",
               "age": 17,
               "age": 22}
simple_dict

{'age': 22, 'first_name': 'Steve', 'last_name': 'Flurry'}

In [172]:
pd.Series(simple_dict)

age               22
first_name     Steve
last_name     Flurry
dtype: object

### Series Attributes

In [173]:
#print simple_list
s = pd.Series(simple_list)
# Only the last expression of a cell is displayed: head(1) is evaluated but
# its result is discarded, so the output is the one produced by tail().
s.head(1)
# The Series holds just three elements, so tail() shows all of them.
s.tail()

0      one
1      two
2    three
dtype: object

In [174]:
s

0      one
1      two
2    three
dtype: object

In [175]:
# List every attribute and method available on the Series.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(dir(s))

['T', '_AXIS_ALIASES', '_AXIS_IALIASES', '_AXIS_LEN', '_AXIS_NAMES', '_AXIS_NUMBERS', '_AXIS_ORDERS', '_AXIS_REVERSED', '_AXIS_SLICEMAP', '__abs__', '__add__', '__and__', '__array__', '__array_prepare__', '__array_priority__', '__array_wrap__', '__bool__', '__bytes__', '__class__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__div__', '__divmod__', '__doc__', '__eq__', '__finalize__', '__float__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__idiv__', '__imul__', '__init__', '__int__', '__invert__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__le__', '__len__', '__long__', '__lt__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pow__', '__radd__', '__rand__', '__rdiv__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmod__', '__rmul__', '__ror__', '__round__', '

In [176]:
s.values

array(['one', 'two', 'three'], dtype=object)

In [177]:
s.index

RangeIndex(start=0, stop=3, step=1)

In [178]:
s.dtype

dtype('O')

Most common Data Types:
    
    dtype('O') --> Object
    dtype('int64') --> Integer
    dtype('float64') --> Float
    dtype('bool') --> Bool
    

### Series Methods

In [179]:
s = pd.Series([1.21, 1.90, 1.55])
s

0    1.21
1    1.90
2    1.55
dtype: float64

In [180]:
s.sum()

4.6600000000000001

In [181]:
s.product()

3.56345

In [182]:
s.mean()

1.5533333333333335

Create new Series giving a custom **INDEX** list.

In [183]:
colors_list = ["red", "blue", "green", "yellow"]
numbers = ["one", "two", "three", "four"]

# pd.Series(colors_list, numbers) 
# pd.Series(colors_list, index=numbers) 
pd.Series(data=colors_list, index=numbers)

one         red
two        blue
three     green
four     yellow
dtype: object

**INDEX** labels do not have to be unique: the same label can be repeated for several elements.

In [184]:
colors_list = ["red", "blue", "green", "yellow"]
numbers = ["one", "one", "one", "one"]

# pd.Series(colors_list, numbers) 
# pd.Series(colors_list, index=numbers) 
pd.Series(data=colors_list, index=numbers)

one       red
one      blue
one     green
one    yellow
dtype: object

### Series - Other Attributes

In [185]:
s = pd.Series(colors_list)
s

0       red
1      blue
2     green
3    yellow
dtype: object

In [186]:
s.is_unique

True

In [187]:
pd.Series([1,1]).is_unique
#pd.Series([1]).is_unique

False

In [188]:
# Series Dimension
s.shape

(4,)

In [189]:
# A Series created without a name reports None.
print(s.name)
# Rebuild the Series, this time giving it an explicit name.
s = pd.Series(colors_list, name="Hello")
print(s.name)
# The name is also shown in the footer of the printed Series.
print(s)

None
Hello
0       red
1      blue
2     green
3    yellow
Name: Hello, dtype: object


### sort_values

Order by Values

In [190]:
s.sort_values()

1      blue
2     green
0       red
3    yellow
Name: Hello, dtype: object

In [191]:
# sort_values() returns a new, sorted Series ...
print(s.sort_values(ascending=False))
# ... and leaves the original Series untouched.
print(s)

3    yellow
0       red
2     green
1      blue
Name: Hello, dtype: object
0       red
1      blue
2     green
3    yellow
Name: Hello, dtype: object


### inplace

Pass `inplace=True` to modify the original object in place instead of getting a new one back; the method then returns `None`.

In [192]:
# With inplace=True the call returns None (that is what gets printed here) ...
print(s.sort_values(ascending=False, inplace=True))
# ... and the Series itself is now sorted in descending order.
print(s)

None
3    yellow
0       red
2     green
1      blue
Name: Hello, dtype: object


### sort_index

Order by Index

In [193]:
s.sort_index()

0       red
1      blue
2     green
3    yellow
Name: Hello, dtype: object

In [194]:
s.sort_index(ascending=False)

3    yellow
2     green
1      blue
0       red
Name: Hello, dtype: object

In [195]:
s.sort_index(inplace=True)
s

0       red
1      blue
2     green
3    yellow
Name: Hello, dtype: object

### IN Keyword

In [196]:
s = pd.Series(colors_list)
'red' in s.values

True

In [197]:
0 in s
0 in s.index

True

### Get Values By Index

In [198]:
s[0]

'red'

In [199]:
# get() returns the value stored at the given index label ...
print(s.get(0))
# ... or the supplied default when the label does not exist.
s.get(23424234, default='NON ESISTE')

red


'NON ESISTE'

In [200]:
# Series.get_value() was deprecated and later removed from pandas;
# the .at accessor is the supported way to read a single value by label.
s.at[0]

'red'

In [201]:
# Get by multiple indexes
s[ [0,1,2] ]

0      red
1     blue
2    green
dtype: object

In [202]:
s[[0,2]]

0      red
2    green
dtype: object

In [203]:
s[0:3]

0      red
1     blue
2    green
dtype: object

In [204]:
s.get([0,2])

0      red
2    green
dtype: object

In [205]:
# get() on a missing label returns None instead of raising,
# unlike the bracket lookup shown in the comment below.
print(s.get(199))
# s[199]

None


In [206]:
s.get(199, default="Not Found")

'Not Found'

In [207]:
# get() with a list of labels: in the recorded output the missing label 100
# comes back as NaN and the default string is not used.
ciao = s.get([1,100], default="You can't see me")
print(ciao)
# The original Series is unchanged by the lookup.
s

1      blue
100     NaN
dtype: object


0       red
1      blue
2     green
3    yellow
dtype: object

### Math Methods

In [208]:
s = pd.Series([1.21, 1.90, 1.55, 1.98, 4.4, 8.54, 1.21])
s

0    1.21
1    1.90
2    1.55
3    1.98
4    4.40
5    8.54
6    1.21
dtype: float64

In [209]:
s.count()

7

In [210]:
len(s)

7

In [211]:
# Basic aggregations, one per line.
print(s.sum())
print(s.mean())
print(s.product())
print(s.max())
print(s.median())
# mode() returns a Series because several values can be tied.
print(s.mode())
print('---')
# describe() bundles count, mean, std, min, quartiles and max in one call.
print(s.describe())

20.79
2.97
320.798092452
8.54
1.9
0    1.21
dtype: float64
---
count    7.000000
mean     2.970000
std      2.689684
min      1.210000
25%      1.380000
50%      1.900000
75%      3.190000
max      8.540000
dtype: float64


### idxmax & idxmin

In [212]:
# Get value at index MAX
print(s)
# idxmax() returns the index label of the largest value.
index = s.idxmax()
# A single formatted string prints "label value" on both Python 2 and Python 3.
print("{} {}".format(index, s.get(index)))

0    1.21
1    1.90
2    1.55
3    1.98
4    4.40
5    8.54
6    1.21
dtype: float64
5 8.54


In [213]:
# Get value at index MIN
# idxmin() returns the index label of the smallest value; the recorded output
# shows label 0 even though 1.21 also appears at label 6.
index = s.idxmin()
# A single formatted string prints "label value" on both Python 2 and Python 3.
print("{} {}".format(index, s.get(index)))

0 1.21


### value_counts

In [214]:
s = pd.Series(data=["Alex", "Pippo", "Vale", "Alex", "Hello"])

In [215]:
# Get occurrences for each value
s.value_counts()

Alex     2
Vale     1
Hello    1
Pippo    1
dtype: int64

In [216]:
type(s.value_counts().sum())

numpy.int64

In [217]:
s.count() == s.value_counts().sum()

True

In [218]:
s.value_counts(ascending=True)

Pippo    1
Hello    1
Vale     1
Alex     2
dtype: int64

### Apply

Apply a custom function to each element of the Series.

In [219]:
s = pd.Series([1.21, 1.90, 1.55, 1.98, 4.4, 8.54, 1.21])
s

0    1.21
1    1.90
2    1.55
3    1.98
4    4.40
5    8.54
6    1.21
dtype: float64

In [220]:
def get_something(number):
    """Return 'OOOOH!!' for numbers below 4 and 'AAAAAHHHH!' for anything else."""
    return 'OOOOH!!' if number < 4 else 'AAAAAHHHH!'

In [221]:
s.apply(get_something)

0       OOOOH!!
1       OOOOH!!
2       OOOOH!!
3       OOOOH!!
4    AAAAAHHHH!
5    AAAAAHHHH!
6       OOOOH!!
dtype: object

In [222]:
s.apply(lambda x: "{}$".format(x))

0    1.21$
1     1.9$
2    1.55$
3    1.98$
4     4.4$
5    8.54$
6    1.21$
dtype: object

### map

Map each element to a new value by looking it up in another Series (matched by index label) or in a dict.

In [223]:
s = pd.Series(data=["Alex", "Pippo", "Vale", "Hello"])
s

0     Alex
1    Pippo
2     Vale
3    Hello
dtype: object

In [224]:
s2 = pd.Series(data=["Comu", "Pippini", "Gela", "World"], 
               index=["Alex", "Pippo", "Vale", "Hello"])
s2

Alex        Comu
Pippo    Pippini
Vale        Gela
Hello      World
dtype: object

In [225]:
s.map(s2)

0       Comu
1    Pippini
2       Gela
3      World
dtype: object

In [226]:
s2_dict = s2.to_dict()

In [227]:
s2_dict

{'Alex': 'Comu', 'Hello': 'World', 'Pippo': 'Pippini', 'Vale': 'Gela'}

In [228]:
s.map(s2_dict)

0       Comu
1    Pippini
2       Gela
3      World
dtype: object

## Load data from CSV

In [229]:
# Load a single column and turn the resulting one-column DataFrame into a
# Series. The `squeeze=True` keyword of read_csv was deprecated and later
# removed; calling .squeeze("columns") on the result is the replacement.
s = pd.read_csv('dataset/speedtest.csv',
                usecols=['CLIENT_CITY']).squeeze("columns")

In [230]:
# Confirm that the loaded object is a Series and not a DataFrame.
print(type(s))

<class 'pandas.core.series.Series'>


In [231]:
s

0          Bardonecchia
1                 Turin
2          Bardonecchia
3          Bardonecchia
4          Bardonecchia
5          Bardonecchia
6          Bardonecchia
7                 Turin
8              Aramengo
9                 Turin
10                Turin
11                Turin
12                Turin
13                Turin
14              Imperia
15                Turin
16              Imperia
17               Almese
18               Almese
19                Turin
20              Bibiana
21         Bardonecchia
22         Bardonecchia
23                Turin
24                 Asti
25                Turin
26                Turin
27                Turin
28                 Asti
29                Turin
              ...      
199114            Turin
199115            Turin
199116    Bene Vagienna
199117            Turin
199118            Turin
199119            Turin
199120            Turin
199121            Turin
199122            Turin
199123            Turin
199124          

In [232]:
s.index

RangeIndex(start=0, stop=199144, step=1)

In [233]:
s.tail()

199139    Pettenasco
199140      Florence
199141         Turin
199142         Turin
199143         Turin
Name: CLIENT_CITY, dtype: object

In [234]:
list(s)

['Bardonecchia',
 'Turin',
 'Bardonecchia',
 'Bardonecchia',
 'Bardonecchia',
 'Bardonecchia',
 'Bardonecchia',
 'Turin',
 'Aramengo',
 'Turin',
 'Turin',
 'Turin',
 'Turin',
 'Turin',
 'Imperia',
 'Turin',
 'Imperia',
 'Almese',
 'Almese',
 'Turin',
 'Bibiana',
 'Bardonecchia',
 'Bardonecchia',
 'Turin',
 'Asti',
 'Turin',
 'Turin',
 'Turin',
 'Asti',
 'Turin',
 'Turin',
 'Turin',
 'Turin',
 'Varese',
 'Aosta',
 'Asti',
 nan,
 'Alessandria',
 'Turin',
 'Turin',
 'Giarole',
 'Turin',
 'Turin',
 'Turin',
 'Turin',
 'Turin',
 'Casale Monferrato',
 'Casale Monferrato',
 'Turin',
 'Turin',
 'Rubiana',
 'Rubiana',
 'Turin',
 'Turin',
 nan,
 'Turin',
 'Turin',
 'Turin',
 'Asti',
 'Asti',
 'Turin',
 'Casale Monferrato',
 'Airasca',
 'Casale Monferrato',
 nan,
 'Casale Monferrato',
 'Turin',
 'Casale Monferrato',
 'Turin',
 'Valpelline',
 'Casale Monferrato',
 'Turin',
 'Turin',
 'Turin',
 'Airasca',
 'Pessinetto',
 'Turin',
 'Alessandria',
 'Alessandria',
 'Pessinetto',
 'Turin',
 nan,
 'Turi

In [235]:
dict(s)

{0: 'Bardonecchia',
 1: 'Turin',
 2: 'Bardonecchia',
 3: 'Bardonecchia',
 4: 'Bardonecchia',
 5: 'Bardonecchia',
 6: 'Bardonecchia',
 7: 'Turin',
 8: 'Aramengo',
 9: 'Turin',
 10: 'Turin',
 11: 'Turin',
 12: 'Turin',
 13: 'Turin',
 14: 'Imperia',
 15: 'Turin',
 16: 'Imperia',
 17: 'Almese',
 18: 'Almese',
 19: 'Turin',
 20: 'Bibiana',
 21: 'Bardonecchia',
 22: 'Bardonecchia',
 23: 'Turin',
 24: 'Asti',
 25: 'Turin',
 26: 'Turin',
 27: 'Turin',
 28: 'Asti',
 29: 'Turin',
 30: 'Turin',
 31: 'Turin',
 32: 'Turin',
 33: 'Varese',
 34: 'Aosta',
 35: 'Asti',
 36: nan,
 37: 'Alessandria',
 38: 'Turin',
 39: 'Turin',
 40: 'Giarole',
 41: 'Turin',
 42: 'Turin',
 43: 'Turin',
 44: 'Turin',
 45: 'Turin',
 46: 'Casale Monferrato',
 47: 'Casale Monferrato',
 48: 'Turin',
 49: 'Turin',
 50: 'Rubiana',
 51: 'Rubiana',
 52: 'Turin',
 53: 'Turin',
 54: nan,
 55: 'Turin',
 56: 'Turin',
 57: 'Turin',
 58: 'Asti',
 59: 'Asti',
 60: 'Turin',
 61: 'Casale Monferrato',
 62: 'Airasca',
 63: 'Casale Monferrato

In [236]:
s.value_counts()

Turin                       121227
Rubiana                       3115
Rivoli                        2752
Almese                        2497
Asti                          2398
Novara                        2099
Milan                         1699
Cuneo                         1666
Grugliasco                    1285
Alessandria                   1260
Genova                        1225
Arignano                      1065
Collegno                      1017
Nichelino                      972
Bruino                         918
Cocconato                      887
Moncalieri                     865
Caluso                         846
Andezeno                       787
Biella                         779
Marentino                      736
Rome                           722
Fossano                        681
Bardonecchia                   668
Alba                           599
Riva Presso Chieri             564
Avigliana                      558
Vercelli                       547
Aramengo            

In [237]:
sorted([1,5,2,6,])
sorted(pd.Series([1,5,2,6]))

[1, 2, 5, 6]

In [238]:
# The Series still contains missing values (NaN), so a plain sorted() is not
# useful here: the recorded output starts with a long run of nan entries.
# Drop or fill the missing values first to get an ordering of the city names.
sorted(s)

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan

In [239]:
#s.fillna('AAAA', inplace=True)
s.dropna(inplace=True)
s

0          Bardonecchia
1                 Turin
2          Bardonecchia
3          Bardonecchia
4          Bardonecchia
5          Bardonecchia
6          Bardonecchia
7                 Turin
8              Aramengo
9                 Turin
10                Turin
11                Turin
12                Turin
13                Turin
14              Imperia
15                Turin
16              Imperia
17               Almese
18               Almese
19                Turin
20              Bibiana
21         Bardonecchia
22         Bardonecchia
23                Turin
24                 Asti
25                Turin
26                Turin
27                Turin
28                 Asti
29                Turin
              ...      
199112            Turin
199113            Turin
199114            Turin
199115            Turin
199116    Bene Vagienna
199117            Turin
199118            Turin
199119            Turin
199120            Turin
199121            Turin
199122          

In [240]:
sorted(s)

['Abakan',
 'Abano Terme',
 'Abano Terme',
 'Abbadia Lariana',
 'Abbadia San Salvatore',
 'Abbiategrasso',
 'Abbiategrasso',
 'Abbiategrasso',
 'Abetone',
 'Acerra',
 'Acerra',
 'Acerra',
 'Acerra',
 'Acerra',
 'Acquafredda',
 'Acqualagna',
 'Acquanegra Sul Chiese',
 'Acquanegra Sul Chiese',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'Acqui Terme',
 'A

In [241]:
# Use the NAME column as the index and collapse the remaining single column
# into a Series. The `squeeze=True` keyword of read_csv was deprecated and
# later removed; .squeeze("columns") on the result is the replacement.
s = pd.read_csv('dataset/users.csv',
                index_col='NAME').squeeze("columns")

In [242]:
s.head()

NAME
Abbondanzio    1
Abdone         2
Abele          3
Abibo          4
Abramo         5
Name: ID, dtype: int64

In [243]:
s['Abramo']

5

In [244]:
s['Abbondanzio':'Abele']

NAME
Abbondanzio    1
Abdone         2
Abele          3
Name: ID, dtype: int64

In [245]:
# Bracket indexing with a list that contains a missing label raises KeyError
# on current pandas; reindex() returns NaN for labels that are not found.
s.reindex(['Carlo', 'Not Found'])

NAME
Carlo        165.0
Not Found      NaN
Name: ID, dtype: float64

In [246]:
# Use the ID column as the index and collapse the remaining single column
# into a Series. The `squeeze=True` keyword of read_csv was deprecated and
# later removed; .squeeze("columns") on the result is the replacement.
s = pd.read_csv('dataset/users.csv',
                index_col='ID').squeeze("columns")

In [247]:
s.head()

ID
1    Abbondanzio
2         Abdone
3          Abele
4          Abibo
5         Abramo
Name: NAME, dtype: object

In [248]:
# An integer slice selects by position, not by ID label: the result covers
# positions 1-9, i.e. the rows whose ID is 2 through 10 in the output.
s[1:10]

ID
2        Abdone
3         Abele
4         Abibo
5        Abramo
6        Acario
7       Achille
8      Aciscolo
9      Adalardo
10    Adalfredo
Name: NAME, dtype: object

In [249]:
s[::-1]

ID
220      Cunegonda
219     Crocefissa
218      Cristiana
217       Costanza
216       Cornelia
215       Cordelia
214      Consolata
213        Colomba
212       Clorinda
211      Cleopatra
210           Cleo
209         Clelia
208          Clara
207         Cinzia
206        Celinia
205        Cecilia
204         Catena
203      Cassandra
202        Casilda
201         Carola
200        Carmela
199          Carla
198     Capitolina
197        Camilla
196      Calpurnia
195      Cuniberto
194     Cristoforo
193      Cristaldo
192    Crespignano
191      Crescente
          ...     
30       Alcibiade
29           Alceo
28         Alboino
27         Alberto
26          Albano
25           Alano
24         Aladino
23          Aidano
22         Agrippa
21        Agesilao
20          Agazio
19      Agamennone
18            Afro
17         Adriano
16          Adolfo
15          Adelmo
14          Adelfo
13       Adelberto
12            Addo
11        Adalrico
10       Adalfredo
9        