In [35]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# What is pandas and Data structures

- pandas contains data structures and data manipulation tools designed to make data cleaning and analysis fast and easy in Python.

- It is designed for working with tabular or heterogeneous data. Numpy, by contrast, is best suited for working with homogeneous numerical array data.

- Its two workhorse data structures are Series and DataFrame.

# Series

- One-dimensional array-like object containing a sequence of values (of similar types to Numpy types) and an associated array of data labels, called its index.

- A way to think about a Series is as a fixed-length, ordered dict, as it is a mapping of index values to data values.

In [111]:
# With no explicit index, pandas assigns the default integer labels 0..n-1.
obj = pd.Series(data=[4, 7, -5, 3])
print(obj)
# print(obj.index)
# print(obj.values)

# Labels can be replaced after construction by assigning to .index.
obj.index = [1, 2, 3, 4]

# Labels can also be supplied at construction time.
obj2 = pd.Series(data=[4, 7, -5, 3], index=list("dbac"))

# Building a Series from a dict: keys become the labels, values the data.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
obj3 = pd.Series(data=sdata)
print(obj3)

# Redefining the index: reordering, adding and removing labels.

# Option 1: at creation time. Labels missing from the dict ("California")
# get NaN, and dict keys not listed ("Utah") are left out.
states = ["California", "Ohio", "Oregon", "Texas"]
obj4 = pd.Series(data=sdata, index=states)
print(obj4)

# Option 2: afterwards, producing a new object. Labels that were not present
# in the original index ("Utah") are filled with NaN.
obj5 = obj4.reindex(index=["California", "Oregon", "Ohio", "Utah"])
# print(obj5)

# method="ffill" does not interpolate: a label missing from obj4 takes the
# value of the closest preceding label in obj4's sorted index. Here "Utah"
# receives the value stored under "Texas"; "California" stays NaN because
# that is the value obj4 already holds for it.
obj6 = obj4.reindex(
    index=["California", "Oregon", "Ohio", "Utah"],
    method="ffill",
)
print(obj6)

# Both the Series and its index carry a "name" attribute.
obj4.index.name = "state"
obj4.name = "population"
print(obj4)


0    4
1    7
2   -5
3    3
dtype: int64
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
California        NaN
Oregon        16000.0
Ohio          35000.0
Utah          71000.0
dtype: float64
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64


## Series: Operations

- You can use Numpy functions or Numpy-like operations.

- Arithmetic operations.

- isnull and notnull to detect missing data

- drop to delete an entry

In [122]:
# Filtering: a boolean mask keeps only the strictly positive entries.
cond = obj2.loc[obj2 > 0]
print(cond)

# Positional slicing: the last two elements.
print("Els elements seleccionats, els dos últims són:\n", obj2.iloc[-2:])
# Label slicing: unlike positional slicing, BOTH endpoints are included.
print("Els elements seleccionats són de b a c:\n", obj2.loc["b":"c"])
# Single element looked up by label.
print("L'element b és:", obj2.loc["b"])

# Arithmetic is applied element-wise and keeps the labels.
print(obj2.mul(2))

# Membership test against the labels (not the values); True here.
"b" in obj2.index

# Detect missing data.
print(obj4.isna())

# Addition aligns on labels; labels without a value on both sides yield NaN.
print(obj3.add(obj4))

# Delete a value in place:
# del obj2["a"]

# New Series with some labels removed (obj2 itself is left untouched;
# pass inplace=True to apply the change to the object itself).
obje = obj2.drop(labels=["b", "c"])


d    4
b    7
c    3
dtype: int64
Els elements seleccionats, els dos últims són:
 a   -5
c    3
dtype: int64
Els elements seleccionats són de b a c:
 b    7
a   -5
c    3
dtype: int64
L'element b és: 7
d     8
b    14
a   -10
c     6
dtype: int64
state
California     True
Ohio          False
Oregon        False
Texas         False
Name: population, dtype: bool
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64


# DataFrame

- A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.). The DataFrame has both a row index and a column index.

In [74]:
# OPTION 1: from a dict of equal-length lists or NumPy ndarrays.

data = dict(
    state=["Ohio"] * 3 + ["Nevada"] * 3,
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=np.array([1.5, 1.7, 3.6, 2.4, 2.9, 3.2]),
)

frame = pd.DataFrame(data=data)
print(frame)
# print(frame.index)    # Row labels only; not specified here, so 0..5
# print(frame.values)   # Values only
# print(frame.columns)  # Column labels only
# print(frame["state"]) # Select one column
# print(frame[0:2])     # Select the first two ROWS (a slice inside [] is row-wise)
# print(frame.loc[0])   # Select one row by label

# print(frame.head())   # View only the first five rows

# Explicit row labels and a custom column order.
frame2 = pd.DataFrame(
    data=data,
    columns=["year", "state", "pop"],
    index="one two three four five six".split(),
)
# print(frame2)

# OPTION 2: from a nested dict of dicts.

popdata = {
    "Nevada": {2001: 2.4, 2002: 2.9},
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
# Outer keys become the columns and inner keys the row labels. index= is
# passed to fix the order of the years explicitly.
frame3 = pd.DataFrame(data=popdata, index=[2000, 2001, 2002])
print(frame3)

# The row index and the column index each have their own name attribute.

frame3.columns.name = "state"
frame3.index.name = "year"

print(frame3)


    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2
      Nevada  Ohio
2000     NaN   1.5
2001     2.4   1.7
2002     2.9   3.6
state  Nevada  Ohio
year               
2000      NaN   1.5
2001      2.4   1.7
2002      2.9   3.6


## DataFrame: Modifications

In [126]:
# Rebuild the population table so this cell runs on its own and can be
# re-executed safely.
popdata = {"Nevada": {2001: 2.4, 2002: 2.9}, "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(popdata, index = [2000, 2001, 2002])

# Modify values by assignment.
frame3["Nevada"] = [2.1, 2.2, 2.8]
# Write a single cell through .loc[row, column]. The chained form
# frame3["Ohio"][2000] = 1.3 assigns into an intermediate Series: it triggers
# a chained-assignment warning and leaves frame3 unchanged when pandas
# Copy-on-Write is active.
frame3.loc[2000, "Ohio"] = 1.3
print(frame3)

# Replace a whole column from a Series. Values are aligned on the index, so
# rows whose label is missing from the Series (2000 here) become NaN.
val = pd.Series([2.6, 3], index=[2001, 2002])
frame3["Nevada"] = val
print(frame3)

# Add a new column.
frame3["California"] = [1, 2, 3]
print(frame3)

frame3["Exceed_Ohio"] = frame3["Ohio"] > 2.5 # New boolean column
print(frame3)

# Delete a column in place.
del frame3["Nevada"]

# Create new objects without a given column or row (frame3 is left untouched;
# pass inplace=True to apply the change to the object itself).
framec = frame3.drop("Ohio", axis = 1)
framef = frame3.drop(2000)

# Transpose the DataFrame.
print(frame3.T)

# Check whether a column or row label exists in the table.
"Ohio" in frame3.columns # True
# The row labels are integers, so the lookup must use an integer: the string
# "2005" would be reported as absent even for a year that does exist.
2005 in frame3.index # False

# Element-wise True/False for a condition.
print(frame3 < 2)

# Keep only the rows that satisfy a condition.
print(frame3[frame3["Ohio"] < 2])


      Nevada  Ohio
2000     2.1   1.3
2001     2.2   1.7
2002     2.8   3.6
      Nevada  Ohio
2000     NaN   1.3
2001     2.6   1.7
2002     3.0   3.6
      Nevada  Ohio  California
2000     NaN   1.3           1
2001     2.6   1.7           2
2002     3.0   3.6           3
      Nevada  Ohio  California  Exceed_Ohio
2000     NaN   1.3           1        False
2001     2.6   1.7           2        False
2002     3.0   3.6           3         True
              2000   2001  2002
Ohio           1.3    1.7   3.6
California       1      2     3
Exceed_Ohio  False  False  True
       Ohio  California  Exceed_Ohio
2000   True        True         True
2001   True       False         True
2002  False       False         True
      Ohio  California  Exceed_Ohio
2000   1.3           1        False
2001   1.7           2        False
