
# NumPy Multi dimensional arrays

In [2]:
import numpy as np

In [3]:
# One-dimensional array built from a list of three Python ints.
array1 = np.array([10, 100, 1000])

In [4]:
# Two-dimensional array: 2 rows x 3 columns (one inner list per row).
array2 = np.array([[1,2,3], [4,5,6]])

In [5]:
# array1 was built only from ints, so NumPy picks an integer dtype (int32 in
# the output shown here; the default integer width depends on the platform).
# NumPy arrays are homogeneous: had the input mixed ints with a float, every
# element would have been upcast to float64 instead.
array1.dtype

dtype('int32')

In [6]:
# array1[0] is a NumPy integer scalar (not a float64); the built-in float()
# constructor converts it to a plain Python float.
float(array1[0])

10.0

## List comprehension

In [7]:
# List comprehension: iterate over array1 in Python and wrap each
# incremented element in its own one-item list.
[[element + 1] for element in array1]

[[11], [101], [1001]]

## Vectorization, Broadcast, Scalars

In [8]:
# Vectorization: arithmetic between an array and a scalar (a single number,
# not a list or other container) is applied to every element at once, so
# array1 + 1 adds 1 to each element without an explicit Python loop.
array1 + 1

array([  11,  101, 1001])

In [9]:
# Broadcasting: when two arrays have different but compatible shapes, NumPy
# virtually stretches the smaller one to match the larger. Here array1
# (shape (3,)) is reused for each of the two rows of array2 (shape (2, 3)).

array1 * array2

array([[  10,  200, 3000],
       [  40,  500, 6000]])

## universal functions

In [10]:
# Universal functions (ufuncs) are NumPy functions such as np.sqrt that
# operate element by element on whole arrays; they take the place of the
# scalar-only functions in Python's math module.
np.sqrt(array1)


array([ 3.16227766, 10.        , 31.6227766 ])

In [11]:
# axis=0 runs down the rows, so summing along it adds the rows together
# element-wise and leaves one total per column:
#   [1, 2, 3]
#    +  +  +
#   [4, 5, 6]
array2.sum(axis=0)

array([5, 7, 9])

In [12]:
# axis=1 runs across the columns, so summing along it adds up the values
# inside each row and leaves one total per row:
#   [1 + 2 + 3],
#   [4 + 5 + 6]
array2.sum(axis=1)

array([ 6, 15])

## slice arrays

In [13]:
# Indexing takes one index (or slice) per axis inside a single pair of
# brackets, separated by commas, instead of chained [i][j] lookups.
# A 1-D array takes a single index: this is the third element.
array1[2]

1000

In [14]:
# Element at row 0, column 0.
array2[0,0]

1

In [15]:
# Every row (:) of column 1, i.e. the second column.
array2[:,1]

array([2, 5])

In [16]:
# Row 1, columns 0 and 1 (the slice :2 stops before column 2).
array2[1, :2]

array([4, 5])

In [17]:
# Row 0, columns 0 and 1 (the slice :2 stops before column 2).
array2[0, :2]

array([1, 2])

## Constructores utiles

In [18]:
# np.arange works like Python's range but returns a NumPy array; combined
# with reshape it produces an array with whatever dimensions are needed.

np.arange(2*5).reshape(2,5) # 2 rows, 5 columns

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [19]:
# 2 x 3 array of samples from the standard normal distribution. No seed is
# set in the cells shown here, so the values change on every run.
np.random.randn(2, 3) # 2 rows, 3 columns

array([[ 0.94867856,  1.10589409,  0.52665027],
       [ 0.02097413,  0.54535971, -0.71233117]])

In [20]:
# 1-D array holding two zeros (stored as floats, as the output shows).
np.zeros(2)

array([0., 0.])

In [21]:
# 1-D array holding two ones (stored as floats, as the output shows).
np.ones(2)

array([1., 1.])

## view vs copy

In [22]:
# Slicing a NumPy array returns a view, not a copy: writing through the view
# also changes the original array. Display array2 before it is modified.
array2

array([[1, 2, 3],
       [4, 5, 6]])

In [23]:
# View onto the first two columns of every row of array2.
subset = array2[:,:2]
subset

array([[1, 2],
       [4, 5]])

In [24]:
# Write through the view: subset shares its data with array2, so this also
# changes array2[0, 0].
subset[0,0]=1000
subset

array([[1000,    2],
       [   4,    5]])

In [25]:
# The original array reflects the value written through the view.
array2

array([[1000,    2,    3],
       [   4,    5,    6]])

In [26]:
# To prevent this, take an explicit copy when slicing (the `subset = ...` cell, In [23]).
# Note the parentheses: `.copy` alone only references the method and copies nothing.
# subset = array2[:, :2].copy()

## Importar pandas y usar un xlsx

In [27]:
import pandas as pd

In [28]:
# Read the workbook into a DataFrame (the path is relative to the working
# directory). The result is only displayed here, not bound to a name.
pd.read_excel("extras/xl/course_participants.xlsx")

Unnamed: 0,user_id,name,age,country,score,continent
0,1001,Mark,55,Italy,4.5,Europe
1,1000,John,33,USA,6.7,America
2,1002,Tim,41,USA,3.9,America
3,1003,Jenny,12,Germany,9.0,Europe


## ingreso manual de data

In [29]:
# Hand-entered participant records: one inner list per user, in the same
# order as the column labels given to the DataFrame below.
data = [
    ["mark", 55, "Italy", 4.5, "Europe"],
    ["john", 33, "usa", 6.7, "america"],
    ["tim", 41, "usa", 5.5, "america"],
    ["alan", 40, "australia", 5.9, "oceania"],
    ["jorge", 29, "argentina", 7.5, "america"],
]
# The row labels are the user ids; naming the Index "user_id" up front is
# equivalent to assigning df.index.name after construction.
df = pd.DataFrame(
    data,
    index=pd.Index([1001, 1002, 1003, 1004, 1005], name="user_id"),
    columns=["name", "age", "country", "score", "continent"],
)

In [30]:
df

Unnamed: 0_level_0,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,mark,55,Italy,4.5,Europe
1002,john,33,usa,6.7,america
1003,tim,41,usa,5.5,america
1004,alan,40,australia,5.9,oceania
1005,jorge,29,argentina,7.5,america


In [31]:
df.index

Int64Index([1001, 1002, 1003, 1004, 1005], dtype='int64', name='user_id')

In [32]:
# Rows ordered by index label (user_id); the sorted result is only displayed.
df.sort_index()

Unnamed: 0_level_0,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,mark,55,Italy,4.5,Europe
1002,john,33,usa,6.7,america
1003,tim,41,usa,5.5,america
1004,alan,40,australia,5.9,oceania
1005,jorge,29,argentina,7.5,america


In [33]:
# Rows ordered by the age column, youngest first; the result is only displayed.
df.sort_values("age")

Unnamed: 0_level_0,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1005,jorge,29,argentina,7.5,america
1002,john,33,usa,6.7,america
1004,alan,40,australia,5.9,oceania
1003,tim,41,usa,5.5,america
1001,mark,55,Italy,4.5,Europe


In [34]:
df.columns

Index(['name', 'age', 'country', 'score', 'continent'], dtype='object')

In [35]:
# The column Index can carry a name as well; it is shown in the header
# corner whenever df is displayed.
df.columns.name = "properties"

In [36]:
df

properties,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,mark,55,Italy,4.5,Europe
1002,john,33,usa,6.7,america
1003,tim,41,usa,5.5,america
1004,alan,40,australia,5.9,oceania
1005,jorge,29,argentina,7.5,america


In [37]:
# rename returns a new DataFrame with the relabelled columns; nothing is
# assigned back, so df itself keeps its original column names.
df.rename(columns={"name":"First Name", "age":"Age"})

properties,First Name,Age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,mark,55,Italy,4.5,Europe
1002,john,33,usa,6.7,america
1003,tim,41,usa,5.5,america
1004,alan,40,australia,5.9,oceania
1005,jorge,29,argentina,7.5,america


In [38]:
# drop returns a new DataFrame without the listed columns; nothing is
# assigned back, so df still contains its name column afterwards.
df.drop(columns=["name"])

properties,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1001,55,Italy,4.5,Europe
1002,33,usa,6.7,america
1003,41,usa,5.5,america
1004,40,australia,5.9,oceania
1005,29,argentina,7.5,america


In [39]:
df

properties,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,mark,55,Italy,4.5,Europe
1002,john,33,usa,6.7,america
1003,tim,41,usa,5.5,america
1004,alan,40,australia,5.9,oceania
1005,jorge,29,argentina,7.5,america


# Data Manipulation

## Selecting data

In [40]:
df.loc[:,"name"] # every row of the "name" column, returned as a Series
# df.loc[1001,"country"]  # a single value (scalar) picked by index label and column name

user_id
1001     mark
1002     john
1003      tim
1004     alan
1005    jorge
Name: name, dtype: object

In [41]:
df.loc[:,["name"]] # a list of column labels returns the "name" column as a DataFrame

properties,name
user_id,Unnamed: 1_level_1
1001,mark
1002,john
1003,tim
1004,alan
1005,jorge


In [42]:
# Shortcut for column selection: a single label returns a Series.
df["name"]

user_id
1001     mark
1002     john
1003      tim
1004     alan
1005    jorge
Name: name, dtype: object

In [43]:
df[["name"]] # shortcut for column selection: a list of labels returns a DataFrame rather than a Series

properties,name
user_id,Unnamed: 1_level_1
1001,mark
1002,john
1003,tim
1004,alan
1005,jorge


In [44]:
df.iloc[0,0] # iloc ("integer loc") selects by integer position instead of by label

'mark'

## Seleccion de data mediante booleanos

In [45]:
df

properties,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,mark,55,Italy,4.5,Europe
1002,john,33,usa,6.7,america
1003,tim,41,usa,5.5,america
1004,alan,40,australia,5.9,oceania
1005,jorge,29,argentina,7.5,america


In [49]:
# Boolean mask: True for rows where age is above 20 AND the continent is
# exactly "oceania".
tf = df["age"].gt(20) & df["continent"].eq("oceania")

In [50]:
# Passing the boolean mask to loc keeps only the rows where it is True.
df.loc[tf]

properties,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1004,alan,40,australia,5.9,oceania


In [51]:
# Boolean mask: True for rows where age is above 20 AND the continent is
# one of the listed values (isin tests membership, case-sensitively).
tf = df["age"].gt(20) & df["continent"].isin(["Europe", "oceania"])

In [53]:
df.loc[tf]

properties,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,mark,55,Italy,4.5,Europe
1004,alan,40,australia,5.9,oceania


## Seleccion mediante booleanos con sintaxis propia

In [56]:
# Example data: annual rainfall for three cities, one column per city.
rainfall = pd.DataFrame(
    [
        [300.1, 400.3, 1000.5],
        [100.2, 300.4, 1100.6],
    ],
    columns=["citi 1", "citi 2", "citi 3"],
)

In [58]:
# Comparing a DataFrame with a scalar gives a boolean DataFrame of the same
# shape, with one True/False per cell.
rainfall < 400

Unnamed: 0,citi 1,citi 2,citi 3
0,True,False,False
1,True,True,False


In [62]:
# Indexing with that boolean DataFrame keeps the shape: cells where the
# condition is False come back as missing values.
rainfall[rainfall < 400]

Unnamed: 0,citi 1,citi 2,citi 3
0,300.1,,
1,100.2,300.4,


## multi index selection

In [63]:
# Build a two-level (continent, country) index in one chain: move user_id
# back into a regular column, promote the two columns to the index, then
# sort the rows by the new index.
df_multi = (
    df.reset_index()
    .set_index(["continent", "country"])
    .sort_index()
)
df_multi

Unnamed: 0_level_0,properties,user_id,name,age,score
continent,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Europe,Italy,1001,mark,55,4.5
america,argentina,1005,jorge,29,7.5
america,usa,1002,john,33,6.7
america,usa,1003,tim,41,5.5
oceania,australia,1004,alan,40,5.9


# Manipulacion de data

In [64]:
# Work on a copy so that the assignments made to df2 leave df untouched.
df2 = df.copy()

In [68]:
df2.loc[1002, "name"]="CHUCHU" # set a single value selected by index label and column name

In [69]:
df2

properties,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,mark,55,Italy,4.5,Europe
1002,CHUCHU,33,usa,6.7,america
1003,tim,41,usa,5.5,america
1004,alan,40,australia,5.9,oceania
1005,jorge,29,argentina,7.5,america


In [77]:
# ...or set several values at once: one new value per listed index label.
df2.loc[[1001, 1002], "score"]=[9,0]

In [78]:
df2

properties,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,mark,55,Italy,9.0,Europe
1002,CHUCHU,33,usa,0.0,america
1003,tim,41,usa,5.5,america
1004,alan,40,australia,5.9,oceania
1005,jorge,29,argentina,7.5,america


In [81]:
# Assign through boolean indexing: rows where age is below 35 OR the
# continent is "america" get their name overwritten with "xxx".
tf = df2["age"].lt(35) | df2["continent"].eq("america")
df2.loc[tf, "name"] = "xxx"


In [82]:
df2

properties,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,mark,55,Italy,9.0,Europe
1002,xxx,33,usa,0.0,america
1003,xxx,41,usa,5.5,america
1004,alan,40,australia,5.9,oceania
1005,xxx,29,argentina,7.5,america


In [84]:
# Copy first so the assignment made to rainfall2 does not alter rainfall.
rainfall2 = rainfall.copy()

In [87]:
# Boolean-DataFrame indexing also works for assignment: every cell whose
# value is below 400 is set to 0, the rest are left as they are.
rainfall2[rainfall2 < 400] = 0

In [88]:
rainfall2

Unnamed: 0,citi 1,citi 2,citi 3
0,0.0,400.3,1000.5
1,0.0,0.0,1100.6


In [91]:
df2

properties,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,mark,55,Italy,9.0,Europe
1002,xxx,33,usa,0.0,america
1003,xxx,41,usa,5.5,america
1004,alan,40,australia,5.9,oceania
1005,xxx,29,argentina,7.5,america


In [104]:
# Replace every cell whose value is exactly "usa" with "US" (in whichever
# column it occurs) and rebind df2 to the resulting new DataFrame.
df2 = df2.replace(to_replace="usa", value="US")

In [105]:
df2

properties,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,mark,55,Italy,9.0,Europe
1002,xxx,33,US,0.0,america
1003,xxx,41,US,5.5,america
1004,alan,40,australia,5.9,oceania
1005,xxx,29,argentina,7.5,america
