# Sesión 7

#### Casting de datos

In [1]:
# Cargamos las librerías
import pandas as pd

In [2]:
# Cargamos un dataset para los ejercicios
df = pd.read_csv('../Datasets/new_york_times_bestsellers-dirty.csv', index_col= 0)

df.head()

Unnamed: 0,amazon_product_url,author,description,publisher,title,oid,bestsellers_date.numberLong,published_date.numberLong,rank.numberInt,rank_last_week.numberInt,weeks_on_list.numberInt,price.numberDouble
0,http://www.amazon.com/The-Host-Novel-Stephenie...,Stephenie Meyer,Descr: Aliens have taken control of the minds ...,"Little, Brown",THE HOST,5b4aa4ead3089013507db18c,2008-05-24 00:00:00,1212883200000,2,1,3,25.99
1,http://www.amazon.com/Love-Youre-With-Emily-Gi...,Emily Giffin,Descr: A woman's happy marriage is shaken when...,St. Martin's,LOVE THE ONE YOU'RE WITH,5b4aa4ead3089013507db18d,2008-05-24 00:00:00,1212883200000,3,2,2,24.95
2,http://www.amazon.com/The-Front-Garano-Patrici...,Patricia Cornwell,Descr: A Massachusetts state investigator and ...,Putnam,THE FRONT,5b4aa4ead3089013507db18e,2008-05-24 00:00:00,1212883200000,4,0,1,22.95
3,http://www.amazon.com/Snuff-Chuck-Palahniuk/dp...,Chuck Palahniuk,Descr: An aging porn queens aims to cap her ca...,Doubleday,SNUFF,5b4aa4ead3089013507db18f,2008-05-24 00:00:00,1212883200000,5,0,1,24.95
5,http://www.amazon.com/Phantom-Prey-John-Sandfo...,John Sandford,Descr: The Minneapolis detective Lucas Davenpo...,Putnam,PHANTOM PREY,5b4aa4ead3089013507db191,2008-05-24 00:00:00,1212883200000,7,4,3,26.95


In [3]:
# Columnas del df
df.columns

Index(['amazon_product_url', 'author', 'description', 'publisher', 'title',
       'oid', 'bestsellers_date.numberLong', 'published_date.numberLong',
       'rank.numberInt', 'rank_last_week.numberInt', 'weeks_on_list.numberInt',
       'price.numberDouble'],
      dtype='object')

# Diccionario del Dataset

- **amazon_product_url**: URL del producto en Amazon que enlaza a la página de compra del libro.
- **author**: Nombre del autor o autores del libro.
- **description**: Descripción breve del contenido del libro.
- **publisher**: Editorial que publicó el libro.
- **title**: Título del libro.
- **oid**: Identificador único del libro en el sistema.
- **bestsellers_date.numberLong**: Fecha en la que el libro apareció en la lista de bestsellers, en formato de número largo.
- **published_date.numberLong**: Fecha de publicación del libro, en formato de número largo.
- **rank.numberInt**: Rango actual del libro en la lista de bestsellers, en formato de número entero.
- **rank_last_week.numberInt**: Rango del libro en la lista de bestsellers de la semana pasada, en formato de número entero.
- **weeks_on_list.numberInt**: Número de semanas que el libro ha estado en la lista de bestsellers, en formato de número entero.
- **price.numberDouble**: Precio del libro, en formato de número doble (decimal).


In [4]:
# Veamos los tipos de datos que tienen las columnas
df.dtypes

amazon_product_url              object
author                          object
description                     object
publisher                       object
title                           object
oid                             object
bestsellers_date.numberLong     object
published_date.numberLong        int64
rank.numberInt                  object
rank_last_week.numberInt         int64
weeks_on_list.numberInt          int64
price.numberDouble             float64
dtype: object

In [5]:
# Vamos a usar astype para cambiar los tipos de datos
# 'price.numberDouble' esta en float, pero para ejemplo lo cambiaremos a str
df['price.numberDouble'].astype(str)

0       25.99
1       24.95
2       22.95
3       24.95
5       26.95
        ...  
3027    26.95
3028    27.95
3029    27.95
3030    26.95
3031    28.99
Name: price.numberDouble, Length: 2266, dtype: object

In [6]:
# No asignamos el resultado a la columna, por eso sigue con el tipo de datos original
df.dtypes

amazon_product_url              object
author                          object
description                     object
publisher                       object
title                           object
oid                             object
bestsellers_date.numberLong     object
published_date.numberLong        int64
rank.numberInt                  object
rank_last_week.numberInt         int64
weeks_on_list.numberInt          int64
price.numberDouble             float64
dtype: object

In [7]:
# Podemos usar un diccionario para realizar varios cambios simultáneos
diccionario_casting = {'rank_last_week.numberInt': int,
                       'weeks_on_list.numberInt': int,
                       'price.numberDouble': float
}

df.astype(diccionario_casting).dtypes

amazon_product_url              object
author                          object
description                     object
publisher                       object
title                           object
oid                             object
bestsellers_date.numberLong     object
published_date.numberLong        int64
rank.numberInt                  object
rank_last_week.numberInt         int64
weeks_on_list.numberInt          int64
price.numberDouble             float64
dtype: object

In [8]:
# Intentemos con rank.numberInt 
df['rank.numberInt'].astype(int)

ValueError: invalid literal for int() with base 10: 'No Rank'

In [10]:
# Usamos la función unique() para ver un listado de elementos distintos que contiene la columna
# Vemos que no todos son números, tenemos 'No Rank'
df['rank.numberInt'].unique()

array(['2', '3', '4', '5', '7', '8', '9', '10', '12', '13', '14',
       'No Rank', '6', '11', '15', '1', '16'], dtype=object)

In [11]:
# Veamos cuantas veces aparece cada valor de la columna
df['rank.numberInt'].value_counts()


rank.numberInt
No Rank    220
4          147
3          144
10         143
11         137
1          135
13         133
8          133
5          132
15         131
9          129
12         127
6          126
7          123
2          116
14         114
16          76
Name: count, dtype: int64

In [12]:
# Vamos a generar un filtro para saber que filas tienen 'No Rank'
# Primero buscamos los elementos dentro de la columna que coinciden con 'No Rank', obtenemos un listado de True y False
filtro = df['rank.numberInt'] == 'No Rank'
filtro

0       False
1       False
2       False
3       False
5       False
        ...  
3027    False
3028    False
3029    False
3030    False
3031    False
Name: rank.numberInt, Length: 2266, dtype: bool

In [13]:
# Al indexar el df con el filtro obtenemos solo las filas donde el filtro es True
df[filtro]

Unnamed: 0,amazon_product_url,author,description,publisher,title,oid,bestsellers_date.numberLong,published_date.numberLong,rank.numberInt,rank_last_week.numberInt,weeks_on_list.numberInt,price.numberDouble
13,http://www.amazon.com/The-Host-Novel-Stephenie...,Stephenie Meyer,Descr: Aliens have taken control of the minds ...,"Little, Brown",THE HOST,5b4aa4ead3089013507db1a0,2008-05-31 00:00:00,1213488000000,No Rank,2,4,25.99
72,http://www.amazon.com/The-Broken-Window-Lincol...,Jeffery Deaver,Descr: Detectives Lincoln Rhyme and Amelia Sac...,Simon & Schuster,THE BROKEN WINDOW,5b4aa4ead3089013507db1fa,2008-06-28 00:00:00,1215907200000,No Rank,8,3,26.95
133,http://www.amazon.com/Fearless-Fourteen-Janet-...,Janet Evanovich,Descr: Stephanie Plum and her boyfriend Joe Mo...,St. Martin’s,FEARLESS FOURTEEN,5b4aa4ead3089013507db25e,2008-08-02 00:00:00,1218931200000,No Rank,9,7,27.95
154,http://www.amazon.com/The-Mercedes-Coffin-Deck...,Faye Kellerman,Descr: Decker and Lazarus investigate cases of...,Morrow,THE MERCEDES COFFIN,5b4aa4ead3089013507db282,2008-08-16 00:00:00,1220140800000,No Rank,0,1,25.95
158,http://www.amazon.com/Foreign-Body-Robin-Cook/...,Robin Cook,Descr: A medical student investigates a rising...,Putnam,FOREIGN BODY,5b4aa4ead3089013507db287,2008-08-16 00:00:00,1220140800000,No Rank,9,2,25.95
...,...,...,...,...,...,...,...,...,...,...,...,...
2920,http://www.amazon.com/Twelve-Digital-Edition-V...,Ayana Mathis,Descr: Fifty-some years in the life of an Afri...,Knopf,THE TWELVE TRIBES OF HATTIE,5b4aa4ead3089013507dc4cb,2013-02-09 00:00:00,1361664000000,No Rank,14,9,24.95
2942,http://www.amazon.com/The-Sound-Broken-Glass-K...,Deborah Crombie,"Descr: Detectives Gemma Jones and her husband,...",Morrow/HarperCollins,THE SOUND OF BROKEN GLASS,5b4aa4ead3089013507dc4f3,2013-02-23 00:00:00,1362873600000,No Rank,0,1,25.99
2952,http://www.amazon.com/A-Week-Winter-Maeve-Binc...,Maeve Binchy,Descr: Guests at an inn by the sea on Ireland’...,Knopf,A WEEK IN WINTER,5b4aa4ead3089013507dc503,2013-03-02 00:00:00,1363478400000,No Rank,2,3,26.95
2961,http://www.amazon.com/The-Storyteller-Jodi-Pic...,Jodi Picoult,Descr: A New Hampshire baker finds herself in ...,Emily Bestler/Atria,THE STORYTELLER,5b4aa4ead3089013507dc515,2013-03-09 00:00:00,1364083200000,No Rank,2,2,28.99


In [14]:
# Vamos a tratar estos elementos con to_numeric(columna, errors= opciones)
# Tenemos 3 opciones para errors=
#     - ignore: ignora el contenido, lo deja como estaba
#     - raise: nos da un error y corta la conversion
#     - coerce: cuando encuentra un dato que no es un número lo convierte a NaN
pd.to_numeric(df['rank.numberInt'], errors= 'coerce')

0        2.0
1        3.0
2        4.0
3        5.0
5        7.0
        ... 
3027     8.0
3028     9.0
3029    11.0
3030    13.0
3031    14.0
Name: rank.numberInt, Length: 2266, dtype: float64

In [15]:
# Now that we have seen how it works, store the converted values in the column
df['rank.numberInt'] = pd.to_numeric(df['rank.numberInt'], errors= 'coerce')

# Drop only the rows whose rank could not be converted (NaN in 'rank.numberInt').
# Restricting dropna to this column avoids discarding rows that merely have a
# missing value in some unrelated column.
df = df.dropna(subset=['rank.numberInt'])

# Reset the index so it is consecutive again after removing rows
df = df.reset_index(drop= True)

In [16]:
# Ahora podemos realizar la conversion a int de la columna
df['rank.numberInt'] = df['rank.numberInt'].astype(int)

df.dtypes

amazon_product_url              object
author                          object
description                     object
publisher                       object
title                           object
oid                             object
bestsellers_date.numberLong     object
published_date.numberLong        int64
rank.numberInt                   int64
rank_last_week.numberInt         int64
weeks_on_list.numberInt          int64
price.numberDouble             float64
dtype: object

In [17]:
# Ahora vamos a convertir las columnas con información de fechas
pd.to_datetime(df['bestsellers_date.numberLong'])

0      2008-05-24
1      2008-05-24
2      2008-05-24
3      2008-05-24
4      2008-05-24
          ...    
2041   2013-04-20
2042   2013-04-20
2043   2013-04-20
2044   2013-04-20
2045   2013-04-20
Name: bestsellers_date.numberLong, Length: 2046, dtype: datetime64[ns]

In [18]:
# Hacemos lo mismo con la siguiente columna
pd.to_datetime(df['published_date.numberLong'])

0      1970-01-01 00:20:12.883200
1      1970-01-01 00:20:12.883200
2      1970-01-01 00:20:12.883200
3      1970-01-01 00:20:12.883200
4      1970-01-01 00:20:12.883200
                  ...            
2041   1970-01-01 00:22:47.712000
2042   1970-01-01 00:22:47.712000
2043   1970-01-01 00:22:47.712000
2044   1970-01-01 00:22:47.712000
2045   1970-01-01 00:22:47.712000
Name: published_date.numberLong, Length: 2046, dtype: datetime64[ns]

In [19]:
# tenemos los datos de esta forma porque no especificamos la unidad, para nuestro caso es milisegundos
pd.to_datetime(df['published_date.numberLong'], unit= 'ms')


0      2008-06-08
1      2008-06-08
2      2008-06-08
3      2008-06-08
4      2008-06-08
          ...    
2041   2013-05-05
2042   2013-05-05
2043   2013-05-05
2044   2013-05-05
2045   2013-05-05
Name: published_date.numberLong, Length: 2046, dtype: datetime64[ns]

In [20]:
# Aplicamos los cambios a las columnas
df['bestsellers_date.numberLong'] = pd.to_datetime(df['bestsellers_date.numberLong'])
df['published_date.numberLong'] = pd.to_datetime(df['published_date.numberLong'], unit= 'ms')

df.dtypes

amazon_product_url                     object
author                                 object
description                            object
publisher                              object
title                                  object
oid                                    object
bestsellers_date.numberLong    datetime64[ns]
published_date.numberLong      datetime64[ns]
rank.numberInt                          int64
rank_last_week.numberInt                int64
weeks_on_list.numberInt                 int64
price.numberDouble                    float64
dtype: object

In [21]:
# Reemplazo nombres de columnas
diccionario_renombramiento = {'bestsellers_date.numberLong': 'bestsellers_date',
                              'published_date.numberLong': 'published_date',
                              'rank.numberInt': 'rank',
                              'rank_last_week.numberInt': 'rank_last_week',
                              'weeks_on_list.numberInt': 'weeks_on_list',
                              'price.numberDouble': 'price'
}

df = df.rename(columns=diccionario_renombramiento)

df.head(2)

Unnamed: 0,amazon_product_url,author,description,publisher,title,oid,bestsellers_date,published_date,rank,rank_last_week,weeks_on_list,price
0,http://www.amazon.com/The-Host-Novel-Stephenie...,Stephenie Meyer,Descr: Aliens have taken control of the minds ...,"Little, Brown",THE HOST,5b4aa4ead3089013507db18c,2008-05-24,2008-06-08,2,1,3,25.99
1,http://www.amazon.com/Love-Youre-With-Emily-Gi...,Emily Giffin,Descr: A woman's happy marriage is shaken when...,St. Martin's,LOVE THE ONE YOU'RE WITH,5b4aa4ead3089013507db18d,2008-05-24,2008-06-08,3,2,2,24.95


#### Limpieza de strings

In [22]:
# Veamos el contenido de description
df['description']

0       Descr: Aliens have taken control of the minds ...
1       Descr: A woman's happy marriage is shaken when...
2       Descr: A Massachusetts state investigator and ...
3       Descr: An aging porn queens aims to cap her ca...
4       Descr: The Minneapolis detective Lucas Davenpo...
                              ...                        
2041    Descr: The New York lawyer Stone Barrington di...
2042    Descr: Jake Fisher discovers that neither the ...
2043    Descr: Six friends meet in the 1970s at a summ...
2044    Descr: Bernie Gunther, the Berlin cop, is sent...
2045    Descr: A New Hampshire baker finds herself in ...
Name: description, Length: 2046, dtype: object

In [23]:
# Vamos a quitar 'Descr:' que se repite en todos los comentarios
df['description'] = df['description'].str.replace('Descr:', '')
df['description']

0        Aliens have taken control of the minds and bo...
1        A woman's happy marriage is shaken when she e...
2        A Massachusetts state investigator and his te...
3        An aging porn queens aims to cap her career b...
4        The Minneapolis detective Lucas Davenport inv...
                              ...                        
2041     The New York lawyer Stone Barrington discover...
2042     Jake Fisher discovers that neither the woman ...
2043     Six friends meet in the 1970s at a summer art...
2044     Bernie Gunther, the Berlin cop, is sent to Sm...
2045     A New Hampshire baker finds herself in the mi...
Name: description, Length: 2046, dtype: object

In [24]:
# Eliminamos espacios vacios al principio y final de cada texto
df['description'] = df['description'].str.strip()
df['description']

0       Aliens have taken control of the minds and bod...
1       A woman's happy marriage is shaken when she en...
2       A Massachusetts state investigator and his tea...
3       An aging porn queens aims to cap her career by...
4       The Minneapolis detective Lucas Davenport inve...
                              ...                        
2041    The New York lawyer Stone Barrington discovers...
2042    Jake Fisher discovers that neither the woman h...
2043    Six friends meet in the 1970s at a summer arts...
2044    Bernie Gunther, the Berlin cop, is sent to Smo...
2045    A New Hampshire baker finds herself in the mid...
Name: description, Length: 2046, dtype: object

In [25]:
# Veamos la columna title
df['title']

0                       THE HOST
1       LOVE THE ONE YOU'RE WITH
2                      THE FRONT
3                          SNUFF
4                   PHANTOM PREY
                  ...           
2041     UNINTENDED CONSEQUENCES
2042                   SIX YEARS
2043            THE INTERESTINGS
2044        A MAN WITHOUT BREATH
2045             THE STORYTELLER
Name: title, Length: 2046, dtype: object

In [26]:
# Convert the titles to title case (first letter of each word in uppercase).
# NOTE(review): str.title() also capitalizes the letter that follows an apostrophe,
# so "YOU'RE" ends up as "You'Re" (see the output of this cell) — consider a
# custom capitalization if that matters for the analysis.
df['title'] = df['title'].str.title()
df['title']

0                       The Host
1       Love The One You'Re With
2                      The Front
3                          Snuff
4                   Phantom Prey
                  ...           
2041     Unintended Consequences
2042                   Six Years
2043            The Interestings
2044        A Man Without Breath
2045             The Storyteller
Name: title, Length: 2046, dtype: object

In [27]:
# Veamos ahora la columna author
df['author']

0         Stephenie Meyer
1            Emily Giffin
2       Patricia Cornwell
3         Chuck Palahniuk
4           John Sandford
              ...        
2041         Stuart Woods
2042         Harlan Coben
2043         Meg Wolitzer
2044          Philip Kerr
2045         Jodi Picoult
Name: author, Length: 2046, dtype: object

In [28]:
# Split first names from last names.
# The pattern is the regex '\s' (any whitespace character); it is written as a raw
# string so Python does not treat the backslash as an (invalid) escape sequence.
df['author'].str.split(r'\s')

0         [Stephenie, Meyer]
1            [Emily, Giffin]
2       [Patricia, Cornwell]
3         [Chuck, Palahniuk]
4           [John, Sandford]
                ...         
2041         [Stuart, Woods]
2042         [Harlan, Coben]
2043         [Meg, Wolitzer]
2044          [Philip, Kerr]
2045         [Jodi, Picoult]
Name: author, Length: 2046, dtype: object

In [29]:
# To get the pieces as separate columns instead of lists, pass expand=True.
# Raw string: keeps '\s' as a regex escape rather than an invalid Python escape.
df['author'].str.split(r'\s', expand= True)

Unnamed: 0,0,1
0,Stephenie,Meyer
1,Emily,Giffin
2,Patricia,Cornwell
3,Chuck,Palahniuk
4,John,Sandford
...,...,...
2041,Stuart,Woods
2042,Harlan,Coben
2043,Meg,Wolitzer
2044,Philip,Kerr


In [30]:
# Add the two pieces to the df as new columns.
# n=1 splits only at the first whitespace, so the result never has more than two
# columns: an author with three or more words keeps everything after the first
# word in 'author_last_name' instead of breaking the two-column assignment.
# Raw string: keeps '\s' as a regex escape rather than an invalid Python escape.
df[['author_first_name', 'author_last_name']] = df['author'].str.split(r'\s', n=1, expand= True)

df.head()

Unnamed: 0,amazon_product_url,author,description,publisher,title,oid,bestsellers_date,published_date,rank,rank_last_week,weeks_on_list,price,author_first_name,author_last_name
0,http://www.amazon.com/The-Host-Novel-Stephenie...,Stephenie Meyer,Aliens have taken control of the minds and bod...,"Little, Brown",The Host,5b4aa4ead3089013507db18c,2008-05-24,2008-06-08,2,1,3,25.99,Stephenie,Meyer
1,http://www.amazon.com/Love-Youre-With-Emily-Gi...,Emily Giffin,A woman's happy marriage is shaken when she en...,St. Martin's,Love The One You'Re With,5b4aa4ead3089013507db18d,2008-05-24,2008-06-08,3,2,2,24.95,Emily,Giffin
2,http://www.amazon.com/The-Front-Garano-Patrici...,Patricia Cornwell,A Massachusetts state investigator and his tea...,Putnam,The Front,5b4aa4ead3089013507db18e,2008-05-24,2008-06-08,4,0,1,22.95,Patricia,Cornwell
3,http://www.amazon.com/Snuff-Chuck-Palahniuk/dp...,Chuck Palahniuk,An aging porn queens aims to cap her career by...,Doubleday,Snuff,5b4aa4ead3089013507db18f,2008-05-24,2008-06-08,5,0,1,24.95,Chuck,Palahniuk
4,http://www.amazon.com/Phantom-Prey-John-Sandfo...,John Sandford,The Minneapolis detective Lucas Davenport inve...,Putnam,Phantom Prey,5b4aa4ead3089013507db191,2008-05-24,2008-06-08,7,4,3,26.95,John,Sandford


#### Map
- `map` se llama desde una columna (Series) del df y transforma cada uno de sus elementos aplicando una función o reemplazando valores con un diccionario

In [31]:
# Vamos a usar la columna rank, veamos los elementos que tiene
df['rank'].unique()

array([ 2,  3,  4,  5,  7,  8,  9, 10, 12, 13, 14,  6, 11, 15,  1, 16])

In [32]:
# Creamos un diccionario para mapear rank
int_a_letra = {
    1: 'a',
    2: 'b',
    3: 'c',
    4: 'd',
    5: 'e',
    6: 'f',
    7: 'g',
    8: 'h',
    9: 'i',
    10: 'j',
    11: 'k',
    12: 'l',
    13: 'm',
    14: 'n',
    15: 'o',
    16: 'p'
}

In [33]:
# Reemplazamos los valores con los del diccionario usando map
df['rank'].map(int_a_letra).head(20)

0     b
1     c
2     d
3     e
4     g
5     h
6     i
7     j
8     l
9     m
10    n
11    d
12    f
13    g
14    h
15    i
16    j
17    k
18    l
19    m
Name: rank, dtype: object

In [34]:
# Podriamos hacer lo mismo con una función poco practica pero que demuestre el uso
def funcion_int_a_letra(int_):
    """Return the lowercase letter for a rank from 1 ('a') to 16 ('p').

    Any value that does not compare equal to an integer in that range
    yields None.
    """
    letras = 'abcdefghijklmnop'
    # Check the candidates in ascending order using ==, one comparison per letter.
    for numero, letra in enumerate(letras, start=1):
        if int_ == numero:
            return letra
    return None

In [35]:
# Hacemos el mismo cambio de números por letras pero ahora usando la función
df['rank'].map(funcion_int_a_letra).head(20)

0     b
1     c
2     d
3     e
4     g
5     h
6     i
7     j
8     l
9     m
10    n
11    d
12    f
13    g
14    h
15    i
16    j
17    k
18    l
19    m
Name: rank, dtype: object

#### Funcion apply()

In [36]:
# Import the standard-library module for regular expressions
import re

# Each pattern matches "woman"/"women" (or "man"/"men") in three positions:
# at the start of the text followed by whitespace, surrounded by whitespace,
# or preceded by whitespace and followed by a period.
# [ae] is a plain character class; the previous "[(a|e)]" also accepted the
# literal characters '(', '|' and ')'. Raw strings keep '\s' and '\.' as regex escapes.
# re.IGNORECASE makes the match case-insensitive.
regex_to_find_women = re.compile(r'(^wom[ae]n\s|\swom[ae]n\s|\swom[ae]n\.)', flags=re.IGNORECASE)
regex_to_find_men = re.compile(r'(^m[ae]n\s|\sm[ae]n\s|\sm[ae]n\.)', flags=re.IGNORECASE)

In [37]:
# Función que busca determinar si en el comentario se mencionan hombres o mujeres
def return_gender_in_description(value):
    """Classify a text by which of the module-level gender regexes match it.

    Returns 'Both' when regex_to_find_women and regex_to_find_men both match
    somewhere in `value`, 'Women' or 'Men' when only one of them matches,
    and 'Neither' when none does.
    """
    mentions_women = regex_to_find_women.search(value) is not None
    mentions_men = regex_to_find_men.search(value) is not None

    if not (mentions_women or mentions_men):
        return 'Neither'
    if mentions_women and mentions_men:
        return 'Both'
    return 'Women' if mentions_women else 'Men'

In [38]:
# Hacemos una copia del df
df_copy = df.copy()

In [39]:
# Vamos a usar la función regex en la columna description
df['description'].apply(return_gender_in_description)

0         Women
1       Neither
2       Neither
3           Men
4       Neither
         ...   
2041    Neither
2042      Women
2043    Neither
2044    Neither
2045    Neither
Name: description, Length: 2046, dtype: object

In [40]:
# Vemos una descripción que identifico correctamente
df['description'][0]

'Aliens have taken control of the minds and bodies of most humans, but one woman won’t surrender.'

In [45]:
# Store the classification in df, then reuse that same column for the copy
# instead of running the function over every description a second time.
df['gender_in_description'] = df['description'].apply(return_gender_in_description)
df_copy['gender_in_description'] = df['gender_in_description']


In [46]:
# Creamos un filtro y lo colocamos dentro de los parentesis
# El filtro nos regresa booleanos y de esta forma solo nos muestra las filas con True
df_copy[df_copy['gender_in_description'] == 'Both']


Unnamed: 0,amazon_product_url,author,description,publisher,title,oid,bestsellers_date,published_date,rank,rank_last_week,weeks_on_list,price,author_first_name,author_last_name,gender_in_description
1308,http://www.amazon.com/The-Priests-Graveyard-Te...,Ted Dekker,A priest and a young woman cross paths as they...,Center Street,The Priest'S Graveyard,5b4aa4ead3089013507dbd73,2011-04-23,2011-05-08,9,0,1,24.99,Ted,Dekker,Both
1496,http://www.amazon.com/The-Best-Me-Nicholas-Spa...,Nicholas Sparks,Twenty-five years after their high school roma...,Grand Central,The Best Of Me,5b4aa4ead3089013507dbf5f,2011-10-15,2011-10-30,1,0,1,25.99,Nicholas,Sparks,Both
1517,http://www.amazon.com/The-Best-Me-Nicholas-Spa...,Nicholas Sparks,Twenty-five years after their high school roma...,Grand Central,The Best Of Me,5b4aa4ead3089013507dbf89,2011-10-29,2011-11-13,3,1,3,25.99,Nicholas,Sparks,Both
1532,http://www.amazon.com/The-Best-Me-Nicholas-Spa...,Nicholas Sparks,Twenty-five years after their high school roma...,Grand Central,The Best Of Me,5b4aa4ead3089013507dbfb2,2011-11-12,2011-11-27,4,3,5,25.99,Nicholas,Sparks,Both
1539,http://www.amazon.com/The-Best-Me-Nicholas-Spa...,Nicholas Sparks,Twenty-five years after their high school roma...,Grand Central,The Best Of Me,5b4aa4ead3089013507dbfc8,2011-11-19,2011-12-04,6,4,6,25.99,Nicholas,Sparks,Both
1547,http://www.amazon.com/The-Best-Me-Nicholas-Spa...,Nicholas Sparks,Twenty-five years after their high school roma...,Grand Central,The Best Of Me,5b4aa4ead3089013507dbfdd,2011-11-26,2011-12-11,7,6,7,25.99,Nicholas,Sparks,Both
1553,http://www.amazon.com/The-Best-Me-Nicholas-Spa...,Nicholas Sparks,Twenty-five years after their high school roma...,Grand Central,The Best Of Me,5b4aa4ead3089013507dbff0,2011-12-03,2011-12-18,6,7,8,25.99,Nicholas,Sparks,Both
1561,http://www.amazon.com/The-Best-Me-Nicholas-Spa...,Nicholas Sparks,Twenty-five years after their high school roma...,Grand Central,The Best Of Me,5b4aa4ead3089013507dc006,2011-12-10,2011-12-25,8,6,9,25.99,Nicholas,Sparks,Both
1568,http://www.amazon.com/The-Best-Me-Nicholas-Spa...,Nicholas Sparks,Twenty-five years after their high school roma...,Grand Central,The Best Of Me,5b4aa4ead3089013507dc018,2011-12-17,2012-01-01,6,8,10,25.99,Nicholas,Sparks,Both
1582,http://www.amazon.com/The-Best-Me-Nicholas-Spa...,Nicholas Sparks,Twenty-five years after their high school roma...,Grand Central,The Best Of Me,5b4aa4ead3089013507dc041,2011-12-31,2012-01-15,7,6,12,25.99,Nicholas,Sparks,Both


In [47]:
# Seleccionamos un elemento, en este caso el de indice 1308
df_copy.loc[1308, 'description']

'A priest and a young woman cross paths as they seek to bring a powerful man to justice.'

In [48]:
# Veamos como se ve un filtro
df_copy['gender_in_description'] == 'Both'

0       False
1       False
2       False
3       False
4       False
        ...  
2041    False
2042    False
2043    False
2044    False
2045    False
Name: gender_in_description, Length: 2046, dtype: bool

In [49]:
# Lo guardamos en una variable
filtro_both = df_copy['gender_in_description'] == 'Both'


In [50]:
# Count the filtered rows using the filter and count().
# filtro_both was built from df_copy, so it is applied to df_copy as well;
# applying it to a different frame only works while both share the same index.
df_copy[filtro_both].count()

amazon_product_url       15
author                   15
description              15
publisher                15
title                    15
oid                      15
bestsellers_date         15
published_date           15
rank                     15
rank_last_week           15
weeks_on_list            15
price                    15
author_first_name        15
author_last_name         15
gender_in_description    15
dtype: int64

In [52]:
# Veamos otra función de string
# Vamos a filtrar los autores que comiencen con 'A'
df[df['author_first_name'].str.startswith('A')]

Unnamed: 0,amazon_product_url,author,description,publisher,title,oid,bestsellers_date,published_date,rank,rank_last_week,weeks_on_list,price,author_first_name,author_last_name,gender_in_description
109,http://www.amazon.com/The-Gargoyle-Andrew-Davi...,Andrew Davidson,A hideously burned man is cared for by a sculp...,Doubleday,The Gargoyle,5b4aa4ead3089013507db274,2008-08-09,2008-08-24,14,0,1,25.95,Andrew,Davidson,Men
119,http://www.amazon.com/The-Gargoyle-Andrew-Davi...,Andrew Davidson,A hideously burned man is cared for by a sculp...,Doubleday,The Gargoyle,5b4aa4ead3089013507db28a,2008-08-16,2008-08-31,16,14,2,25.95,Andrew,Davidson,Men
126,http://www.amazon.com/The-Gargoyle-Andrew-Davi...,Andrew Davidson,A hideously burned man is cared for by a sculp...,Doubleday,The Gargoyle,5b4aa4ead3089013507db29b,2008-08-23,2008-09-07,13,16,3,25.95,Andrew,Davidson,Men
202,http://www.amazon.com/Testimony-A-Novel-Anita-...,Anita Shreve,A sex scandal at a Vermont prep school is caug...,"Little, Brown",Testimony,5b4aa4ead3089013507db34a,2008-10-25,2008-11-09,8,0,1,25.99,Anita,Shreve,Neither
211,http://www.amazon.com/Testimony-A-Novel-Anita-...,Anita Shreve,A sex scandal at a Vermont prep school is caug...,"Little, Brown",Testimony,5b4aa4ead3089013507db35f,2008-11-01,2008-11-16,9,8,2,25.99,Anita,Shreve,Neither
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1948,http://www.amazon.com/Twelve-Digital-Edition-V...,Ayana Mathis,Fifty-some years in the life of an African-Ame...,Knopf,The Twelve Tribes Of Hattie,5b4aa4ead3089013507dc48e,2013-01-19,2013-02-03,8,7,6,24.95,Ayana,Mathis,Neither
1958,http://www.amazon.com/Twelve-Digital-Edition-V...,Ayana Mathis,Fifty-some years in the life of an African-Ame...,Knopf,The Twelve Tribes Of Hattie,5b4aa4ead3089013507dc4a4,2013-01-26,2013-02-10,10,8,7,24.95,Ayana,Mathis,Neither
1969,http://www.amazon.com/Twelve-Digital-Edition-V...,Ayana Mathis,Fifty-some years in the life of an African-Ame...,Knopf,The Twelve Tribes Of Hattie,5b4aa4ead3089013507dc4bc,2013-02-02,2013-02-17,14,10,8,24.95,Ayana,Mathis,Neither
1982,http://www.amazon.com/Night-Ranger-John-Wells-...,Alex Berenson,The former C.I.A. operative John Wells pitches...,Putnam,The Night Ranger,5b4aa4ead3089013507dc4e0,2013-02-16,2013-03-03,10,0,1,27.95,Alex,Berenson,Neither


In [54]:
# Vamos a ordenar la información con sort_values()
# En este caso la información de la columna 'price' descendente para valores mayores a 20
df[df['price'] > 20].sort_values('price', ascending=False)

Unnamed: 0,amazon_product_url,author,description,publisher,title,oid,bestsellers_date,published_date,rank,rank_last_week,weeks_on_list,price,author_first_name,author_last_name,gender_in_description
1533,http://www.amazon.com/1Q84-Haruki-Murakami-ebo...,Haruki Murakami,"In 1980s Tokyo, a woman who punishes perpetrat...",Knopf,1Q84,5b4aa4ead3089013507dbfb4,2011-11-12,2011-11-27,6,6,3,30.50,Haruki,Murakami,Women
1541,http://www.amazon.com/1Q84-Haruki-Murakami-ebo...,Haruki Murakami,"In 1980s Tokyo, a woman who punishes perpetrat...",Knopf,1Q84,5b4aa4ead3089013507dbfcc,2011-11-19,2011-12-04,10,6,4,30.50,Haruki,Murakami,Women
1578,http://www.amazon.com/1Q84-Haruki-Murakami-ebo...,Haruki Murakami,"In 1980s Tokyo, a woman who punishes perpetrat...",Knopf,1Q84,5b4aa4ead3089013507dc032,2011-12-24,2012-01-08,12,12,9,30.50,Haruki,Murakami,Women
1548,http://www.amazon.com/1Q84-Haruki-Murakami-ebo...,Haruki Murakami,"In 1980s Tokyo, a woman who punishes perpetrat...",Knopf,1Q84,5b4aa4ead3089013507dbfe2,2011-11-26,2011-12-11,12,10,5,30.50,Haruki,Murakami,Women
1564,http://www.amazon.com/1Q84-Haruki-Murakami-ebo...,Haruki Murakami,"In 1980s Tokyo, a woman who punishes perpetrat...",Knopf,1Q84,5b4aa4ead3089013507dc00b,2011-12-10,2011-12-25,13,11,7,30.50,Haruki,Murakami,Women
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,http://www.amazon.com/Abraham-Lincoln-Vampire-...,Seth Grahame-Smith,Lincoln fights the undead; by the author of “P...,Grand Central,Abraham Lincoln: Vampire Hunter,5b4aa4ead3089013507db926,2010-04-04,2010-04-18,8,6,5,21.99,Seth,Grahame-Smith,Neither
839,http://www.amazon.com/Abraham-Lincoln-Vampire-...,Seth Grahame-Smith,Lincoln fights the undead; by the author of “P...,Grand Central,Abraham Lincoln: Vampire Hunter,5b4aa4ead3089013507db8fb,2010-03-21,2010-04-04,5,5,3,21.99,Seth,Grahame-Smith,Neither
847,http://www.amazon.com/Abraham-Lincoln-Vampire-...,Seth Grahame-Smith,Lincoln fights the undead; by the author of “P...,Grand Central,Abraham Lincoln: Vampire Hunter,5b4aa4ead3089013507db910,2010-03-28,2010-04-11,6,5,4,21.99,Seth,Grahame-Smith,Neither
1142,http://www.amazon.com/Squirrel-Seeks-Chipmunk-...,David Sedaris,The humorist looks at human nature through sto...,"Little, Brown",Squirrel Seeks Chipmunk,5b4aa4ead3089013507dbbd3,2010-11-28,2010-12-12,13,8,9,21.99,David,Sedaris,Neither


In [56]:
# Ordenamos de forma descendente los valores en 'weeks_on_list'
df.sort_values('weeks_on_list', ascending=False).head()

Unnamed: 0,amazon_product_url,author,description,publisher,title,oid,bestsellers_date,published_date,rank,rank_last_week,weeks_on_list,price,author_first_name,author_last_name,gender_in_description
1459,http://www.amazon.com/The-Help-Kathryn-Stocket...,Kathryn Stockett,A young white woman and two black maids in 196...,Amy Einhorn/Putnam,The Help,5b4aa4ead3089013507dbf07,2011-09-10,2011-09-25,13,9,108,24.95,Kathryn,Stockett,Women
1451,http://www.amazon.com/The-Help-Kathryn-Stocket...,Kathryn Stockett,A young white woman and two black maids in 196...,Amy Einhorn/Putnam,The Help,5b4aa4ead3089013507dbeef,2011-09-03,2011-09-18,9,8,107,24.95,Kathryn,Stockett,Women
1440,http://www.amazon.com/The-Help-Kathryn-Stocket...,Kathryn Stockett,A young white woman and two black maids in 196...,Amy Einhorn/Putnam,The Help,5b4aa4ead3089013507dbeda,2011-08-27,2011-09-11,8,4,106,24.95,Kathryn,Stockett,Women
1428,http://www.amazon.com/The-Help-Kathryn-Stocket...,Kathryn Stockett,A young white woman and two black maids in 196...,Amy Einhorn/Putnam,The Help,5b4aa4ead3089013507dbec2,2011-08-20,2011-09-04,4,11,105,24.95,Kathryn,Stockett,Women
1424,http://www.amazon.com/The-Help-Kathryn-Stocket...,Kathryn Stockett,A young white woman and two black maids in 196...,Amy Einhorn/Putnam,The Help,5b4aa4ead3089013507dbeb5,2011-08-13,2011-08-28,11,0,104,24.95,Kathryn,Stockett,Women
