# Transformación de datos

### <font color=#845600> Filtrado y selección de datos

**Librerías**

In [2]:
import pandas as pd

**Importado de datos**

In [3]:
# Read the NYC flights CSV into `df` and preview its first rows.
df = pd.read_csv(filepath_or_buffer="nycflights.csv")
df.head(n=5)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
0,2013,6,30,940,15,1216,-4,VX,N626VA,407,JFK,LAX,313,2475,9,40
1,2013,5,7,1657,-3,2104,10,DL,N3760C,329,JFK,SJU,216,1598,16,57
2,2013,12,8,859,-1,1238,11,DL,N712TW,422,JFK,LAX,376,2475,8,59
3,2013,5,14,1841,-4,2122,-34,DL,N914DL,2391,JFK,TPA,135,1005,18,41
4,2013,7,21,1102,-3,1230,-8,9E,N823AY,3652,LGA,ORF,50,296,11,2


#### <font color=#845600> Filtro a través del valor de una columna

In [4]:
# Keep only the rows whose "origin" column equals "LGA".
df1 = df[df["origin"].eq("LGA")]
df1.head(n=5)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
4,2013,7,21,1102,-3,1230,-8,9E,N823AY,3652,LGA,ORF,50,296,11,2
5,2013,1,1,1817,-3,2008,3,AA,N3AXAA,353,LGA,ORD,138,733,18,17
8,2013,9,26,725,-10,1027,-8,AA,N3FSAA,2279,LGA,MIA,148,1096,7,25
11,2013,11,22,1320,5,1628,-2,B6,N526JB,1639,LGA,RSW,161,1080,13,20
13,2013,3,25,2054,115,2256,91,FL,N919AT,645,LGA,ATL,104,762,20,54


In [None]:
# Keep only the rows whose "origin" column is different from "LGA".
df1 = df[df["origin"].ne("LGA")]
df1.head(n=5)

In [None]:
# `&` is the element-wise "AND": a row is kept only when every condition holds.
df1 = df[
    df["origin"].eq("LGA")
    & df["year"].eq(2013)
    & df["air_time"].le(50)
]
df1.head(n=5)

In [None]:
# `|` is the element-wise "OR": a row is kept when at least one condition holds.
df1 = df[df["month"].eq(5) | df["month"].eq(12)]
df1.head(n=5)

Uso de una máscara para filtrar datos

In [10]:
# Store the boolean condition in `mask`, then use it to pick the matching rows
# together with a subset of columns.
mask = df['origin'].eq('LGA') & df['month'].eq(12)
df.loc[mask, ['year', 'origin', 'month']].head(n=5)

Unnamed: 0,year,origin,month
34,2013,LGA,12
37,2013,LGA,12
49,2013,LGA,12
50,2013,LGA,12
68,2013,LGA,12


Es posible usar otras variables para filtrar datos:

In [11]:
# The value compared against can come from an ordinary Python variable.
month_var = 5
df1 = df[df["month"].eq(month_var) & df["air_time"].gt(50)]
df1.head(n=5)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
1,2013,5,7,1657,-3,2104,10,DL,N3760C,329,JFK,SJU,216,1598,16,57
3,2013,5,14,1841,-4,2122,-34,DL,N914DL,2391,JFK,TPA,135,1005,18,41
36,2013,5,13,1825,-4,2000,-35,DL,N919DE,2131,LGA,DTW,77,502,18,25
63,2013,5,26,1417,-8,1529,-21,B6,N184JB,8,JFK,BUF,56,301,14,17
67,2013,5,25,1610,-5,1827,-13,MQ,N525MQ,4657,LGA,ATL,102,762,16,10


Se puede negar cualquier criterio de filtro

In [4]:
# `~` inverts a boolean condition: keep every row that does NOT satisfy
# "month equals month_var AND air_time greater than 50".
month_var = 5
df1 = df[~(df["month"].eq(month_var) & df["air_time"].gt(50))]
df1.head(n=5)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
0,2013,6,30,940,15,1216,-4,VX,N626VA,407,JFK,LAX,313,2475,9,40
2,2013,12,8,859,-1,1238,11,DL,N712TW,422,JFK,LAX,376,2475,8,59
4,2013,7,21,1102,-3,1230,-8,9E,N823AY,3652,LGA,ORF,50,296,11,2
5,2013,1,1,1817,-3,2008,3,AA,N3AXAA,353,LGA,ORD,138,733,18,17
6,2013,12,9,1259,14,1617,22,WN,N218WN,1428,EWR,HOU,240,1411,12,59


#### <font color=#845600> Filtro a través de la función loc

In [15]:
# Same filter as before, expressed through .loc: rows whose "origin" is "LGA".
df1 = df.loc[df["origin"].eq("LGA")]
df1.head(n=5)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
4,2013,7,21,1102,-3,1230,-8,9E,N823AY,3652,LGA,ORF,50,296,11,2
5,2013,1,1,1817,-3,2008,3,AA,N3AXAA,353,LGA,ORD,138,733,18,17
8,2013,9,26,725,-10,1027,-8,AA,N3FSAA,2279,LGA,MIA,148,1096,7,25
11,2013,11,22,1320,5,1628,-2,B6,N526JB,1639,LGA,RSW,161,1080,13,20
13,2013,3,25,2054,115,2256,91,FL,N919AT,645,LGA,ATL,104,762,20,54


In [None]:
# .loc with a boolean condition: rows whose "origin" is not "LGA".
df1 = df.loc[df["origin"].ne("LGA")]
df1.head(n=5)

In [None]:
# `&` is the element-wise "AND": all three conditions must hold for a row to be kept.
df1 = df.loc[
    df["origin"].eq("LGA")
    & df["year"].eq(2013)
    & df["air_time"].le(50)
]
df1.head(n=5)

In [None]:
# `|` is the element-wise "OR": rows from month 5 or from month 12 are kept.
df1 = df.loc[df["month"].eq(5) | df["month"].eq(12)]
df1.head(n=5)

Es posible usar otras variables para filtrar datos

In [None]:
# The month to match is taken from a variable instead of being hard-coded in the filter.
month_var = 5
df1 = df.loc[df["month"].eq(month_var) & df["air_time"].gt(50)]
df1.head(n=5)

Se puede negar cualquier criterio de filtro

In [6]:
# `~` negates the combined condition, so .loc returns the rows that fail it.
month_var = 5
df1 = df.loc[~(df["month"].eq(month_var) & df["air_time"].gt(50))]
df1.head(n=5)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
0,2013,6,30,940,15,1216,-4,VX,N626VA,407,JFK,LAX,313,2475,9,40
2,2013,12,8,859,-1,1238,11,DL,N712TW,422,JFK,LAX,376,2475,8,59
4,2013,7,21,1102,-3,1230,-8,9E,N823AY,3652,LGA,ORF,50,296,11,2
5,2013,1,1,1817,-3,2008,3,AA,N3AXAA,353,LGA,ORD,138,733,18,17
6,2013,12,9,1259,14,1617,22,WN,N218WN,1428,EWR,HOU,240,1411,12,59


#### <font color=#845600> Filtro mediante el método **query**

In [4]:
# Preview the first rows of `df` before filtering it with query().
df.head(n=5)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
0,2013,6,30,940,15,1216,-4,VX,N626VA,407,JFK,LAX,313,2475,9,40
1,2013,5,7,1657,-3,2104,10,DL,N3760C,329,JFK,SJU,216,1598,16,57
2,2013,12,8,859,-1,1238,11,DL,N712TW,422,JFK,LAX,376,2475,8,59
3,2013,5,14,1841,-4,2122,-34,DL,N914DL,2391,JFK,TPA,135,1005,18,41
4,2013,7,21,1102,-3,1230,-8,9E,N823AY,3652,LGA,ORF,50,296,11,2


In [21]:
# query() receives the filter as a string: carrier equal to "DL" and a
# departure delay of at least 350.
df.query(expr='carrier == "DL" & dep_delay >= 350')

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
731,2013,8,11,100,360,318,335,DL,N3750D,2454,JFK,DEN,236,1626,1,0
7277,2013,12,19,734,849,1046,847,DL,N375NC,1223,EWR,SLC,290,1969,7,34
10919,2013,9,2,2248,358,31,332,DL,N333NB,1473,LGA,MEM,135,963,22,48
18146,2013,5,23,2136,351,113,372,DL,N371NW,141,JFK,FLL,152,1069,21,36
20076,2013,8,28,2315,436,158,399,DL,N3763D,1373,JFK,MIA,134,1089,23,15
23945,2013,7,7,2347,362,135,325,DL,N3730B,884,LGA,DEN,201,1620,23,47
26639,2013,6,27,615,790,853,769,DL,N372DA,503,JFK,SAN,312,2446,6,15
32211,2013,6,28,121,502,329,490,DL,N360NB,2042,EWR,ATL,106,746,1,21


#### <font color=#845600> Filtro por posición de filas y columnas

In [None]:
# Position-based selection with iloc (row slice first, column slice second).
# A notebook cell renders only its last expression, so each example is passed
# to display() to make every result visible instead of just the final one.
display(df.iloc[:4])         # first 4 rows, all columns
display(df.iloc[1:5, ])      # second to fifth row
display(df.iloc[5, 0])       # sixth row and first column
display(df.iloc[1:5, 0])     # second to fifth row, first column
display(df.iloc[1:5, :5])    # second to fifth row, first 5 columns
display(df.iloc[2:7, 1:3])   # third to seventh row, 2nd and 3rd column

#### <font color=#845600> Filtro por posición de filas y nombre de columnas

In [None]:
# Rows are picked by position through df.index (positions 10 to 15, i.e. six
# rows) and columns by name ("origin" and "distance"); head() then shows the
# first five of them.
row_labels = df.index[10:16]
df1 = df.loc[row_labels, ["origin", "distance"]]
df1.head(n=5)

In [None]:
# Multiple filter: the first .loc argument selects the rows that meet all three
# conditions, the second one lists the columns to keep.
df1 = df.loc[
    df["origin"].eq("LGA") & df["year"].eq(2013) & df["air_time"].le(50),
    ["origin", "distance", "year"],
]
df1.head(n=5)

#### <font color=#845600> Filtro para seleccionar múltiples valores

In [3]:
# isin() keeps the rows whose "origin" matches any of the listed values.
df1 = df.loc[df["origin"].isin(["JFK", "LGA"])]
df1.head(n=5)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
0,2013,6,30,940,15,1216,-4,VX,N626VA,407,JFK,LAX,313,2475,9,40
1,2013,5,7,1657,-3,2104,10,DL,N3760C,329,JFK,SJU,216,1598,16,57
2,2013,12,8,859,-1,1238,11,DL,N712TW,422,JFK,LAX,376,2475,8,59
3,2013,5,14,1841,-4,2122,-34,DL,N914DL,2391,JFK,TPA,135,1005,18,41
4,2013,7,21,1102,-3,1230,-8,9E,N823AY,3652,LGA,ORF,50,296,11,2


Los valores que se deben conservar pueden ser almacenados en una lista

In [None]:
# The values to keep live in a list that is handed to isin().
variables = ["JFK", "LGA"]
df1 = df.loc[df["origin"].isin(variables)]
df1.head(n=5)

Los valores que **no** se deben conservar pueden ser almacenados en una lista

In [None]:
# The values to discard live in a list; `~` inverts isin() so matching rows are removed.
remove_var = ["JFK", "LGA"]
df1 = df.loc[~df["origin"].isin(remove_var)]
df1.head(n=5)

#### <font color=#845600> Filtrar columnas

In [11]:
# Column selection by label range: every row, columns from 'flight' through
# 'dest' (a label slice includes both end points).
df1 = df.loc[:, slice("flight", "dest")]
df1.head(n=5)

Unnamed: 0,flight,origin,dest
0,407,JFK,LAX
1,329,JFK,SJU
2,422,JFK,LAX
3,2391,JFK,TPA
4,3652,LGA,ORF


In [9]:
# Column selection with .loc: every row, only the 'flight' and 'dest' columns.
wanted_columns = ["flight", "dest"]
df1 = df.loc[:, wanted_columns]
df1.head(n=5)

Unnamed: 0,flight,dest
0,407,LAX
1,329,SJU
2,422,LAX
3,2391,TPA
4,3652,ORF


In [8]:
# Column selection with plain brackets: a list of names returns those columns
# ('flight' and 'dest') for every row.
wanted_columns = ["flight", "dest"]
df1 = df[wanted_columns]
df1.head(n=5)

Unnamed: 0,flight,dest
0,407,LAX
1,329,SJU
2,422,LAX
3,2391,TPA
4,3652,ORF


In [37]:
# Keep every column except 'flight', 'dest' and 'year' by dropping those three.
# drop() can also remove rows when that is needed.
df1 = df.drop(columns=['flight', 'dest', 'year'])
df1.head(n=5)

Unnamed: 0,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,origin,air_time,distance,hour,minute
0,6,30,940,15,1216,-4,VX,N626VA,JFK,313,2475,9,40
1,5,7,1657,-3,2104,10,DL,N3760C,JFK,216,1598,16,57
2,12,8,859,-1,1238,11,DL,N712TW,JFK,376,2475,8,59
3,5,14,1841,-4,2122,-34,DL,N914DL,JFK,135,1005,18,41
4,7,21,1102,-3,1230,-8,9E,N823AY,LGA,50,296,11,2


#### <font color=#845600> Filtros con el método "query"

In [1]:
import pandas as pd

In [4]:
# Read the seaborn "tips" CSV from its URL into `df` and preview its first rows.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv"
)
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


Primero usamos el método .loc para visualizar el resultado deseado

In [5]:
# Reference result built with a boolean mask: rows where sex is 'Male' and
# size is greater than 2.
mask = df['sex'].eq('Male') & df['size'].gt(2)
df.loc[mask].head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
5,25.29,4.71,Male,No,Sun,Dinner,4
7,26.88,3.12,Male,No,Sun,Dinner,4
13,18.43,3.0,Male,No,Sun,Dinner,4


Ahora mostramos el mismo resultado pero utilizando el método query. Nótese que los criterios de texto deben escribirse entre comillas dobles ""

In [9]:
# Same filter written as a query() string; the text value goes in double quotes.
df.query(expr='sex=="Male" & size > 2').head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
5,25.29,4.71,Male,No,Sun,Dinner,4
7,26.88,3.12,Male,No,Sun,Dinner,4
13,18.43,3.0,Male,No,Sun,Dinner,4


Es posible usar listas para múltiples criterios de filtro

In [11]:
# `@days` lets the query string refer to the Python list defined here.
days = ["Sat", "Sun"]
df.query(expr='sex=="Male" & size>2 & day == @days').head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
5,25.29,4.71,Male,No,Sun,Dinner,4
7,26.88,3.12,Male,No,Sun,Dinner,4
13,18.43,3.0,Male,No,Sun,Dinner,4


#### <font color=#845600> Filtrar valores duplicados

In [3]:
# Load the NYC flights CSV again into `df` and preview its first rows.
df = pd.read_csv(filepath_or_buffer="nycflights.csv")
df.head(n=5)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
0,2013,6,30,940,15,1216,-4,VX,N626VA,407,JFK,LAX,313,2475,9,40
1,2013,5,7,1657,-3,2104,10,DL,N3760C,329,JFK,SJU,216,1598,16,57
2,2013,12,8,859,-1,1238,11,DL,N712TW,422,JFK,LAX,376,2475,8,59
3,2013,5,14,1841,-4,2122,-34,DL,N914DL,2391,JFK,TPA,135,1005,18,41
4,2013,7,21,1102,-3,1230,-8,9E,N823AY,3652,LGA,ORF,50,296,11,2


Elimina las filas que tienen valores duplicados. Así, la tabla resultante contendrá valores únicos según los parámetros de la función.

In [4]:
# drop_duplicates() parameters used below:
#   subset       - column(s) compared to decide whether two rows are duplicates.
#   keep         - 'first' keeps the first occurrence of each value, 'last'
#                  keeps the last one, False drops every row that has a duplicate.
#   inplace      - False returns a new DataFrame and leaves `df` untouched;
#                  True modifies `df` itself and returns None.
#   ignore_index - True relabels the result 0, 1, ..., n - 1; False (default)
#                  keeps the original row labels.
df1 = df.drop_duplicates(
    subset=["origin"],
    keep="first",
    inplace=False,
    ignore_index=False,
)

df1.head(n=5)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
0,2013,6,30,940,15,1216,-4,VX,N626VA,407,JFK,LAX,313,2475,9,40
4,2013,7,21,1102,-3,1230,-8,9E,N823AY,3652,LGA,ORF,50,296,11,2
6,2013,12,9,1259,14,1617,22,WN,N218WN,1428,EWR,HOU,240,1411,12,59


Se pueden buscar valores duplicados en una combinación de varias columnas

In [87]:
# Small example frame: rows 1 and 2 share the same Age and University.
df = pd.DataFrame(
    [
        ('Carlos', 23, 'AA'),
        ('Andrés', 24, 'BB'),
        ('Alejandro', 24, 'BB'),
        ('Santiago', 25, 'DD'),
    ],
    columns=['Name', 'Age', 'University'],
)

# Duplicates are detected on the (Age, University) combination.
#   keep=False        - every row that has a duplicate is dropped
#                       ('first' / 'last' would keep one occurrence instead).
#   inplace=False     - a new DataFrame is returned; `df` is left untouched.
#   ignore_index=True - the result is relabelled 0, 1, ..., n - 1.
df1 = df.drop_duplicates(
    subset=["Age", "University"],
    keep=False,
    inplace=False,
    ignore_index=True,
)

df1.head(n=5)

Unnamed: 0,Name,Age,University
0,Carlos,23,AA
1,Santiago,25,DD


Conservar **solamente** los valores duplicados

In [95]:
# Small example frame: rows 1 and 2 share the same Age and University.
df = pd.DataFrame(
    [
        ('Carlos', 23, 'AA'),
        ('Andrés', 24, 'BB'),
        ('Alejandro', 24, 'BB'),
        ('Santiago', 25, 'BB'),
    ],
    columns=['Name', 'Age', 'University'],
)

# duplicated() flags repeated rows; `keep` decides which occurrences are flagged:
#   'first' (default) - all duplicates except the first occurrence are True.
#   'last'            - all duplicates except the last occurrence are True.
#   False             - every duplicated row is True.
# With keep=False the flag selects exactly the rows that are repeated on
# the (Age, University) combination.
is_repeated = df.duplicated(subset=["Age", "University"], keep=False)
df1 = df[is_repeated]
df1.head(n=5)

Unnamed: 0,Name,Age,University
1,Andrés,24,BB
2,Alejandro,24,BB


#### <font color=#845600> Filtrar valores faltantes

In [50]:
# numpy supplies np.nan, one of the markers used for a missing value.
import numpy as np

# dropna() parameters:
#   axis    - 0 / 'index' (default) drops rows that contain missing values;
#             1 / 'columns' drops columns that contain missing values.
#   how     - 'any' (default) drops the row/column when at least one value is
#             missing; 'all' drops it only when every value is missing.
#   subset  - labels along the other axis to inspect, e.g. a list of columns
#             when rows are being dropped.
#   inplace - False (default) returns a new object; True operates in place
#             and returns None.
#
# dropna and notnull treat np.nan, None and pd.NA alike as missing values.

df = pd.DataFrame(
    data={
        'Name': ['Carlos', 'Andrés', np.nan, 'Santiago', "Fernand0", "Marcelo", np.nan],
        'Age': [23, 24, 24, 25, None, 27, None],
        'University': ['AA', pd.NA, 'BB', None, 'CC', 'EE', pd.NA],
    }
)

# how="any": a row survives only when none of its fields is missing
# (how="all" would drop only the rows in which every field is missing).
df1 = df.dropna(axis="index", how="any")
df1.head(n=10)

Unnamed: 0,Name,Age,University
0,Carlos,23.0,AA
5,Marcelo,27.0,EE


Cuenta de los valores no nulos en el data frame

In [30]:
# Number of non-missing values in each column of `df`.
column_notnull = df.notna().sum()
column_notnull

Name          5
Age           5
University    4
dtype: int64

#### <font color=#845600> Filtrar filas según una cadena de texto

In [12]:
df = pd.DataFrame(
    data={
        'Name': ['Carlos', 'Andrés', np.nan, 'Carmina', "Fernand0", "Marcelo", np.nan],
        'Age': [23, 24, 24, 25, None, 27, None],
        'University': ['AA', pd.NA, 'BB', None, 'CC', 'EE', pd.NA],
    }
)

# Keep the rows whose 'Name' has 'C' as its first character; rows with a
# missing name never match.
first_letter = df['Name'].str.get(0)
df1 = df[first_letter.eq('C')]
df1.head(n=5)

Unnamed: 0,Name,Age,University
0,Carlos,23.0,AA
3,Carmina,25.0,


In [15]:
df = pd.DataFrame(
    data={
        'Name': ['Carlos', 'Andrés', np.nan, 'Carmina', "Fernand0", "Marcelo", np.nan],
        'Age': [23, 24, 24, 25, None, 27, None],
        'University': ['AA', pd.NA, 'BB', None, 'CC', 'EE', pd.NA],
    }
)

# Keep the rows whose 'Name' has 'Ca' as its first two characters; rows with a
# missing name never match.
first_two_letters = df['Name'].str.slice(0, 2)
df1 = df[first_two_letters.eq('Ca')]
df1.head(n=5)

Unnamed: 0,Name,Age,University
0,Carlos,23.0,AA
3,Carmina,25.0,


In [36]:
# First five characters of every name (missing names stay missing).
df['Name'].str.slice(0, 5)

0    Carlo
1    André
2      NaN
3    Carmi
4    Ferna
5    Marce
6      NaN
Name: Name, dtype: object

In [55]:
df = pd.DataFrame({'Name'       : ['Carlos', 'Andrés', np.nan, 'Carmina', "Fernand0", "Marcelo", np.nan],
                   'Age'        : [23, 24, 24, 25, None, 27, None],
                   'University' : ['AA', pd.NA, 'BB', None, 'CC','EE', pd.NA]})

# Keep the rows whose 'Name' starts with "Ca". startswith() yields a missing
# value for the missing names, so na=False is passed to count them as
# non-matches and obtain a clean boolean mask, instead of comparing the
# result against True.
df1 = df[df['Name'].str.startswith("Ca", na=False)]
df1.head()

Unnamed: 0,Name,Age,University
0,Carlos,23.0,AA
3,Carmina,25.0,


In [57]:
df = pd.DataFrame({'Name'       : ['Carlos', 'Andrés', np.nan, 'Carmina', "Fernand0", "Marcos", np.nan],
                   'Age'        : [23, 24, 24, 25, None, 27, None],
                   'University' : ['AA', pd.NA, 'BB', None, 'CC','EE', pd.NA]})

# Keep the rows whose 'Name' ends with "os". endswith() yields a missing value
# for the missing names, so na=False is passed to count them as non-matches
# and obtain a clean boolean mask, instead of comparing the result against True.
df1 = df[df['Name'].str.endswith("os", na=False)]
df1.head()

Unnamed: 0,Name,Age,University
0,Carlos,23.0,AA
5,Marcos,27.0,EE


In [51]:
# Raw result of startswith() on the 'Name' column: True/False for real names
# and a missing value where the name itself is missing.
x = df.loc[:, "Name"]
x.str.startswith(pat="Ca")

0     True
1    False
2      NaN
3     True
4    False
5    False
6      NaN
Name: Name, dtype: object

#### <font color=#845600> Filtrar los valores más grandes

In [25]:
# The previous section rebinds `df` to a small example frame that has no
# 'dep_delay' column, so the flights data is loaded again before sorting;
# otherwise this cell fails on a fresh top-to-bottom run.
df = pd.read_csv("nycflights.csv")

# Five largest departure delays: sort in descending order and keep the top five rows.
df.sort_values(by = ['dep_delay'], ascending = False).head(5)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
30381,2013,1,9,641,1301,1242,1272,HA,N384HA,51,JFK,HNL,640,4983,6,41
7277,2013,12,19,734,849,1046,847,DL,N375NC,1223,EWR,SLC,290,1969,7,34
18766,2013,6,27,753,803,937,802,AA,N571AA,2019,LGA,STL,134,888,7,53
26639,2013,6,27,615,790,853,769,DL,N372DA,503,JFK,SAN,312,2446,6,15
32211,2013,6,28,121,502,329,490,DL,N360NB,2042,EWR,ATL,106,746,1,21


In [27]:
# nlargest() returns the five rows with the highest 'dep_delay' directly.
df.nlargest(n=5, columns='dep_delay')

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
30381,2013,1,9,641,1301,1242,1272,HA,N384HA,51,JFK,HNL,640,4983,6,41
7277,2013,12,19,734,849,1046,847,DL,N375NC,1223,EWR,SLC,290,1969,7,34
18766,2013,6,27,753,803,937,802,AA,N571AA,2019,LGA,STL,134,888,7,53
26639,2013,6,27,615,790,853,769,DL,N372DA,503,JFK,SAN,312,2446,6,15
32211,2013,6,28,121,502,329,490,DL,N360NB,2042,EWR,ATL,106,746,1,21


#### <font color=#845600> Filtrar los valores más pequeños

In [28]:
# Five smallest departure delays: sort in ascending order and keep the top five rows.
df.sort_values(by='dep_delay', ascending=True).head(n=5)

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
27076,2013,9,21,1539,-21,1824,-42,B6,N789JB,1117,LGA,TPA,141,1010,15,39
24194,2013,8,21,1804,-21,2015,-17,DL,N346NB,1715,LGA,MSY,157,1183,18,4
8285,2013,11,14,1038,-21,1218,-36,MQ,N853MQ,3281,LGA,CMH,74,479,10,38
21806,2013,11,10,2046,-20,2157,-16,B6,N328JB,2680,EWR,BOS,46,200,20,46
4505,2013,3,16,1800,-20,2128,-7,AS,N552AS,7,EWR,SEA,352,2402,18,0


In [30]:
# nsmallest() returns the five rows with the lowest 'dep_delay' directly.
df.nsmallest(n=5, columns='dep_delay')

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
8285,2013,11,14,1038,-21,1218,-36,MQ,N853MQ,3281,LGA,CMH,74,479,10,38
24194,2013,8,21,1804,-21,2015,-17,DL,N346NB,1715,LGA,MSY,157,1183,18,4
27076,2013,9,21,1539,-21,1824,-42,B6,N789JB,1117,LGA,TPA,141,1010,15,39
4505,2013,3,16,1800,-20,2128,-7,AS,N552AS,7,EWR,SEA,352,2402,18,0
21806,2013,11,10,2046,-20,2157,-16,B6,N328JB,2680,EWR,BOS,46,200,20,46
