# Preprocesamiento de un DataFrame

## Tipos de variables en un DataFrame

In [1]:
import pandas as pd

In [2]:
# Load the meteorite landings dataset from the local CSV file and
# display the resulting DataFrame as the cell output.
meteorites_csv = 'data/meteorite-landings.csv'
df = pd.read_csv(meteorites_csv)
df

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation
0,Aachen,1,Valid,L5,21.0,Fell,1880.0,50.77500,6.08333,"(50.775000, 6.083330)"
1,Aarhus,2,Valid,H6,720.0,Fell,1951.0,56.18333,10.23333,"(56.183330, 10.233330)"
2,Abee,6,Valid,EH4,107000.0,Fell,1952.0,54.21667,-113.00000,"(54.216670, -113.000000)"
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976.0,16.88333,-99.90000,"(16.883330, -99.900000)"
4,Achiras,370,Valid,L6,780.0,Fell,1902.0,-33.16667,-64.95000,"(-33.166670, -64.950000)"
...,...,...,...,...,...,...,...,...,...,...
45711,Zillah 002,31356,Valid,Eucrite,172.0,Found,1990.0,29.03700,17.01850,"(29.037000, 17.018500)"
45712,Zinder,30409,Valid,"Pallasite, ungrouped",46.0,Found,1999.0,13.78333,8.96667,"(13.783330, 8.966670)"
45713,Zlin,30410,Valid,H4,3.3,Found,1939.0,49.25000,17.66667,"(49.250000, 17.666670)"
45714,Zubkovsky,31357,Valid,L6,2167.0,Found,2003.0,49.78917,41.50460,"(49.789170, 41.504600)"


In [3]:
# Number of rows and columns, returned as a (rows, columns) tuple
df.shape

(45716, 10)

In [4]:
# Total number of elements in the DataFrame (rows * columns)
df.size

457160

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,id,mass,year,reclat,reclong
count,45716.0,45585.0,45428.0,38401.0,38401.0
mean,26889.735104,13278.08,1991.772189,-39.12258,61.074319
std,16860.68303,574988.9,27.181247,46.378511,80.647298
min,1.0,0.0,301.0,-87.36667,-165.43333
25%,12688.75,7.2,1987.0,-76.71424,0.0
50%,24261.5,32.6,1998.0,-71.5,35.66667
75%,40656.75,202.6,2003.0,0.0,157.16667
max,57458.0,60000000.0,2501.0,81.16667,354.47333


In [6]:
# Summary statistics for every column, categorical ones included;
# non-numeric columns report unique/top/freq instead of mean/std/quantiles
df.describe(include='all')

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation
count,45716,45716.0,45716,45716,45585.0,45716,45428.0,38401.0,38401.0,38401
unique,45716,,2,466,,2,,,,17100
top,Asuka 881161,,Valid,L6,,Found,,,,"(0.000000, 0.000000)"
freq,1,,45641,8285,,44609,,,,6214
mean,,26889.735104,,,13278.08,,1991.772189,-39.12258,61.074319,
std,,16860.68303,,,574988.9,,27.181247,46.378511,80.647298,
min,,1.0,,,0.0,,301.0,-87.36667,-165.43333,
25%,,12688.75,,,7.2,,1987.0,-76.71424,0.0,
50%,,24261.5,,,32.6,,1998.0,-71.5,35.66667,
75%,,40656.75,,,202.6,,2003.0,0.0,157.16667,


In [8]:
# Concise summary of the DataFrame: index range, non-null count and dtype
# per column, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         45716 non-null  object 
 1   id           45716 non-null  int64  
 2   nametype     45716 non-null  object 
 3   recclass     45716 non-null  object 
 4   mass         45585 non-null  float64
 5   fall         45716 non-null  object 
 6   year         45428 non-null  float64
 7   reclat       38401 non-null  float64
 8   reclong      38401 non-null  float64
 9   GeoLocation  38401 non-null  object 
dtypes: float64(4), int64(1), object(5)
memory usage: 3.5+ MB


In [9]:
# Data type currently assigned to each column
df.dtypes

name            object
id               int64
nametype        object
recclass        object
mass           float64
fall            object
year           float64
reclat         float64
reclong        float64
GeoLocation     object
dtype: object

In [10]:
# Preview the dtypes convert_dtypes() would choose (e.g. string, Int64, Float64);
# the result is not assigned, so df itself is left unchanged
df.convert_dtypes().dtypes

name            string
id               Int64
nametype        string
recclass        string
mass           Float64
fall            string
year             Int64
reclat         Float64
reclong        Float64
GeoLocation     string
dtype: object

In [11]:
# Number of distinct values per column; columns with very few distinct values
# (here 'fall' and 'nametype') are candidates for the category dtype
df.nunique()

name           45716
id             45716
nametype           2
recclass         466
mass           12576
fall               2
year             268
reclat         12738
reclong        14640
GeoLocation    17100
dtype: int64

In [13]:
# Inspect the two low-cardinality columns side by side (all rows, two columns)
df.loc[:, ['fall', 'nametype']]

Unnamed: 0,fall,nametype
0,Fell,Valid
1,Fell,Valid
2,Fell,Valid
3,Fell,Valid
4,Fell,Valid
...,...,...
45711,Found,Valid
45712,Found,Valid
45713,Found,Valid
45714,Found,Valid


In [16]:
# Convert the low-cardinality text columns to the pandas 'category' dtype,
# then list the dtypes to confirm the change took effect.
categorical_columns = ['fall', 'nametype']
df[categorical_columns] = df[categorical_columns].astype('category')
df.dtypes

name             object
id                int64
nametype       category
recclass         object
mass            float64
fall           category
year            float64
reclat          float64
reclong         float64
GeoLocation      object
dtype: object

In [17]:
# Distinct values present in the 'fall' column
df['fall'].unique()

['Fell', 'Found']
Categories (2, object): ['Fell', 'Found']

In [18]:
# Number of rows per 'fall' category, most frequent first
df['fall'].value_counts()

Found    44609
Fell      1107
Name: fall, dtype: int64

In [19]:
# One-hot encode 'fall': one indicator column per category ('Fell', 'Found').
# The dtype is pinned to uint8 because pandas >= 2.0 defaults to bool, which
# would render True/False instead of the 0/1 indicators expected here.
pd.get_dummies(df['fall'], dtype='uint8')

Unnamed: 0,Fell,Found
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
45711,0,1
45712,0,1
45713,0,1
45714,0,1


In [21]:
# Append the one-hot indicators for 'fall' to df as 'fell' and 'found'.
# - dtype is pinned to uint8 so the columns hold 0/1 on every pandas version
#   (pandas >= 2.0 would otherwise produce bool columns).
# - Each indicator is assigned by its category label instead of by position,
#   so a change in category order cannot silently swap the two columns.
fall_dummies = pd.get_dummies(df['fall'], dtype='uint8')
df['fell'] = fall_dummies['Fell']
df['found'] = fall_dummies['Found']
df

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation,fell,found
0,Aachen,1,Valid,L5,21.0,Fell,1880.0,50.77500,6.08333,"(50.775000, 6.083330)",1,0
1,Aarhus,2,Valid,H6,720.0,Fell,1951.0,56.18333,10.23333,"(56.183330, 10.233330)",1,0
2,Abee,6,Valid,EH4,107000.0,Fell,1952.0,54.21667,-113.00000,"(54.216670, -113.000000)",1,0
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976.0,16.88333,-99.90000,"(16.883330, -99.900000)",1,0
4,Achiras,370,Valid,L6,780.0,Fell,1902.0,-33.16667,-64.95000,"(-33.166670, -64.950000)",1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
45711,Zillah 002,31356,Valid,Eucrite,172.0,Found,1990.0,29.03700,17.01850,"(29.037000, 17.018500)",0,1
45712,Zinder,30409,Valid,"Pallasite, ungrouped",46.0,Found,1999.0,13.78333,8.96667,"(13.783330, 8.966670)",0,1
45713,Zlin,30410,Valid,H4,3.3,Found,1939.0,49.25000,17.66667,"(49.250000, 17.666670)",0,1
45714,Zubkovsky,31357,Valid,L6,2167.0,Found,2003.0,49.78917,41.50460,"(49.789170, 41.504600)",0,1


## Borrar filas y columnas 

In [23]:
# Drop the 'fell' column. drop() returns a new DataFrame, so df itself
# still contains the column afterwards.
df.drop(columns=['fell'])

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation,found
0,Aachen,1,Valid,L5,21.0,Fell,1880.0,50.77500,6.08333,"(50.775000, 6.083330)",0
1,Aarhus,2,Valid,H6,720.0,Fell,1951.0,56.18333,10.23333,"(56.183330, 10.233330)",0
2,Abee,6,Valid,EH4,107000.0,Fell,1952.0,54.21667,-113.00000,"(54.216670, -113.000000)",0
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976.0,16.88333,-99.90000,"(16.883330, -99.900000)",0
4,Achiras,370,Valid,L6,780.0,Fell,1902.0,-33.16667,-64.95000,"(-33.166670, -64.950000)",0
...,...,...,...,...,...,...,...,...,...,...,...
45711,Zillah 002,31356,Valid,Eucrite,172.0,Found,1990.0,29.03700,17.01850,"(29.037000, 17.018500)",1
45712,Zinder,30409,Valid,"Pallasite, ungrouped",46.0,Found,1999.0,13.78333,8.96667,"(13.783330, 8.966670)",1
45713,Zlin,30410,Valid,H4,3.3,Found,1939.0,49.25000,17.66667,"(49.250000, 17.666670)",1
45714,Zubkovsky,31357,Valid,L6,2167.0,Found,2003.0,49.78917,41.50460,"(49.789170, 41.504600)",1


In [25]:
# Drop the rows whose index labels are 0, 2, 4 and 6. As with columns,
# the result is a new DataFrame and df is left untouched.
df.drop(index=[0, 2, 4, 6])

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation,fell,found
1,Aarhus,2,Valid,H6,720.0,Fell,1951.0,56.18333,10.23333,"(56.183330, 10.233330)",1,0
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976.0,16.88333,-99.90000,"(16.883330, -99.900000)",1,0
5,Adhi Kot,379,Valid,EH4,4239.0,Fell,1919.0,32.10000,71.80000,"(32.100000, 71.800000)",1,0
7,Agen,392,Valid,H5,30000.0,Fell,1814.0,44.21667,0.61667,"(44.216670, 0.616670)",1,0
8,Aguada,398,Valid,L6,1620.0,Fell,1930.0,-31.60000,-65.23333,"(-31.600000, -65.233330)",1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
45711,Zillah 002,31356,Valid,Eucrite,172.0,Found,1990.0,29.03700,17.01850,"(29.037000, 17.018500)",0,1
45712,Zinder,30409,Valid,"Pallasite, ungrouped",46.0,Found,1999.0,13.78333,8.96667,"(13.783330, 8.966670)",0,1
45713,Zlin,30410,Valid,H4,3.3,Found,1939.0,49.25000,17.66667,"(49.250000, 17.666670)",0,1
45714,Zubkovsky,31357,Valid,L6,2167.0,Found,2003.0,49.78917,41.50460,"(49.789170, 41.504600)",0,1


In [27]:
# Make an independent copy of the whole DataFrame (copy() is deep by default),
# so later changes to df1 do not affect df.
df1 = df.copy()
df1

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation,fell,found
0,Aachen,1,Valid,L5,21.0,Fell,1880.0,50.77500,6.08333,"(50.775000, 6.083330)",1,0
1,Aarhus,2,Valid,H6,720.0,Fell,1951.0,56.18333,10.23333,"(56.183330, 10.233330)",1,0
2,Abee,6,Valid,EH4,107000.0,Fell,1952.0,54.21667,-113.00000,"(54.216670, -113.000000)",1,0
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976.0,16.88333,-99.90000,"(16.883330, -99.900000)",1,0
4,Achiras,370,Valid,L6,780.0,Fell,1902.0,-33.16667,-64.95000,"(-33.166670, -64.950000)",1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
45711,Zillah 002,31356,Valid,Eucrite,172.0,Found,1990.0,29.03700,17.01850,"(29.037000, 17.018500)",0,1
45712,Zinder,30409,Valid,"Pallasite, ungrouped",46.0,Found,1999.0,13.78333,8.96667,"(13.783330, 8.966670)",0,1
45713,Zlin,30410,Valid,H4,3.3,Found,1939.0,49.25000,17.66667,"(49.250000, 17.666670)",0,1
45714,Zubkovsky,31357,Valid,L6,2167.0,Found,2003.0,49.78917,41.50460,"(49.789170, 41.504600)",0,1
