In [38]:
import pandas as pd

# Set format display values in PD
pd.options.display.float_format = '{:,.1f}'.format

In [39]:
df_meteorites = pd.read_csv('../db/Meteorite_Landings.csv')
# Sample => escoge aletaroiamente un registro
df_meteorites.sample()

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation
2116,Allan Hills 85054,920,Valid,H5,55.3,Found,01/01/1985 12:00:00 AM,-76.8,156.5,"(-76.84514, 156.45805)"


In [40]:
# .shape
# retorna filas y columnas del archivo
print("df_meteorites.shape", df_meteorites.shape)

df_meteorites.shape (45716, 10)


In [41]:
# .describe()
# Resumenm estadistico
print(df_meteorites.describe())
print(df_meteorites.describe(include='all'))

# .info()
# .dtypes() // Informacion de tipo de columna
# Informacion de categoria y tipo
print(df_meteorites.info())
print(df_meteorites.dtypes)

            id     mass (g)   reclat  reclong
count 45,716.0     45,585.0 38,401.0 38,401.0
mean  26,889.7     13,278.1    -39.1     61.1
std   16,860.7    574,988.9     46.4     80.6
min        1.0          0.0    -87.4   -165.4
25%   12,688.8          7.2    -76.7      0.0
50%   24,261.5         32.6    -71.5     35.7
75%   40,656.8        202.6      0.0    157.2
max   57,458.0 60,000,000.0     81.2    354.5
            name       id nametype recclass     mass (g)   fall  \
count      45716 45,716.0    45716    45716     45,585.0  45716   
unique     45716      NaN        2      466          NaN      2   
top     Kamyshla      NaN    Valid       L6          NaN  Found   
freq           1      NaN    45641     8285          NaN  44609   
mean         NaN 26,889.7      NaN      NaN     13,278.1    NaN   
std          NaN 16,860.7      NaN      NaN    574,988.9    NaN   
min          NaN      1.0      NaN      NaN          0.0    NaN   
25%          NaN 12,688.8      NaN      NaN       

In [42]:
# .convert_dtypes().dtypes
# Trasnformacion de datos mas compresibles (Object => string)
df_meteorites.convert_dtypes().dtypes


name            string
id               Int64
nametype        string
recclass        string
mass (g)       Float64
fall            string
year            string
reclat         Float64
reclong        Float64
GeoLocation     string
dtype: object

In [43]:
# .nunique()
# Hace un distinc de cada valor unico sobre cada columna, retornando la cantidad de valores unicos
# Sirve para identificar rapidamente valores categoricos
print(df_meteorites.nunique())

name           45716
id             45716
nametype           2
recclass         466
mass (g)       12576
fall               2
year             266
reclat         12738
reclong        14640
GeoLocation    17100
dtype: int64


In [44]:
# .nunique()
# aplicado a una columna realiza, el distinc y muestra la cantidad de valores unicos
print(df_meteorites['fall'].nunique())

2


In [45]:
# .value_counts()
# realiza la agrupacion por valores unicos, mostrando la cantidad
print(df_meteorites['fall'].value_counts())

Found    44609
Fell      1107
Name: fall, dtype: int64


In [46]:
# Seteando variables string a variables categoricas, para mejorar el performance
df_meteorites[['fall', 'nametype']] = df_meteorites[['fall', 'nametype']].astype('category')
print(df_meteorites.dtypes)

name             object
id                int64
nametype       category
recclass         object
mass (g)        float64
fall           category
year             object
reclat          float64
reclong         float64
GeoLocation      object
dtype: object


In [47]:
# Seteando las fechas
# errors='coerce', // si al parsear genera error, colocara un nan
df_meteorites['year'] = pd.to_datetime(
    df_meteorites['year'],
    errors='coerce',
    format='%m/%d/%Y %H:%M:%S %p'
)
print(df_meteorites.dtypes)

name                   object
id                      int64
nametype             category
recclass               object
mass (g)              float64
fall                 category
year           datetime64[ns]
reclat                float64
reclong               float64
GeoLocation            object
dtype: object


In [48]:
# Renombrando una columna sin reasignar
# inplace=True // Reemplaza a la ves que se ejecuta
df_meteorites.rename(columns={'mass (g)': 'mass_update'}, inplace=True)
df_meteorites.sample()

Unnamed: 0,name,id,nametype,recclass,mass_update,fall,year,reclat,reclong,GeoLocation
8861,Dominion Range 08417,52155,Valid,L6,31.5,Found,2008-01-01 12:00:00,0.0,0.0,"(0.0, 0.0)"


In [49]:
# Creando una columna en un DF
df_meteorites['nueva_columna'] = 1
# Eliminando una columna
# axis=1 // 1 columna || 0 fila
df_meteorites.drop(['nueva_columna'], axis=1, inplace=True)
# especificando las columnas y filas a liminar
df_meteorites.drop(columns=['id', 'recclass'], index=[0, 2, 4, 6, 8], inplace=True)
df_meteorites.head()


Unnamed: 0,name,nametype,mass_update,fall,year,reclat,reclong,GeoLocation
1,Aarhus,Valid,720.0,Fell,1951-01-01 12:00:00,56.2,10.2,"(56.18333, 10.23333)"
3,Acapulco,Valid,1914.0,Fell,1976-01-01 12:00:00,16.9,-99.9,"(16.88333, -99.9)"
5,Adhi Kot,Valid,4239.0,Fell,1919-01-01 12:00:00,32.1,71.8,"(32.1, 71.8)"
7,Agen,Valid,30000.0,Fell,1814-01-01 12:00:00,44.2,0.6,"(44.21667, 0.61667)"
9,Aguila Blanca,Valid,1440.0,Fell,1920-01-01 12:00:00,-30.9,-64.5,"(-30.86667, -64.55)"


In [50]:
# Eliminando los registros con nan
df_meteorites['mass_update'] = df_meteorites['mass_update'].dropna()
df_meteorites.head()


Unnamed: 0,name,nametype,mass_update,fall,year,reclat,reclong,GeoLocation
1,Aarhus,Valid,720.0,Fell,1951-01-01 12:00:00,56.2,10.2,"(56.18333, 10.23333)"
3,Acapulco,Valid,1914.0,Fell,1976-01-01 12:00:00,16.9,-99.9,"(16.88333, -99.9)"
5,Adhi Kot,Valid,4239.0,Fell,1919-01-01 12:00:00,32.1,71.8,"(32.1, 71.8)"
7,Agen,Valid,30000.0,Fell,1814-01-01 12:00:00,44.2,0.6,"(44.21667, 0.61667)"
9,Aguila Blanca,Valid,1440.0,Fell,1920-01-01 12:00:00,-30.9,-64.5,"(-30.86667, -64.55)"


In [51]:
# Haciendo una copia full del df
copy_of_meteorites = df_meteorites.copy(deep=True)
copy_of_meteorites.sample()

Unnamed: 0,name,nametype,mass_update,fall,year,reclat,reclong,GeoLocation
32152,Queen Alexandra Range 02178,Valid,0.6,Found,2002-01-01 12:00:00,-84.0,168.0,"(-84.0, 168.0)"
