# Importación de un dataset en Google Colab desde diferentes orígenes de datos

In [1]:
# Imports go first because pandas is used in every section below
import pandas as pd

**Nota:** Voy a crear 3 dataframes con nombres diferentes, para poder comprobar que todos los tipos de importación funcionan.

## 1 - Importación del dataset desde un origen local

In [2]:
# Upload a file through google.colab's `files.upload()` and keep the returned value;
# in the recorded run the file was saved as insurance.csv
from google.colab import files
uploaded = files.upload()

Saving insurance.csv to insurance.csv


In [3]:
# Load the uploaded CSV into the first DataFrame
insurance_data_1 = pd.read_csv('insurance.csv')

In [4]:
# Preview the first five rows to confirm the local import worked
insurance_data_1.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## 2 - Importación del dataset desde Drive

In [5]:
# Dependencies for reading a file from Google Drive inside Colab
# NOTE(review): PyDrive and oauth2client are reportedly no longer maintained — confirm before reusing this recipe
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then hand the application-default credentials
# to PyDrive so that the `drive` client can be used by the cells below
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [6]:
# Drive ID of the file to fetch
file_id = '1-2qlh9BO01Sx0FtL8VUNPOLttrCFJCil'
# Build a PyDrive file handle for that ID and write its content to insurance2.csv
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('insurance2.csv')

In [7]:
# Load the file fetched from Drive into the second DataFrame
insurance_data_2 = pd.read_csv('insurance2.csv')

In [8]:
# Preview the first five rows to confirm the Drive import worked
insurance_data_2.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## 3 - Importación del dataset desde Kaggle

In [9]:
# `files` is used below to upload kaggle.json
from google.colab import files
# Install the kaggle package quietly (-q)
# NOTE(review): `%pip install` is the IPython magic that targets the kernel's environment — consider it over `!pip`
!pip install -q kaggle

In [10]:
# Upload kaggle.json (the recorded run saved it into the working directory)
kaggle_json = files.upload()

Saving kaggle.json to kaggle.json


In [14]:
# Movemos fichero a la ubicación correcta
!mv kaggle.json /root/.kaggle/kaggle.json

In [15]:
# Download the mirichoi0218/insurance dataset archive with the kaggle CLI
# (the recorded run saved insurance.zip to /content)
!kaggle datasets download -d mirichoi0218/insurance

Downloading insurance.zip to /content
  0% 0.00/16.0k [00:00<?, ?B/s]
100% 16.0k/16.0k [00:00<00:00, 12.0MB/s]


In [16]:
!unzip insurance.zip

Archive:  insurance.zip
replace insurance.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: insurance.csv           


In [17]:
# Load the CSV extracted from the Kaggle archive; this is the DataFrame explored in section 4
data = pd.read_csv('./insurance.csv')

## 4 - Análisis exploratorio del dataset

Para el análisis exploratorio utilizo el último dataframe creado (`data`).

In [18]:
# Look at the first five rows of the dataset
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [19]:
# Summary statistics of the numeric columns, one row per column
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
charges,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


In [20]:
# Dimensions of the DataFrame as (rows, columns)
data.shape

(1338, 7)

In [21]:
# Count of missing values in each column
data.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [22]:
# Column names
data.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [23]:
# Data type of each column
data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object