# Autenticação
Primeiro precisamos autenticar nossa sessão do Colab no Google e definir o ID do projeto.

In [0]:
# Authenticate this Colab session with Google before any cloud service is used.
from google.colab import auth
auth.authenticate_user()
# Printed only after authenticate_user() has returned.
print('Authenticated')

Authenticated


In [0]:
# Enter the Google Cloud project ID on this line (exposed as a Colab form field):

project_id = "workshop-5ia-tensorflow-253922" #@param {type:"string"}


# Cliente BigQuery

Como vamos buscar dados no BigQuery, é necessário criar um cliente BigQuery.

In [0]:
# Create the BigQuery client used by the later query cell, bound to project_id.
from google.cloud import bigquery

bgclient = bigquery.Client(project=project_id)

Demais imports:

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns, numpy as np


# Obtenção dos dados

Como vimos, temos 137.826.763 de observações na base.

Mesmo considerando apenas os nascimentos a partir de 2001, teremos 33.271.914 observações.

Faça um comando para obter apenas 0,1% dos registros (ou seja, **aproximadamente** 33,3 mil observações).

In [0]:
# SQL text sent to BigQuery. It reads publicdata.samples.natality and keeps
# rows with year > 2000.
#   - weight_kilos: weight_pounds converted with 1 lb = 0.45359237 kg.
#   - year_and_month: year and month cast to strings and concatenated.
#   - hashmonth: absolute FARM_FINGERPRINT of that same year+month string.
#   - MeuHash: FARM_FINGERPRINT of a fixed literal, so identical on every row.
#   - meurand: a RAND() draw per row, used only for the final ORDER BY.
# The WHERE clause's own RAND() <= 0.001 keeps roughly 0.1% of the rows.
# NOTE(review): RAND() is not seeded here, so each run should return a
# different sample and a slightly different row count.
query = """
SELECT
  weight_pounds * 0.45359237 AS weight_kilos,
  is_male,
  mother_age,
  mother_married,
  plurality,
  gestation_weeks,
  CONCAT(CAST(YEAR AS STRING), CAST(month AS STRING)) as year_and_month,
  FARM_FINGERPRINT('ELTHON') AS MeuHash,  
  ABS(FARM_FINGERPRINT(CONCAT(CAST(YEAR AS STRING), CAST(month AS STRING)))) AS hashmonth,
  RAND() AS meurand
FROM
  publicdata.samples.natality
WHERE year > 2000 AND RAND() <= 0.001
 ORDER BY meurand 
"""

In [0]:
# Submit the query to BigQuery, then download its result into a pandas DataFrame.
query_job = bgclient.query(query)
df = query_job.to_dataframe()
# Display (rows, columns) of the downloaded sample.
df.shape

(33218, 10)

In [0]:
# Preview the first rows of the downloaded sample.
df.head()

# Pré-Processamento

Aparentemente, há observações com NaN ou ainda com valores iguais a zero. Ajuste o dataset para que isso não "polua" nossa análise.

In [0]:
# Count missing values per column.
df.isnull().sum()

weight_kilos        36
is_male              0
mother_age           0
mother_married       0
plurality            0
gestation_weeks    223
year_and_month       0
MeuHash              0
hashmonth            0
meurand              0
dtype: int64

In [0]:
# Drop every row that has at least one missing value and rebind df to the result.
# NOTE(review): the section text also mentions zero-valued observations; this
# cell only removes NaN rows -- confirm whether zeros still need handling.
df = df.dropna(axis='rows', how='any')

In [0]:
# Count missing values per column again, after the dropna step.
df.isnull().sum()

weight_kilos       0
is_male            0
mother_age         0
mother_married     0
plurality          0
gestation_weeks    0
year_and_month     0
MeuHash            0
hashmonth          0
meurand            0
dtype: int64

In [0]:
# Preview the first rows of a subset of the columns.
df[['is_male', 'mother_age', 'mother_married', 'plurality', 'gestation_weeks']].head()

Unnamed: 0,is_male,mother_age,mother_married,plurality,gestation_weeks
0,True,36,False,1,29.0
1,False,16,False,1,39.0
2,True,23,False,1,39.0
3,True,25,True,1,38.0
4,True,33,True,1,40.0


In [0]:
# List the column labels of df.
df.columns

Index(['weight_kilos', 'is_male', 'mother_age', 'mother_married', 'plurality',
       'gestation_weeks', 'year_and_month', 'MeuHash', 'hashmonth', 'meurand'],
      dtype='object')

# Splitting em treino e teste

Divida o dataset em 75% treino e 25% teste.
Mantenha as colunas:

```
'weight_kilos', 'is_male', 'mother_age', 'mother_married', 'plurality', 'gestation_weeks', 'hashmonth'
```

In [0]:
# Show (rows, columns) of df at this point in the notebook.
df.shape

(32967, 10)

In [0]:
# Duan: positional split -- the first 75% of the rows go to training and the
# remaining rows go to test, keeping only the columns listed below.
colunas = ['weight_kilos', 'is_male', 'mother_age', 'mother_married', 'plurality', 'gestation_weeks', 'hashmonth']
corte = round(df.shape[0] * 0.75)
df_treino = df[colunas].iloc[:corte]
df_teste = df[colunas].iloc[corte:]

# Elthon: boolean mask with one uniform draw per row; True when the draw is <= 0.75.
# NOTE(review): train_row is not used by any other cell visible in this notebook.
train_row = np.random.rand(len(df)) <= 0.75

# Example: the same mask idea shown on 8 random draws.
teste = np.random.rand(8)
print(teste)
print(teste <= 0.75)

In [0]:
# Show the (rows, columns) shapes of the training and test frames.
print(df_treino.shape, df_teste.shape)

(24725, 7) (8242, 7)


In [0]:
# Check that the training and test row counts add up to the row count of df.
print(len(df_treino) + len(df_teste) == len(df))

True


# Export dos dados para serem processados pelo TensorFlow


Grave o conteúdo em dois datasets, treino.csv e teste.csv, sem índice de linhas ou header de colunas

In [0]:
# Duan / Elthon: both solutions were the same two calls, so each file is
# written once instead of twice.
# Write the training and test splits as CSV, without the row index and
# without a header row.
df_treino.to_csv('treino.csv', index=False, header=False)
df_teste.to_csv('teste.csv', index=False, header=False)

Verifique os arquivos salvos:

In [0]:
%%bash
# Line count of every CSV file in the working directory.
wc -l *.csv
# First lines of each CSV file.
head *.csv
# Last lines of each CSV file.
tail *.csv

   8242 teste.csv
  24725 treino.csv
  32967 total
==> teste.csv <==
3.9689999966716343,False,33,True,1,41.0,4979697502521811334
1.6399999986247114,False,21,False,1,35.0,3095933535584005890
3.4299999971236343,True,29,True,1,39.0,1305143018446161857
2.437999997955516,False,31,True,1,33.0,7872612453343038854
2.947999997527835,True,23,True,1,39.0,1622638268154624360
3.4589999970993155,False,27,True,1,40.0,4979697502521811334
3.28499999724523,False,28,True,1,36.0,2363238223526193234
3.3399999971991075,True,21,True,1,40.0,8391424625589759186
2.4099999979789968,False,25,True,1,38.0,7420272703711713305
3.4589999970993155,True,36,True,1,37.0,7445587375556638376

==> treino.csv <==
0.949999999203339,True,36,False,1,29.0,5934265245228309013
3.2099999973081244,False,16,False,1,39.0,5937540421097454372
3.741999996861994,True,23,False,1,39.0,1002950341933487066
2.063999998269149,True,25,True,1,38.0,5107972924983092617
3.7709999968376744,True,33,True,1,40.0,411066950820961322
3.8599999967630403,True

# Exporta para o Cloud Storage

In [0]:
# Copy every CSV file in the working directory to the gs://workshop-5ia-tensorflow bucket.
!gsutil cp *.csv gs://workshop-5ia-tensorflow

Copying file://teste.csv [Content-Type=text/csv]...
Copying file://treino.csv [Content-Type=text/csv]...
/ [2 files][  1.9 MiB/  1.9 MiB]                                                
Operation completed over 2 objects/1.9 MiB.                                      
