In [2]:
!ls ../input/automlwrapper

automlwrapper.py


In [3]:
import sys
# Put the attached automlwrapper dataset directory first on the module search
# path so that the later `from automlwrapper import AutoMLWrapper` resolves.
sys.path.insert(0, '../input/automlwrapper')

In [10]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
from datetime import datetime

from sklearn.model_selection import train_test_split

from google.cloud import storage
from google.cloud import automl_v1beta1 as automl

from automlwrapper import AutoMLWrapper

In [11]:
# Set your own values for these. bucket_name should be the project_id + '-lcm'.
PROJECT_ID = 'kaggle-disaster-tweets'
bucket_name = f'{PROJECT_ID}-lcm'

region = 'us-central1' # Region must be us-central1
# Display names used to look up (or create) the AutoML dataset and model.
dataset_display_name = 'kaggle_tweets'
model_display_name = 'kaggle_tweets_model1'

# GCS client bound to PROJECT_ID; the upload/download helpers and the bucket
# creation cell below read this module-level name.
storage_client = storage.Client(project=PROJECT_ID)
# AutoML client that is handed to AutoMLWrapper further down.
client = automl.AutoMlClient()

In [12]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk ../input recursively and print the path of every file found; the middle
# tuple element (sub-directory names) is not needed here, hence `_`.
for dirname, _, filenames in os.walk('../input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

../input/.DS_Store
../input/nlp-getting-started/test.csv
../input/nlp-getting-started/train.csv
../input/nlp-getting-started/sample_submission.csv
../input/automlwrapper/automlwrapper.py
../input/automlwrapper/__pycache__/automlwrapper.cpython-36.pyc


In [13]:
# Load the competition's training tweets (with a `target` column) and the
# test tweets that predictions are generated for later.
nlp_train_df = pd.read_csv('../input/nlp-getting-started/train.csv')
nlp_test_df = pd.read_csv('../input/nlp-getting-started/test.csv')
def callback(operation_future):
    """Wait for ``operation_future`` to finish by calling its ``result()``.

    The value returned by ``result()`` is discarded and this function returns
    ``None``; any exception raised by ``result()`` propagates to the caller.

    Args:
        operation_future: Object exposing a ``result()`` method -- presumably
            a long-running AutoML operation future (TODO confirm with caller).
    """
    # Only the blocking/raising side effect matters; the value is not used.
    operation_future.result()

In [14]:
# Peek at the last rows of the training frame.
nlp_train_df.tail()

Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1


### Data spelunking

#### How often does 'fire' come up in this dataset?

In [15]:
# Boolean mask: tweet text contains the substring "fire" in any letter case
# (so "wildfires" or "Firepower" match too); missing text counts as no match.
mentions_fire = nlp_train_df['text'].str.contains('fire', na=False, case=False)
nlp_train_df.loc[mentions_fire]

Unnamed: 0,id,keyword,location,text,target
1,4,,,Forest fire near La Ronge Sask. Canada,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
...,...,...,...,...,...
7427,10625,wounded,,Officer wounded suspect killed in exchange of ...,1
7433,10631,wounded,Yogya Berhati Nyaman,@wocowae Officer Wounded Suspect Killed in Exc...,1
7604,10863,,,#WorldNews Fallen powerlines on G:link tram: U...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1


#### Does the presence of the word 'fire' help determine whether a tweet is about a real disaster (target = 1) or not (target = 0)?

In [16]:
# Count the target labels among tweets whose text contains "fire"
# (case-insensitive substring match; missing text counts as no match).
mentions_fire = nlp_train_df['text'].str.contains('fire', na=False, case=False)
nlp_train_df.loc[mentions_fire, 'target'].value_counts()

1    344
0    129
Name: target, dtype: int64

In [17]:
# Tweets that contain "fire" (case-insensitive substring) yet are labelled
# target == 0 -- presumably the "not a real disaster" class (TODO confirm).
mentions_fire = nlp_train_df['text'].str.contains('fire', na=False, case=False)
target_is_zero = nlp_train_df['target'] == 0
nlp_train_df.loc[mentions_fire & target_is_zero]

Unnamed: 0,id,keyword,location,text,target
130,187,aftershock,,@OnFireAnders I love you bb,0
444,643,arsonist,,Trusting Iran to stop terrorism is like inviti...,0
613,884,bioterrorism,,Firepower in the lab [electronic resource] : a...,0
678,979,blazing,"Dallas, TX",Bright &amp; BLAZING Fireman Birthday Party ht...,0
758,1094,blew%20up,?205?478?,Max blew tf up ! ?????? shots fired ???? #Catf...,0
...,...,...,...,...,...
7319,10478,wild%20fires,Indiana,'Your love will surely come find us\nLike blaz...,0
7322,10482,wild%20fires,Canada,@WBCShirl2 Yes God doessnt change he says not...,0
7330,10490,wildfire,Vail Valley,We should all have a fire safety plan. RT @Mat...,0
7332,10496,wildfire,Australia,NowPlaying GT &amp; Wildfire feat. Freaks In L...,0


### GCS upload/download utilities

#### These functions make it easier to upload and download files between the kernel and Google Cloud Storage. This is needed for AutoML.

In [18]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Copy a local file into a Google Cloud Storage bucket.

    See https://cloud.google.com/storage/docs/ for the underlying API.

    Args:
        bucket_name: Name of the target bucket, fetched through the
            module-level ``storage_client``.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name to write inside the bucket.

    Prints a confirmation line containing the resulting ``gs://`` URI.
    """
    target_blob = storage_client.get_bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)
    gcs_uri = 'gs://' + bucket_name + '/' + destination_blob_name
    print('File {} uploaded to {}'.format(source_file_name, gcs_uri))
    
def download_to_kaggle(bucket_name, destination_directory, file_name, prefix=None):
    """Download objects from a GCS bucket into a local directory.

    ``destination_directory`` is created if it does not exist. Every blob
    listed for ``bucket_name`` (optionally restricted by ``prefix``) is
    written to the same local path, ``destination_directory/file_name``.

    NOTE(review): because all matches share one target path, when several
    blobs match ``prefix`` each download overwrites the previous one and only
    the last listed blob's content remains. Use a prefix that selects a
    single object if that is not what you want.

    Args:
        bucket_name: Bucket to list, via the module-level ``storage_client``.
        destination_directory: Local directory to write into.
        file_name: Local file name used for the downloaded content.
        prefix: Optional object-name prefix passed to ``list_blobs``.
    """
    os.makedirs(destination_directory, exist_ok=True)
    local_path = os.path.join(destination_directory, file_name)
    for remote_blob in storage_client.list_blobs(bucket_name, prefix=prefix):
        remote_blob.download_to_filename(local_path)

In [19]:
# Reference the bucket by name and create it in `region` only when it does
# not already exist, so re-running this cell does not try to create it twice.
# NOTE(review): exists() talks to Google's services, so it needs working
# credentials and network access.
bucket = storage.Bucket(storage_client, name=bucket_name)
if not bucket.exists():
    bucket.create(location=region)

TransportError: HTTPSConnectionPool(host='oauth2.googleapis.com', port=443): Max retries exceeded with url: /token (Caused by SSLError(SSLError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:852)'),))

### Export to CSV and upload to GCS

In [None]:
# Select the text body and the target value, for sending to AutoML NL
# index=False / header=False: the file holds only the two columns' values,
# with no index column and no header row.
nlp_train_df[['text','target']].to_csv('train.csv', index=False, header=False) 

In [None]:
# Preview id, text and target; note that the exported train.csv contains
# only text and target (id is not written).
nlp_train_df[['id','text','target']].head()

In [None]:
# Object name inside the bucket (no gs:// prefix); the same path is later
# passed to amw.import_gcs_data().
training_gcs_path = 'uploads/kaggle_getstarted/full_train.csv'
upload_blob(bucket_name, 'train.csv', training_gcs_path)

## Create our class instance

In [None]:
# Bundle the AutoML client and the project/bucket/region/display-name
# settings from the config cell into one wrapper object used by all
# following cells.
amw = AutoMLWrapper(client=client, 
                    project_id=PROJECT_ID, 
                    bucket_name=bucket_name, 
                    region=region, 
                    dataset_display_name=dataset_display_name, 
                    model_display_name=model_display_name)

## Create (or retrieve) dataset

### Check to see if this dataset already exists. If not, create it

In [None]:
# Look the dataset up by display name; only when the lookup returns something
# falsy is a new dataset created and the uploaded training CSV imported.
if not amw.get_dataset_by_display_name(dataset_display_name):
    print('dataset not found')
    amw.create_dataset()
    amw.import_gcs_data(training_gcs_path)

# Display the wrapper's dataset attribute -- presumably populated by the
# lookup or by create_dataset(); confirm in automlwrapper.py.
amw.dataset

## Kick off the training for the model

### And retrieve the training info after completion. Start model deployment.

In [None]:
# Train only when no existing model is found. get_model_by_display_name() is
# called without arguments, so it presumably falls back to the
# model_display_name given to the wrapper (TODO confirm in automlwrapper.py).
if not amw.get_model_by_display_name():
    amw.train_model()
# Deployment is requested on every run, whether or not training just happened.
amw.deploy_model()
amw.model

In [None]:
# Display the wrapper's model_full_path attribute.
amw.model_full_path

## Prediction

#### Note that prediction will not run until deployment finishes, which takes a bit of time. However, once you have your model deployed, this notebook won't re-train the model, thanks to the various safeguards put in place. Instead, it will take the existing (trained) model and make predictions and generate the submission file.

In [None]:
# Peek at the first rows of the test frame before predicting on it.
nlp_test_df.head()

In [None]:
# Create client for prediction service.
prediction_client = automl.PredictionServiceClient()
amw.set_prediction_client(prediction_client)

# Predict from the 'text' column of the test frame.
# NOTE(review): limit=None presumably means "all rows" and threshold=0.5 the
# score cut-off for the predicted class -- confirm in automlwrapper.py.
predictions_df = amw.get_predictions(nlp_test_df, 
                                     input_col_name='text', 
#                                      ground_truth_col_name='target', # we don't have ground truth in our test set
                                     limit=None, 
                                     threshold=0.5,
                                     verbose=False)

## (optional) Undeploy model

### Undeploy the model to stop charges

In [None]:
# Predictions are already held in predictions_df, so the model can be
# undeployed here to stop charges.
amw.undeploy_model()

In [None]:
# Inspect the first predictions returned by the wrapper.
predictions_df.head()

In [None]:
# Put each test id next to its predicted class.
# NOTE(review): concat(axis=1) aligns rows by index label, so this assumes
# predictions_df carries the same index as nlp_test_df -- TODO confirm;
# otherwise rows would be mismatched or padded with NaN.
submission_df = pd.concat([nlp_test_df['id'], predictions_df['class']], axis=1)
submission_df.head()

In [None]:
# Rename the prediction column to 'target' -- presumably the column name the
# competition's sample_submission.csv expects (TODO confirm).
submission_df = submission_df.rename(columns={'class':'target'})
submission_df.head()

## Submit predictions to the competition!

In [None]:
# Write the submission with a header row and without the index column.
submission_df.to_csv("submission.csv", index=False, header=True)

In [None]:
! ls -l submission.csv