### Setup direnv & .env

In [None]:
# Install direnv: run only the command(s) matching your operating system

# macOS (Homebrew)
brew install direnv

# Ubuntu (Linux or Windows WSL2)
sudo apt update
sudo apt install -y direnv

In [None]:
# Create the .env file that holds the project configuration
touch .env

# Then add your environment variables to the .env file (template below)
'''
# GCP Project
GCP_PROJECT=<project_id_you_want>
GCP_REGION=europe-west1

# Cloud Storage
BUCKET_NAME=nlp-shades-<your_github_name>

# BigQuery
BQ_REGION=EU
BQ_DATASET=nlp_shades # must be letters (uppercase or lowercase), numbers, and underscores up to 1024 characters.
DATA_SIZE=10k # 300k, 450k, all
LOCAL_DATA_PATH="<path/to/the/package/model/dir>"

# Compute Engine
INSTANCE=instance-nlp-shades-<your_github_name>

# Docker
GCR_IMAGE=nlp-shades-api
'''

# Create .envrc to load the environment variables.
# "dotenv" is the direnv directive that reads the .env file; it must be
# written to .envrc itself, otherwise direnv has nothing to load.
touch .envrc
echo dotenv >> .envrc

# Allow direnv to load the .envrc
direnv allow

# ⚠️⚠️ If you update the .env file ⚠️⚠️
direnv reload .

### Create GCP Service account

1- Create a GCP account
[here](https://github.com/lewagon/data-setup/blob/master/VM.md#google-cloud-platform-setup)

2- Create a service account
[here](https://github.com/lewagon/data-setup/blob/master/VM.md#google-cloud-cli)

In [None]:
# Check your service account: list the service accounts of the active gcloud project
gcloud iam service-accounts list
'''
DISPLAY NAME                            EMAIL                                                              DISABLED
Compute Engine default service account  161307270588-compute@developer.gserviceaccount.com                 False
Compte de service (Wagon Bootcamp)      compte-de-service-wagon-bootca@PROJECT_ID.iam.gserviceaccount.com  False
'''
# ⚠️⚠️ If you are not on the right project, switch to it first ⚠️⚠️
gcloud config set project YOUR_PROJECT_ID

# Then list the service accounts again
gcloud iam service-accounts list

### Cloud Storage (create a bucket to save the model)

In [None]:
# Create a bucket (location, project and bucket name come from the variables in .env)
gsutil mb -l $GCP_REGION -p $GCP_PROJECT gs://$BUCKET_NAME

# Then set gcloud on the correct GCP project ID
gcloud config set project $GCP_PROJECT

# And check that the bucket is listed
gsutil ls
'''--> gs://YOUR BUCKET NAME/'''

# You can also check from the Google Cloud Console with the Cloud Storage module

### BigQuery (store the data)

In [None]:
# Create the dataset where we’ll store & query the preprocessed data
bq mk --project_id $GCP_PROJECT --data_location $BQ_REGION -d $BQ_DATASET

# Then check, just to be sure, that the dataset exists
bq show $BQ_DATASET

# Finally create the tables you want for training (one per sample size; run only the ones you need)
# NOTE(review): the tables are created with $GCP_REGION while the dataset uses $BQ_REGION — confirm this is intended
bq mk --location=$GCP_REGION $BQ_DATASET.raw_10k # small test
bq mk --location=$GCP_REGION $BQ_DATASET.raw_300k  # 50% of the dataset
bq mk --location=$GCP_REGION $BQ_DATASET.raw_450k  # 75% of the dataset
bq mk --location=$GCP_REGION $BQ_DATASET.raw_all # all

# Then check the tables you created :)
bq show $BQ_DATASET.raw_10k
bq show $BQ_DATASET.raw_300k
bq show $BQ_DATASET.raw_450k
bq show $BQ_DATASET.raw_all

# You can also check from the Google Cloud Console with the BigQuery module

# 1- First load the data into BigQuery (cf. data.py)
make load_10k_data_to_bq
make load_300k_data_to_bq
make load_450k_data_to_bq
make load_all_data_to_bq

# 1bis- Go to BigQuery to check the tables

# 2- Run the model locally on the sample you want (cf. main.py), either from the csv file or by querying BigQuery
make run_ner_on_10k  # or
make run_ner_on_300k # or
make run_ner_on_450k # or
make run_ner_on_all

### Create The Virtual Machine

1- GO 
[HERE](https://github.com/lewagon/data-setup/blob/master/VM.md#virtual-machine-vm)

2- If you know exactly what type of VM you want to create, run the following commands:

In [None]:
# Settings for the new VM: the Ubuntu image it boots from and the instance name
IMAGE_FAMILY=ubuntu-2204-lts
IMAGE_PROJECT=ubuntu-os-cloud
INSTANCE=project-instance

# Create the Compute Engine instance from the settings above
gcloud compute instances create "$INSTANCE" \
  --image-project="$IMAGE_PROJECT" \
  --image-family="$IMAGE_FAMILY"

### Setup The Virtual Machine

In [None]:
# Install zsh and oh-my-zsh (omz)
sudo apt update
sudo apt install -y zsh
sh -c "$(curl -fsSL https://raw.github.com/ohmyzsh/ohmyzsh/master/tools/install.sh)"

# Install pyenv and pyenv-virtualenv by cloning them under ~/.pyenv
git clone https://github.com/pyenv/pyenv.git ~/.pyenv
git clone https://github.com/pyenv/pyenv-virtualenv.git ~/.pyenv/plugins/pyenv-virtualenv

# Open ~/.zshrc in a terminal text editor
nano ~/.zshrc
'''
Add pyenv, ssh-agent and direnv to the list of zsh plugins on the line with plugins=(git) in ~/.zshrc: in the end, you should have plugins=(git pyenv ssh-agent direnv). Then, exit and save (Ctrl + X, Y, Enter).
'''

# Make sure that the modifications were indeed saved
cat ~/.zshrc | grep "plugins="

# Add the pyenv initialization script to your ~/.zprofile
# (the backslashes keep $HOME, $PYENV_ROOT, $PATH and $(...) unexpanded, so they are written literally to the file)
cat << EOF >> ~/.zprofile
export PYENV_ROOT="\$HOME/.pyenv"
export PATH="\$PYENV_ROOT/bin:\$PATH"
eval "\$(pyenv init --path)"
EOF

# Install the system packages (build tools and libraries) — presumably what pyenv needs to build Python
sudo apt-get update; sudo apt-get install make build-essential libssl-dev zlib1g-dev \
libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm \
libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev \
python3-dev

# Now we need to start a new user session so that the updates in ~/.zshrc and ~/.zprofile are taken into account
zsh --login

# Install the same Python version that you use for the project,
# then create a virtualenv on top of it and make that virtualenv the global default
pyenv install 3.10.6
pyenv global 3.10.6
pyenv virtualenv 3.10.6 nlp-shades # ⚠️ don't forget to update it
pyenv global nlp-shades # ⚠️ don't forget to update it

# GitHub auth: copy your SSH private key to the VM
# ⚠️ Run this single command on YOUR machine, NOT in the VM ⚠️
gcloud compute scp ~/.ssh/id_ed25519 $USER@$INSTANCE:~/.ssh/
'''
Check that $USER in your machine is the same that in your VM, it may be different ...
If not, replace it with the $USER displayed on your VM !!

gcloud compute scp ~/.ssh/id_ed25519 <user_from_VM>@$INSTANCE:~/.ssh/
'''

# ⚠️ Then, resume running commands in the VM ⚠️
# Start an ssh-agent and register the copied key with it
eval "$(ssh-agent -s)"
ssh-add ~/.ssh/id_ed25519
'''
It should display:

Agent pid 18827
Identity added: /home/<user_from_VM>/.ssh/id_ed25519 (<your_email>)
'''

# Python code authentication to GCP
'''
The code of your package needs to be able to access your Big Query data warehouse.
To do so, we will login to your account using the command below
'''
gcloud auth application-default login
'''
It should display a link to authenticate to GCP: copy/paste the link in your browser and authorize the access. Then copy/paste the authorization code in the VM console.
'''

# Let’s verify that your Python code can now access your GCP resources. First, install some packages
pip install -U pip
pip install google-cloud-storage

# Then, run Python code from the CLI. This should list your GCP buckets (one bucket name per line):
python -c "from google.cloud import storage; \
    buckets = storage.Client().list_buckets(); \
    [print(b.name) for b in buckets]"

# Let’s run a few checks inside your VM terminal; the expected result is given before each command
'''
Default shell is /usr/bin/zsh
'''
echo $SHELL
'''
Python version is [version_installed]
'''
python --version
'''
Active GCP project is the same as $GCP_PROJECT in your .env file
'''
gcloud config list project


# Your VM is now a data science beast 🔥 :) :)

### Train in the Cloud

In [None]:
'''
First, you have to clone your package, install its requirements
'''

# You can copy your code to the VM by cloning your GitHub project
git clone git@github.com:ekmillenium/nlp_shades_of_movie_reviews.git # ⚠️ don't forget to update it with the right project

# Enter the directory of the package (replace the placeholder with the real path)
cd <path/to/the/package/model/dir>

# Create the .env file with all the parameters required to use your package.
# The heredoc feeds the lines below to cat, so the command no longer waits on
# the terminal; the quoted 'EOF' delimiter writes them verbatim (no expansion).
# Replace the <placeholders> with your own values.
cat > .env << 'EOF'
# GCP Project
GCP_PROJECT=<project_id_you_want>
GCP_REGION=europe-west1

# Cloud Storage
BUCKET_NAME=nlp-shades-<your_github_name>

# BigQuery
BQ_REGION=EU
BQ_DATASET=nlp_shades
DATA_SIZE=10k
LOCAL_DATA_PATH="<path/to/the/package/model/dir>"

# Compute Engine
INSTANCE=instance-nlp-shades-<your_github_name>

# Docker
GCR_IMAGE=nlp-shades-api
EOF

# Install direnv to load your .env
sudo apt update
sudo apt install -y direnv

# Reconnect (simulate a user reboot) so that direnv works
zsh --login

# Allow your .envrc
direnv allow .

# Install the project package from the current directory
pip install .

# Finally have fun !! :) :)
make <the_command_you_want>

## ⚠️⚠️ Switch OFF your VM to finish ⚠️⚠️ ##
gcloud compute instances stop $INSTANCE
# List the instances to check the VM status
gcloud compute instances list
# NOTE(review): this turns the VM back on — presumably kept here for the next session, not to be run right after stopping; confirm
gcloud compute instances start $INSTANCE

### Create a service (alternative to Docker)

In [None]:
# Connect to the VM (from the Cloud Console, or start the session via gcloud as below).
# gcloud compute ssh needs the name of the instance to connect to.
gcloud compute ssh $INSTANCE

# Steps (create a service on a custom port, in the manner of a Dockerfile CMD)
# --> ON THE VM

# 1- Check the services
systemctl status system.slice
# 2- Install the server run by the service
sudo apt install uvicorn
# 3- Create the service unit file in /etc/systemd/system
#    (the full path is given so the file is created there whatever the current directory)
sudo touch /etc/systemd/system/nlp-shades-api.service
# 4- Update the service: paste the content below into the unit file
sudo nano /etc/systemd/system/nlp-shades-api.service
'''
[Unit]
Description=api for nlp-shades project
After=network.target

[Service]
Type=simple
User=a_dhuy
ExecStart=sudo uvicorn nlp_shades.interface.api:app --reload
Slice=nlp_shades.slice
'''

# 5- Reload all the configuration files
sudo systemctl daemon-reload