<a href="https://colab.research.google.com/github/salathegroup/vaccine-multi-lang/blob/master/Create_Vaccine_Datasets_from_Sheets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create Vaccine Datasets from Sheets
Creates the vaccine datasets for both the annotated and the unannotated tweets stored in Google Sheets.

Finally, it copies the generated dataset folders to a Google Cloud Storage bucket.

In [103]:
#@markdown ##Read Sheets

import pandas as pd
import numpy as np
import gspread
import sys, os
from oauth2client.client import GoogleCredentials
from google.colab import auth

# Authenticate the Colab user and build a gspread client from the
# application-default credentials obtained by that login.
auth.authenticate_user()
gc = gspread.authorize(GoogleCredentials.get_application_default())

# Upper bound on the number of unannotated rows kept after loading.
MAX_UNANNOTATED_ROWS = 10000


def sheet_to_frame(client, spreadsheet_title, worksheet_title):
    """Load one worksheet into a DataFrame, using its first row as the header.

    Args:
        client: Authorized gspread client.
        spreadsheet_title: Title of the spreadsheet to open.
        worksheet_title: Name of the worksheet (tab) inside that spreadsheet.

    Returns:
        A DataFrame whose column labels are the cells of the first sheet row
        and whose rows are the remaining sheet rows, plus a constant column
        "a" holding the string "a" in every row (the export cell writes this
        column out as a fixed filler field).

    Raises:
        IndexError: If the worksheet has no rows at all, because there is no
            header row to promote.
    """
    rows = client.open(spreadsheet_title).worksheet(worksheet_title).get_all_values()
    frame = pd.DataFrame.from_records(rows)
    # Promote the first row to column labels, then drop it from the data.
    frame.columns = frame.iloc[0]
    frame = frame.reindex(frame.index.drop(0))
    frame['a'] = "a"
    return frame


#Read the annotated data
annotated = sheet_to_frame(gc, 'EPFL_vaccine_sentiment_3fold_agreed', "unique")
print("Successfully read the annotated data")

#Read the unannotated data
unannotated = sheet_to_frame(gc, 'EPFL unannotated tweets', "raw")
# Keep only the first MAX_UNANNOTATED_ROWS rows (positional slice).
unannotated = unannotated.iloc[:MAX_UNANNOTATED_ROWS]

print("Successfully read the unannotated data")


Successfully read the annotated data
Successfully read the unannotated data


In [138]:
#@markdown ##Create and Copy Data Files
import warnings
# NOTE(review): this silences every warning for the rest of the kernel
# session, not only for this cell. Consider narrowing it to the specific
# warning category that motivated it.
warnings.filterwarnings('ignore')

#annotated data
# The train/dev/test assignment is NOT computed here: it is read from the
# sheet's "group" column. An earlier comment described the intended
# proportions as 60% train / 20% dev / 20% test; that ratio is a property of
# the sheet and is not checked by this code.
train   = annotated.loc[annotated["group"] == "train"]

# Reproducible subsample of the training rows (fixed seed). The sample size is
# capped at the number of available rows, because DataFrame.sample raises
# ValueError when asked for more rows than exist (without replacement).
train_small = train.sample(n=min(1000, len(train)), random_state=42)

dev   = annotated.loc[annotated["group"] == "dev"]
test   = annotated.loc[annotated["group"] == "test"]

languages = ["en","fr","de","es","pt"]


def write_annotated_tsv(frame, path, lang):
    """Write one split of the annotated data as a headerless TSV file.

    Args:
        frame: DataFrame holding the rows of the split.
        path: Destination file path.
        lang: Name of the language column to export as the text field.

    The file has four tab-separated columns in this order: "id", "label",
    the constant "a" column, and the `lang` column. No header row and no
    index are written. UTF-8 is requested explicitly because the text
    columns hold tweets in several languages.
    """
    frame.to_csv(path, columns=["id", "label", "a", lang], header=False,
                 index=False, sep='\t', encoding='utf-8')


# Output folders are created and written relative to the working directory,
# which is also where the later `gsutil` glob looks for them. In Colab the
# working directory is /content, so the resulting locations are the same as
# the former hard-coded '/content/...' paths.
for lang in languages:
    # "annotated" gets the full training set, "annotated_small" the subsample;
    # both share the same dev and test files.
    for folder, train_frame in (("annotated", train), ("annotated_small", train_small)):
        out_dir = os.path.join(folder, lang)
        os.makedirs(out_dir, exist_ok=True)

        for split_name, frame in (("train", train_frame), ("dev", dev), ("test", test)):
            write_annotated_tsv(frame, os.path.join(out_dir, split_name + ".tsv"), lang)

#unannotated data
# NOTE(review): "pt" is not exported here, unlike for the annotated data --
# presumably the unannotated sheet has no "pt" column; confirm.
languages = ["en","fr","de","es"]

# Created relative to the working directory, which is where the later
# `gsutil` glob looks for it (in Colab this is /content).
os.makedirs('unannotated', exist_ok=True)

for lang in languages:
    # Export the *unannotated* frame. This previously wrote `train`, so the
    # files contained annotated training rows instead of the unannotated tweets.
    # Each file is a headerless two-column TSV: "id" and the language column.
    # Assumes the unannotated sheet provides "id" and one column per language
    # code -- TODO confirm against the sheet.
    unannotated.to_csv(os.path.join('unannotated', lang + ".tsv"),
                       columns=["id", lang], header=False, index=False,
                       sep='\t', encoding='utf-8')

print("Created the data files locally")


# Copy everything to the bucket.
# The shell glob `*annotated*/` matches every directory in the current working
# directory whose name contains "annotated", i.e. the annotated,
# annotated_small and unannotated folders created earlier in this cell.
# gsutil flags: -m runs the copy in parallel, -r copies directories recursively.
!gsutil -m cp -r *annotated*/ gs://perepublic/EPFL_multilang/

# NOTE(review): this message is printed unconditionally; the exit status of
# the gsutil command above is not checked, so it also appears if the copy failed.
print("Copied the data to the bucket")


Created the data files locally
Copying file://annotated/pt/dev.tsv [Content-Type=text/tab-separated-values]...
Copying file://annotated/pt/test.tsv [Content-Type=text/tab-separated-values]...
Copying file://annotated/en/train.tsv [Content-Type=text/tab-separated-values]...
Copying file://annotated/de/test.tsv [Content-Type=text/tab-separated-values]...
Copying file://annotated/de/dev.tsv [Content-Type=text/tab-separated-values]...
Copying file://annotated/pt/train.tsv [Content-Type=text/tab-separated-values]...
Copying file://annotated/de/train.tsv [Content-Type=text/tab-separated-values]...
Copying file://annotated/en/test.tsv [Content-Type=text/tab-separated-values]...
Copying file://annotated/en/dev.tsv [Content-Type=text/tab-separated-values]...
Copying file://annotated/es/dev.tsv [Content-Type=text/tab-separated-values]...
Copying file://annotated/es/test.tsv [Content-Type=text/tab-separated-values]...
Copying file://annotated/es/train.tsv [Content-Type=text/tab-separated-values].