<a href="https://colab.research.google.com/github/salathegroup/multi-lang-vaccine-sentiment/blob/master/Create_Vaccine_Datasets_from_Sheets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create Vaccine Datasets from Sheets
Creates the vaccine datasets from both the annotated and the unannotated tweets stored in Google Sheets.

The final step, copying the training datasets to a Google Cloud Storage bucket, is currently commented out in the last cell.

In [2]:
#@markdown ##Read Sheets
!pip install pandas_ml

import pandas as pd
import pandas_ml as pdml
import numpy as np
import gspread
import sys, os
from oauth2client.client import GoogleCredentials
from google.colab import auth

# Upper bound on how many unannotated rows are kept after loading.
MAX_UNANNOTATED_ROWS = 10000


def _worksheet_to_frame(client, spreadsheet_title, worksheet_title):
    """Load one worksheet of a Google Sheets document into a DataFrame.

    The first row of the worksheet supplies the column names and is removed
    from the data. A constant column ``a`` holding the string "a" is appended.

    Args:
        client: authorised gspread client used to open the spreadsheet.
        spreadsheet_title: title of the spreadsheet to open.
        worksheet_title: name of the worksheet inside that spreadsheet.

    Returns:
        pandas.DataFrame with one row per worksheet row below the header.

    Raises:
        ValueError: if the worksheet returns no rows at all, so there is no
            header row to take column names from.
    """
    rows = client.open(spreadsheet_title).worksheet(worksheet_title).get_all_values()
    if not rows:
        raise ValueError(
            "Worksheet '{}' of '{}' is empty; expected a header row".format(
                worksheet_title, spreadsheet_title))

    frame = pd.DataFrame.from_records(rows)
    # Promote the first record to column names, then drop it from the data.
    frame.columns = frame.iloc[0]
    frame = frame.reindex(frame.index.drop(0))
    frame['a'] = "a"
    return frame


auth.authenticate_user()
gc = gspread.authorize(GoogleCredentials.get_application_default())

#Read the annotated data
annotated = _worksheet_to_frame(gc, 'EPFL_vaccine_sentiment_3fold_agreed', "unique")
print("Successfully read the annotated data")

#Read the unannotated data (only the first MAX_UNANNOTATED_ROWS rows are kept)
unannotated = _worksheet_to_frame(gc, 'EPFL unannotated tweets', "raw")
unannotated = unannotated[0:MAX_UNANNOTATED_ROWS]
print("Successfully read the unannotated data")


Collecting pandas_ml
[?25l  Downloading https://files.pythonhosted.org/packages/ac/69/f63b234546e39558e8121980daaf7389e52554a608da50005f52dc14f53f/pandas_ml-0.6.1.tar.gz (76kB)
[K     |████▎                           | 10kB 16.4MB/s eta 0:00:01[K     |████████▌                       | 20kB 1.8MB/s eta 0:00:01[K     |████████████▉                   | 30kB 2.6MB/s eta 0:00:01[K     |█████████████████               | 40kB 1.7MB/s eta 0:00:01[K     |█████████████████████▍          | 51kB 2.1MB/s eta 0:00:01[K     |█████████████████████████▋      | 61kB 2.6MB/s eta 0:00:01[K     |██████████████████████████████  | 71kB 3.0MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 2.6MB/s 
Building wheels for collected packages: pandas-ml
  Building wheel for pandas-ml (setup.py) ... [?25l[?25hdone
  Created wheel for pandas-ml: filename=pandas_ml-0.6.1-cp27-none-any.whl size=99434 sha256=e12adc6de78cbd92fd14a7aa1c527bdeb76a5157c97c35b00f9e96676c2c0068
  Stored in direc

In [3]:
# #@markdown ##Create and Copy Data Files
import warnings
# NOTE(review): this silences every warning, including pandas deprecation
# notices; kept so the cell's visible output stays the same.
warnings.filterwarnings('ignore')

# All data files are written below this directory.
OUTPUT_ROOT = '/content'
# Number of rows drawn for the "small" training set.
SMALL_TRAIN_SIZE = 1000


def _ensure_dir(name):
    """Create OUTPUT_ROOT/<name> if it does not exist yet and return its path.

    The directory is created under the same root the files are written to,
    so the cell does not depend on the current working directory.
    """
    path = os.path.join(OUTPUT_ROOT, name)
    if not os.path.exists(path):
        os.makedirs(path)
    return path


def _write_tsv(frame, directory, filename, columns):
    """Write the given columns of frame as a tab-separated UTF-8 file.

    No header row and no index column are written. Passing the path to
    ``to_csv`` lets pandas apply the UTF-8 encoding when writing the file.
    """
    frame.to_csv(os.path.join(directory, filename), columns=columns,
                 header=False, index=False, sep='\t', encoding='utf-8')


#annotated data
#The train/dev/test assignment is read from the "group" column of the sheet
train = annotated.loc[annotated["group"] == "train"]
# Cap the sample size at the number of available rows so a short sheet does
# not make sample() raise.
train_small = train.sample(n=min(SMALL_TRAIN_SIZE, len(train)), random_state=42)

dev = annotated.loc[annotated["group"] == "dev"]
test = annotated.loc[annotated["group"] == "test"]

languages = ["cb-annot-en","cb-annot-en-fr","cb-annot-en-de","cb-annot-en-es","cb-annot-en-pt"]

for dir_name in ('annotated', 'cb-annot-en-de-fr-es', 'cb-annot-en-de-fr-es-sm'):
    _ensure_dir(dir_name)

for lang in languages:
    labelled_columns = ["id", "label", "a", lang]
    unlabelled_columns = ["id", lang]

    # The large and the small variant differ only in their training frame;
    # dev and test files are identical in both directories.
    for dir_name, train_frame in ((lang, train), (lang + '-sm', train_small)):
        target_dir = _ensure_dir(dir_name)
        _write_tsv(train_frame, target_dir, "train.tsv", labelled_columns)
        _write_tsv(dev, target_dir, "dev.tsv", labelled_columns)
        # test.tsv carries no labels; annotated_test.tsv is the labelled copy.
        _write_tsv(test, target_dir, "test.tsv", unlabelled_columns)
        _write_tsv(test, target_dir, "annotated_test.tsv", labelled_columns)

#Save annotated all
def _train_with_text(language_column):
    """Return id/label/a plus one language column of train, relabelled "text"."""
    subset = train[["id", "label", "a", language_column]]
    subset.columns = ["id", "label", "a", "text"]
    return subset


en = _train_with_text("cb-annot-en")
pt = _train_with_text("cb-annot-en-pt")
fr = _train_with_text("cb-annot-en-fr")
de = _train_with_text("cb-annot-en-de")
es = _train_with_text("cb-annot-en-es")

# Stack the five language versions, shuffle with a fixed seed, and move the
# previous row labels into a regular column.
all_lang = pd.concat([en, pt, fr, de, es], join="inner")
all_lang = all_lang.sample(n=len(all_lang), random_state=42)
all_lang = all_lang.reset_index()

with open('/content/cb-annot-en-de-fr-es/train.tsv', 'w') as f:
    f.write(
        all_lang.to_csv(
            columns=["id", "label", "a", "text"],
            header=False,
            index=False,
            sep='\t',
            encoding='utf-8',
        )
    )


#Save annotated all small
def _small_train_with_text(language_column):
    """Return id/label/a plus one language column of train_small, relabelled "text"."""
    subset = train_small[["id", "label", "a", language_column]]
    subset.columns = ["id", "label", "a", "text"]
    return subset


en_small = _small_train_with_text("cb-annot-en")
pt_small = _small_train_with_text("cb-annot-en-pt")
fr_small = _small_train_with_text("cb-annot-en-fr")
de_small = _small_train_with_text("cb-annot-en-de")
es_small = _small_train_with_text("cb-annot-en-es")

all_lang_small = pd.concat([en_small, pt_small, fr_small, de_small, es_small], join="inner")
# Shuffle the small stack itself, so the "-sm" training file only contains
# rows that come from train_small (not rows drawn from the full all_lang set).
all_lang_small = all_lang_small.sample(n=len(all_lang_small), random_state=42)
all_lang_small = all_lang_small.reset_index()

# Passing the path lets pandas write the file with the requested UTF-8 encoding.
all_lang_small.to_csv('/content/cb-annot-en-de-fr-es-sm/train.tsv',
                      columns=['id', 'label', 'a', "text"],
                      header=False, index=False, sep='\t', encoding='utf-8')


#unannotated data
languages = ["cb-en","cb-en-fr","cb-en-de","cb-en-es","cb-en-pt"]

# Create the directory at the same absolute location the files are written
# to, so the cell does not depend on the current working directory.
unannotated_dir = os.path.join('/content', 'cb-unannotated')
if not os.path.exists(unannotated_dir):
    os.makedirs(unannotated_dir)

# One file per language column: tweet id plus the text in that language,
# tab-separated, without header or index.
for lang in languages:
    unannotated.to_csv(os.path.join(unannotated_dir, lang + ".tsv"),
                       columns=["id", lang],
                       header=False, index=False, sep='\t', encoding='utf-8')

print("Created the data files locally")
#Balancing the datasets
positive = all_lang[all_lang['label'] == 'positive']
negative = all_lang[all_lang['label'] == 'negative']
neutral = all_lang[all_lang['label'] == 'neutral']

#Oversampled data
# positive and negative are resampled with replacement to the size of neutral.
# NOTE(review): this presumes neutral is the largest class - TODO confirm.
positive_os = positive.sample(n=len(neutral), random_state=42, replace=True)
negative_os = negative.sample(n=len(neutral), random_state=42, replace=True)
neutral_os = neutral
all_os = pd.concat([positive_os, negative_os, neutral_os], join="inner")
all_os = all_os.sample(n=len(all_os), random_state=42)
all_os = all_os.reset_index()

#Undersampled data
# positive and neutral are sampled without replacement down to the size of
# negative; sample() raises if either has fewer rows than negative.
# NOTE(review): this presumes negative is the smallest class - TODO confirm.
positive_us = positive.sample(n=len(negative), random_state=42)
negative_us = negative
neutral_us = neutral.sample(n=len(negative), random_state=42)
all_us = pd.concat([positive_us, negative_us, neutral_us], join="inner")
all_us = all_us.sample(n=len(all_us), random_state=42)
all_us = all_us.reset_index()

#Writing balanced data
# Directories are created at the absolute location the files are written to.
for balanced_dir in ('/content/cb-annot-en-de-fr-es-os', '/content/cb-annot-en-de-fr-es-us'):
    if not os.path.exists(balanced_dir):
        os.makedirs(balanced_dir)

# The tweet text of the stacked frame lives in the "text" column; the
# per-language column names do not exist in all_os / all_us.
all_os.to_csv('/content/cb-annot-en-de-fr-es-os/train.tsv',
              columns=['id', 'label', 'a', 'text'],
              header=False, index=False, sep='\t', encoding='utf-8')

all_us.to_csv('/content/cb-annot-en-de-fr-es-us/train.tsv',
              columns=['id', 'label', 'a', 'text'],
              header=False, index=False, sep='\t', encoding='utf-8')


#Copy everything to the bucket
##!gsutil -m cp -r cb-*/ gs://perepublic/EPFL_multilang/data/

#print("Copied the data to the bucket")


Created the data files locally
