<a href="https://colab.research.google.com/github/choppa98/Supervised-semantic-proximity-detection/blob/main/one_for_all_thesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook loads datasets of semantic proximity (Word-in-Context) for various languages in the [WUG format](https://www.ims.uni-stuttgart.de/en/research/resources/experiment-data/wugs/). We provide the data in a minimal format (the `*_wug` dataframes) and an extended format (the `*_full` dataframes). There are four dataframes in total: judgments_full, judgments_wug, uses_full and uses_wug. There are 20 transformed datasets. The notebook should run off-the-shelf in a Colab environment with Python 3.8.

Many of the data sets are transformed when running the notebook. We cannot guarantee that there are no errors. Hence, please make sure that you compare the created data frames to the original data sets before doing serious research with them.

Note: Please run this script without gpu on colab.

The datasets and their versions are as follows:

#RuDSI - Russian
rudsi = 'https://github.com/kategavrishina/RuDSI/tree/main/data'

#NorDiaChange - Norwegian
nordia1 = 'https://github.com/ltgoslo/nor_dia_change/tree/main/subset1/data'
nordia2 = 'https://github.com/ltgoslo/nor_dia_change/tree/main/subset2/data'

#RuShiftEval - Russian

https://github.com/akutuzov/rushifteval_public.git
rushifteval1 = 'https://github.com/akutuzov/rushifteval_public/tree/main/durel/rushifteval1/data'
rushifteval2 = 'https://github.com/akutuzov/rushifteval_public/tree/main/durel/rushifteval2/data'
rushifteval3 = 'https://github.com/akutuzov/rushifteval_public/tree/main/durel/rushifteval3/data'

#RuSemShift - Russian
rusemshift1 = 'https://github.com/juliarodina/RuSemShift/tree/master/rusemshift_1/DWUG/data'
rusemshift2 = 'https://github.com/juliarodina/RuSemShift/tree/master/rusemshift_2/DWUG/data'

#DiscoWUG - German (Version: 1.1.1)
https://zenodo.org/record/7396225/files/discowug.zip



#SURel - German (Version: 3.0.0)
https://zenodo.org/record/5784569/files/surel.zip


#DURel - German (Version: 3.0.0)
https://zenodo.org/record/5784453/files/durel.zip


#DWUG DE - German (Version: 2.3.0)
https://zenodo.org/record/7441645/files/dwug_de.zip


#RefWUG - German (Version: 1.1.0)
https://zenodo.org/record/5791269/files/refwug.zip


#DWUG EN - English (Version: 2.0.1)
https://zenodo.org/record/7387261/files/dwug_en.zip


#DWUG SV - Swedish (Version: 2.0.1)
https://zenodo.org/record/7389506/files/dwug_sv.zip


#DWUG ES - Spanish (Version: 4.0.0)
https://zenodo.org/record/6433667/files/dwug_es.zip


#DiaWUG - Spanish (Version: 1.1.0)
https://zenodo.org/record/5791193/files/diawug.zip


#DUPS_WUG - English (version 2.0.0)
https://zenodo.org/record/5500223/files/DUPS-WUG.zip

#WIC - English (version v1.0)
https://pilehvar.github.io/wic/package/WiC_dataset.zip

#TempoWIC - English
https://codalab.lisn.upsaclay.fr/my/datasets/download/3e22f138-ca00-4b10-a0fd-2e914892200d

#Raw-C - English
https://raw.githubusercontent.com/seantrott/raw-c/main/data/processed/raw-c.csv

#Usim - English
http://www.dianamccarthy.co.uk/downloads/WordMeaningAnno2012/

#CosimLex - English, Croatian, Finnish
https://www.clarin.si/repository/xmlui/handle/11356/1308/allzip


In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import io
import numpy as np
import os
from zipfile import ZipFile
import csv

In [2]:
# Shell command (IPython `!`): clone the WUGs repository into the working directory.
!git clone https://github.com/Garrafao/WUGs.git #contains transformation scripts

Cloning into 'WUGs'...
remote: Enumerating objects: 1761, done.[K
remote: Counting objects: 100% (687/687), done.[K
remote: Compressing objects: 100% (320/320), done.[K
remote: Total 1761 (delta 403), reused 604 (delta 349), pack-reused 1074[K
Receiving objects: 100% (1761/1761), 5.08 MiB | 6.02 MiB/s, done.
Resolving deltas: 100% (1081/1081), done.


In [3]:
# Shell commands (IPython `!`): clone the GitHub-hosted datasets into the working directory.
# The data directories inside these clones are referenced by relative path in later cells.
!git clone "https://github.com/akutuzov/rushifteval_public.git" #rushifteval
!git clone "https://github.com/juliarodina/RuSemShift.git" #rusemshift
!git clone "https://github.com/kategavrishina/RuDSI.git" #rudsi
!git clone "https://github.com/ltgoslo/nor_dia_change.git" #nordiachange

Cloning into 'rushifteval_public'...
remote: Enumerating objects: 3238, done.[K
remote: Counting objects: 100% (3238/3238), done.[K
remote: Compressing objects: 100% (1376/1376), done.[K
remote: Total 3238 (delta 1918), reused 3167 (delta 1859), pack-reused 0[K
Receiving objects: 100% (3238/3238), 16.40 MiB | 12.12 MiB/s, done.
Resolving deltas: 100% (1918/1918), done.
Updating files: 100% (3704/3704), done.
Cloning into 'RuSemShift'...
remote: Enumerating objects: 2100, done.[K
remote: Counting objects: 100% (2100/2100), done.[K
remote: Compressing objects: 100% (991/991), done.[K
remote: Total 2100 (delta 1182), reused 2013 (delta 1108), pack-reused 0[K
Receiving objects: 100% (2100/2100), 9.80 MiB | 6.49 MiB/s, done.
Resolving deltas: 100% (1182/1182), done.
Cloning into 'RuDSI'...
remote: Enumerating objects: 311, done.[K
remote: Counting objects: 100% (226/226), done.[K
remote: Compressing objects: 100% (167/167), done.[K
remote: Total 311 (delta 73), reused 193 (delta 

In [11]:
#RuDSI
rudsi = 'RuDSI/data/'

#NorDiaChange
nordia1 = 'nor_dia_change/subset1/data/'
nordia2 = 'nor_dia_change/subset2/data/'

#RuShiftEval
rushifteval1 = 'rushifteval_public/durel/rushifteval1/data/'
rushifteval2 = 'rushifteval_public/durel/rushifteval2/data/'
rushifteval3 = 'rushifteval_public/durel/rushifteval3/data/'

#RuSemShift
rusemshift1 = 'RuSemShift/rusemshift_1/DWUG/data/'
rusemshift2 = 'RuSemShift/rusemshift_2/DWUG/data/'

#Discowug
!wget https://zenodo.org/record/7396225/files/discowug.zip
with ZipFile('discowug.zip', 'r') as discowug:
    discowug.extractall()


#surel
!wget https://zenodo.org/record/5784569/files/surel.zip
with ZipFile('surel.zip', 'r') as surel:
    surel.extractall()

#durel
!wget https://zenodo.org/record/5784453/files/durel.zip
with ZipFile('durel.zip', 'r') as durel:
    durel.extractall()

#DWUG DE
!wget https://zenodo.org/record/7441645/files/dwug_de.zip
with ZipFile('dwug_de.zip', 'r') as dwug_de:
    dwug_de.extractall()

#RefWUG
!wget https://zenodo.org/record/5791269/files/refwug.zip
with ZipFile('refwug.zip', 'r') as refwug:
    refwug.extractall()

#DWUG EN
!wget https://zenodo.org/record/7387261/files/dwug_en.zip
with ZipFile('dwug_en.zip', 'r') as dwug_en:
    dwug_en.extractall()


#DWUG SV
!wget https://zenodo.org/record/7389506/files/dwug_sv.zip
with ZipFile('dwug_sv.zip', 'r') as dwug_sv:
    dwug_sv.extractall()


#DWUG ES
!wget https://zenodo.org/record/6433667/files/dwug_es.zip
with ZipFile('dwug_es.zip', 'r') as dwug_es:
    dwug_es.extractall()

#DiaWUG
!wget https://zenodo.org/record/5791193/files/diawug.zip
with ZipFile('diawug.zip', 'r') as diawug:
    diawug.extractall()


#chiWUG
!wget https://zenodo.org/records/10023263/files/chiwug.zip
with ZipFile('chiwug.zip', 'r') as chiwug:
    chiwug.extractall()


--2024-07-18 08:42:45--  https://zenodo.org/record/7396225/files/discowug.zip
Resolving zenodo.org (zenodo.org)... 188.185.79.172, 188.184.103.159, 188.184.98.238, ...
Connecting to zenodo.org (zenodo.org)|188.185.79.172|:443... connected.
HTTP request sent, awaiting response... 301 MOVED PERMANENTLY
Location: /records/7396225/files/discowug.zip [following]
--2024-07-18 08:42:46--  https://zenodo.org/records/7396225/files/discowug.zip
Reusing existing connection to zenodo.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 5486849 (5.2M) [application/octet-stream]
Saving to: ‘discowug.zip’


2024-07-18 08:42:55 (562 KB/s) - ‘discowug.zip’ saved [5486849/5486849]

--2024-07-18 08:42:56--  https://zenodo.org/record/5784569/files/surel.zip
Resolving zenodo.org (zenodo.org)... 188.185.79.172, 188.184.103.159, 188.184.98.238, ...
Connecting to zenodo.org (zenodo.org)|188.185.79.172|:443... connected.
HTTP request sent, awaiting response... 301 MOVED PERMANENTLY
Location: /record

In [17]:
!wget https://zenodo.org/records/10023263/files/chiwug.zip
with ZipFile('chiwug.zip', 'r') as chiwug:
    chiwug.extractall()


--2024-07-18 08:52:43--  https://zenodo.org/records/10023263/files/chiwug.zip
Resolving zenodo.org (zenodo.org)... 188.185.79.172, 188.184.98.238, 188.184.103.159, ...
Connecting to zenodo.org (zenodo.org)|188.185.79.172|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5891045 (5.6M) [application/octet-stream]
Saving to: ‘chiwug.zip’


2024-07-18 08:52:53 (602 KB/s) - ‘chiwug.zip’ saved [5891045/5891045]



In [19]:
# Data directories of the unpacked WUG archives. Each entry below them is expected to
# be a sub-directory holding `judgments.csv` and `uses.csv` (see the path-building cells).
dwugde = "dwug_de/data"
dwugen = "dwug_en/data"
dwugsv = "dwug_sv/data"
discowugg = "discowug/data"
durel = "durel/data"
surel = "surel/data"
refwug = "refwug/data"
dwuges = 'dwug_es/data'
diawug = 'diawug/data'
chiwug = 'chiwug/data'
dwug = [dwugde, dwugen, dwugsv, discowugg, durel, surel, refwug, dwuges, diawug, chiwug]

# dirlist[k] lists the sub-directories of dwug[k].
dirlist = []
for dataset in dwug:
    # os.listdir returns entries in arbitrary order and includes plain files:
    # sort for a reproducible row order and keep directories only. A local name is
    # used instead of `dir` so the builtin is not shadowed for the rest of the notebook.
    entry_names = sorted(
        entry for entry in os.listdir(dataset)
        if os.path.isdir(os.path.join(dataset, entry))
    )
    dirlist.append(entry_names)

In [20]:
# Paths of every judgments.csv of the WUG datasets, in `dwug` order.
# Derived from `dwug` and `dirlist` together, so a dataset added to `dwug` is picked up
# here without another hand-written loop and a hard-coded list position.
dwug_j = [
    data_dir + "/" + entry + "/judgments.csv"
    for data_dir, entries in zip(dwug, dirlist)
    for entry in entries
]

In [21]:
# Judgments of all WUG datasets. Files are tab-separated and read without quote handling;
# each row is tagged with its dataset name, i.e. the first path component.
wug_judgment_frames = [
    pd.read_csv(path, delimiter='\t', quoting=csv.QUOTE_NONE).assign(dataset=path.split('/')[0])
    for path in dwug_j
]
# Concatenate once: growing the frame inside the loop re-copies all earlier rows on
# every iteration and makes dtype coercion depend on the order of the files.
# Per-file row indexes are kept (they repeat); a later cell resets them.
judgemnt_df = pd.concat(wug_judgment_frames) if wug_judgment_frames else pd.DataFrame()


In [23]:
# Show the rows that repeat an earlier row in every column.
judgemnt_df.loc[judgemnt_df.duplicated()]

Unnamed: 0,identifier1,identifier2,annotator,judgment,comment,lemma,round,dataset,group
0,2532889X_1982-10-23_01_216.tcf.xml-7-17,26120215_1986_09_20_01_314.tcf.xml-6-10,annotator1,3.0,,Eintagsfliege,3.0,dwug_de,
1,brehm_thierleben06_1869-8719-2,boelsche_liebesleben01_1898-301-6,annotator1,2.0,,Eintagsfliege,3.0,dwug_de,
2,2532889X_1958-11-12_01_012.tcf.xml-74-12,vogt_briefe02_1851-6076-26,annotator1,2.0,,Eintagsfliege,3.0,dwug_de,
3,frapan_fluegel_1895-2258-4,26120215_1984_01_14_01_119.tcf.xml-67-8,annotator1,2.0,,Eintagsfliege,3.0,dwug_de,
4,boelsche_liebesleben01_1898-681-69,26120215_1975_06_23_01_053.tcf.xml-3-39,annotator1,3.0,,Eintagsfliege,3.0,dwug_de,
...,...,...,...,...,...,...,...,...,...
1555,1985-60,1987-79,annotator1,4.0,,炒,,chiwug,
1556,1972-42,1958-9,annotator1,4.0,,炒,,chiwug,
1557,1958-30,1982-72,annotator1,4.0,,炒,,chiwug,
1558,1958-30,1961-45,annotator1,0.0,,炒,,chiwug,


In [24]:
# Paths of every uses.csv of the WUG datasets, in `dwug` order.
# Derived from `dwug` and `dirlist` together, so a dataset added to `dwug` is picked up
# here without another hand-written loop and a hard-coded list position.
dwug_u = [
    data_dir + "/" + entry + "/uses.csv"
    for data_dir, entries in zip(dwug, dirlist)
    for entry in entries
]

In [25]:
# Language of each WUG dataset; rows of a dataset not listed here get no label.
judgment_language_by_dataset = {
    "dwug_de": 'German',
    "dwug_en": 'English',
    "chiwug": 'Chinese',
    "dwug_es": 'Spanish',
    "dwug_sv": 'Swedish',
    "durel": 'German',
    "surel": 'German',
    "discowug": 'German',
    "refwug": 'German',
    "diawug": 'Spanish',
}
for dataset_name, language_name in judgment_language_by_dataset.items():
    in_dataset = judgemnt_df["dataset"] == dataset_name
    judgemnt_df.loc[in_dataset, "language"] = language_name


In [26]:
# Renumber the rows 0..n-1; the per-file indexes kept by the concatenation repeat.
# NOTE: the result is bound to `judgment_df`, which differs from the input `judgemnt_df`
# by one letter; a later cell combines `judgment_df` with `judgements_df`.
judgment_df = judgemnt_df.reset_index(drop=True)

In [27]:
# Uses of all WUG datasets. Files are tab-separated and read without quote handling;
# each row is tagged with its dataset name, i.e. the first path component.
wug_use_frames = [
    pd.read_csv(path, delimiter='\t', quoting=csv.QUOTE_NONE).assign(dataset=path.split('/')[0])
    for path in dwug_u
]
# Concatenate once: growing the frame inside the loop re-copies all earlier rows on
# every iteration and makes dtype coercion depend on the order of the files.
usee_df = pd.concat(wug_use_frames) if wug_use_frames else pd.DataFrame()

In [28]:
# Language of each WUG dataset; rows of a dataset not listed here get no label.
use_language_by_dataset = {
    "dwug_de": 'German',
    "dwug_en": 'English',
    "chiwug": 'Chinese',
    "dwug_es": 'Spanish',
    # "dwug_la": 'latin',
    "dwug_sv": 'Swedish',
    "durel": 'German',
    "surel": 'German',
    "discowug": 'German',
    "refwug": 'German',
    "diawug": 'Spanish',
}
for dataset_name, language_name in use_language_by_dataset.items():
    in_dataset = usee_df["dataset"] == dataset_name
    usee_df.loc[in_dataset, "language"] = language_name

In [29]:
#getting the data
rudsi_f = os.listdir(rudsi)
nordia_f1= os.listdir(nordia1)
nordia_f2 =os.listdir(nordia2)
rushift_f1 = os.listdir(rushifteval1)
rushift_f2 = os.listdir(rushifteval2)
rushift_f3 = os.listdir(rushifteval3)
rusem_f1 = os.listdir(rusemshift1)
rusem_f2 = os.listdir(rusemshift2)

In [30]:
# Empty path collectors: one judgments list and one uses list per dataset family.
judgements_rusem, judgements_nordia, judgements_rudsi, judgements_rushift = [], [], [], []
uses_rusem, uses_nordia, uses_rudsi, uses_rushift = [], [], [], []


In [31]:
# (data directory, entry names, judgments collector, uses collector), processed in this
# order so that every collector receives its paths in the same sequence as before.
path_plan = [
    (rudsi, rudsi_f, judgements_rudsi, uses_rudsi),
    (rusemshift1, rusem_f1, judgements_rusem, uses_rusem),
    (rusemshift2, rusem_f2, judgements_rusem, uses_rusem),
    (rushifteval1, rushift_f1, judgements_rushift, uses_rushift),
    (rushifteval2, rushift_f2, judgements_rushift, uses_rushift),
    (rushifteval3, rushift_f3, judgements_rushift, uses_rushift),
    (nordia1, nordia_f1, judgements_nordia, uses_nordia),
    (nordia2, nordia_f2, judgements_nordia, uses_nordia),
]
for data_dir, entry_names, judgment_paths, use_paths in path_plan:
    for entry_name in entry_names:
        # The data directories end in "/", so plain concatenation yields a valid path.
        judgment_paths.append(data_dir + entry_name + "/judgments.csv")
        use_paths.append(data_dir + entry_name + "/uses.csv")

In [32]:
#judgments dataframe for rudsi, rusemshift, rushifteval, nordiachange
jud_rudsi = pd.DataFrame()
for i in judgements_rudsi:
    Tmp = pd.read_csv(i, delimiter='\t', quoting = 3)
    Tmp['dataset'] = i.split('/')[0]
    jud_rudsi = pd.concat([jud_rudsi, Tmp])


In [33]:
# RuSemShift judgments; the dataset label is the second path component
# (the `rusemshift_1` / `rusemshift_2` directory).
rusem_judgment_frames = [
    pd.read_csv(path, delimiter='\t', quoting=csv.QUOTE_NONE).assign(dataset=path.split('/')[1])
    for path in judgements_rusem
]
# One concat over the list instead of re-copying the growing frame on every iteration.
jud_rusems = pd.concat(rusem_judgment_frames) if rusem_judgment_frames else pd.DataFrame()


In [34]:
# RuShiftEval judgments; the dataset label is the third path component
# (the `rushifteval1` / `rushifteval2` / `rushifteval3` directory).
rushift_judgment_frames = [
    pd.read_csv(path, delimiter='\t', quoting=csv.QUOTE_NONE).assign(dataset=path.split('/')[2])
    for path in judgements_rushift
]
# One concat over the list instead of re-copying the growing frame on every iteration.
jud_rushift = pd.concat(rushift_judgment_frames) if rushift_judgment_frames else pd.DataFrame()

In [35]:
# NorDiaChange judgments; the dataset label is the second path component
# (`subset1` / `subset2`), which the next cell renames.
nordia_judgment_frames = [
    pd.read_csv(path, delimiter='\t', quoting=csv.QUOTE_NONE).assign(dataset=path.split('/')[1])
    for path in judgements_nordia
]
# One concat over the list instead of re-copying the growing frame on every iteration.
jud_nordia = pd.concat(nordia_judgment_frames) if nordia_judgment_frames else pd.DataFrame()

In [37]:
# Replace the directory-derived labels with the published dataset names.
nordia_judgment_labels = {'subset1': 'NorDiaChange1', 'subset2': 'NorDiaChange2'}
for directory_label, dataset_label in nordia_judgment_labels.items():
    has_label = jud_nordia['dataset'] == directory_label
    jud_nordia.loc[has_label, 'dataset'] = dataset_label

In [38]:
# Stack the four GitHub-hosted judgment frames, one after the other, in a fixed order.
judgements_df = pd.DataFrame()
for judgment_part in (jud_rudsi, jud_rusems, jud_rushift, jud_nordia):
    judgements_df = pd.concat([judgements_df, judgment_part])

In [39]:
# Default every row to Russian; the NorDiaChange rows are relabelled in the next cell.
judgements_df["language"] = "Russian"

In [40]:
# Both NorDiaChange subsets are Norwegian.
norwegian_judgment_rows = judgements_df["dataset"].isin(["NorDiaChange1", "NorDiaChange2"])
judgements_df.loc[norwegian_judgment_rows, "language"] = 'Norwegian'

In [41]:
# RuDSI uses; the dataset label is the first path component.
rudsi_use_frames = [
    pd.read_csv(path, delimiter='\t', quoting=csv.QUOTE_NONE).assign(dataset=path.split('/')[0])
    for path in uses_rudsi
]
# One concat over the list instead of re-copying the growing frame on every iteration.
use_rudsi = pd.concat(rudsi_use_frames) if rudsi_use_frames else pd.DataFrame()

In [42]:
# RuSemShift uses; the dataset label is the second path component
# (the `rusemshift_1` / `rusemshift_2` directory).
rusem_use_frames = [
    pd.read_csv(path, delimiter='\t', quoting=csv.QUOTE_NONE).assign(dataset=path.split('/')[1])
    for path in uses_rusem
]
# One concat over the list instead of re-copying the growing frame on every iteration.
use_rusems = pd.concat(rusem_use_frames) if rusem_use_frames else pd.DataFrame()

In [43]:
# RuShiftEval uses; the dataset label is the third path component
# (the `rushifteval1` / `rushifteval2` / `rushifteval3` directory).
rushift_use_frames = [
    pd.read_csv(path, delimiter='\t', quoting=csv.QUOTE_NONE).assign(dataset=path.split('/')[2])
    for path in uses_rushift
]
# One concat over the list instead of re-copying the growing frame on every iteration.
use_rushift = pd.concat(rushift_use_frames) if rushift_use_frames else pd.DataFrame()

In [44]:
# NorDiaChange uses; the dataset label is the second path component
# (`subset1` / `subset2`), which the next cell renames.
nordia_use_frames = [
    pd.read_csv(path, delimiter='\t', quoting=csv.QUOTE_NONE).assign(dataset=path.split('/')[1])
    for path in uses_nordia
]
# One concat over the list instead of re-copying the growing frame on every iteration.
use_nordia = pd.concat(nordia_use_frames) if nordia_use_frames else pd.DataFrame()

In [45]:
# Replace the directory-derived labels with the published dataset names.
nordia_use_labels = {'subset1': 'NorDiaChange1', 'subset2': 'NorDiaChange2'}
for directory_label, dataset_label in nordia_use_labels.items():
    has_label = use_nordia['dataset'] == directory_label
    use_nordia.loc[has_label, 'dataset'] = dataset_label

In [46]:
# Stack the four GitHub-hosted uses frames, one after the other, in a fixed order.
usees_df = pd.DataFrame()
for use_part in (use_rudsi, use_rusems, use_rushift, use_nordia):
    usees_df = pd.concat([usees_df, use_part])

In [47]:
# Russian is the default; the two NorDiaChange subsets are then relabelled as Norwegian.
usees_df['language'] = 'Russian'
norwegian_use_rows = usees_df["dataset"].isin(["NorDiaChange1", "NorDiaChange2"])
usees_df.loc[norwegian_use_rows, "language"] = 'Norwegian'

In [48]:
#final judgments dataframe full format
judgments_full = pd.concat([judgment_df, judgements_df], axis = 0)

In [49]:
# Distinct dataset labels present in the merged judgments.
judgments_full["dataset"].unique()

array(['dwug_de', 'dwug_en', 'dwug_sv', 'discowug', 'durel', 'surel',
       'refwug', 'dwug_es', 'diawug', 'chiwug', 'RuDSI', 'rusemshift_1',
       'rusemshift_2', 'rushifteval1', 'rushifteval2', 'rushifteval3',
       'NorDiaChange1', 'NorDiaChange2'], dtype=object)

In [50]:
#final uses dataframe full format
uses_full = pd.concat([usee_df, usees_df], axis=0)

In [51]:
#resetting the index of uses and judgments dataframes because they have repeated indices
judgments_full = judgments_full.reset_index(drop= True)
uses_full = uses_full.reset_index(drop= True)

In [52]:
#final uses and judgments in wug format
judgments_wug = judgments_full[["identifier1", "identifier2", "annotator", "judgment", "comment", "lemma", "dataset", "language"]]
uses_wug= uses_full[['lemma', 'pos', 'date', 'grouping', 'identifier', 'description', 'context', 'indexes_target_token', 'indexes_target_sentence', 'dataset', 'language']]

In [53]:
# Uses that repeat an earlier row in every column (kept for inspection).
dup = uses_full.loc[uses_full.duplicated()]

In [60]:
# Tab-separated export with quoting disabled, without the row index.
judgment_export_options = dict(
    index=False,
    sep='\t',
    encoding='utf-8',
    quoting=csv.QUOTE_NONE,
    quotechar='',
)
judgments_wug.to_csv('final_judgments.csv', **judgment_export_options)

In [61]:
# Tab-separated export with quoting disabled, without the row index.
use_export_options = dict(
    index=False,
    sep='\t',
    encoding='utf-8',
    quoting=csv.QUOTE_NONE,
    quotechar='',
)
uses_wug.to_csv('final_uses.csv', **use_export_options)