### Importing the necessary modules

In [1]:
import pandas as pd
import random

### Filtering the Dataset

Filtering the shifts dataset so that it only includes shifts of type "Semantic evolution" and "Microevolution".

In [2]:
#Loading the full shifts dataset
data = pd.read_csv('DatSemShift.csv')

#Keeping only the rows whose Type is one of the two shift types of interest.
#Note that the labels are matched with a leading space, exactly as written here.
kept_types = [' Semantic evolution', ' Microevolution']
SC_data = data[data['Type'].isin(kept_types)]

### Merging the Shifts Data

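Merging the shifts that have the same shift ID into the same row. The code takes the filtered shifts data and works in two steps: it first groups the rows that share a shift ID, source language, source lexeme and source meaning, and then groups the result by shift ID alone, merging the remaining columns so that each shift ends up in a single row.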

In [3]:
#Dropping every column that is not needed for the merge
needed_columns = ['ID', 'Type', 'Language_1', 'Lexeme_1', 'Meaning_1', 'Direction', 'Language_2', 'Lexeme_2', 'Meaning_2']
SC_data = SC_data[needed_columns]


def comma_join_unique(values):
    """Join the distinct values of one group into a single comma-separated string."""
    return ','.join(values.unique())


#Rows that agree on ID, Language 1, Lexeme 1 and Meaning 1 are collapsed into one row.
#Their Language 2 and Lexeme 2 values are merged into comma-separated strings, while
#Meaning 2, Type and Direction are taken from the first row of each group.
realization_keys = ['ID', 'Language_1', 'Lexeme_1', 'Meaning_1']
merged = (
    SC_data
    .groupby(realization_keys)
    .agg({
        'Language_2': comma_join_unique,
        'Lexeme_2':   comma_join_unique,
        'Meaning_2':  'first',
        'Type':       'first',
        'Direction':  'first',
    })
    .reset_index()
)
merged

Unnamed: 0,ID,Language_1,Lexeme_1,Meaning_1,Language_2,Lexeme_2,Meaning_2,Type,Direction
0,shift0007,Ancient Greek,μετατρέπω,to turn back,Modern Greek,μετατρέπω,to transform,Semantic evolution,→
1,shift0007,Irish Gaelic,iompaigh,"to turn, rotate (intr.)",Irish Gaelic,iompaigh,to become,Semantic evolution,→
2,shift0008,Irish Gaelic,eolas,to know,Irish Gaelic,eolas,to know how,Semantic evolution,→
3,shift0013,Irish Gaelic,comhair,"to calculate, count",Irish Gaelic,comhair,to take into account,Semantic evolution,→
4,shift0016,Irish Gaelic,milis,sweet (taste),Irish Gaelic,milis,"dear, darling",Semantic evolution,→
...,...,...,...,...,...,...,...,...,...
279,shift4646,Welsh,drych,to see/to look at,Welsh,drych,mirror,Semantic evolution,→
280,shift4647,Portuguese,dona,"female owner, mistress",Brazilian Portuguese,dona,wife,Semantic evolution,→
281,shift4663,Latin,libet (impers.),to want,Medieval Latin,quodlibet,anything,Semantic evolution,→
282,shift4720,Old East Slavic,дрѣмѫчии,sleeping,Old East Slavic,дремучий,dense (forest),Semantic evolution,→


In [4]:
#So that no information about the individual realizations of a shift is lost in the next
#merge, each new column stores a value together with the label it belongs to, in the
#form "label: [value]". The mapping reads: new column -> (label column, value column).
labelled_columns = {
    'Lexemes_1':   ('Language_1', 'Lexeme_1'),
    'Meanings_1':  ('Lexeme_1',   'Meaning_1'),
    'Languages_2': ('Language_1', 'Language_2'),
    'Lexemes_2':   ('Lexeme_1',   'Lexeme_2'),
    'Meanings_2':  ('Lexeme_2',   'Meaning_2'),
}

for new_column, (label_column, value_column) in labelled_columns.items():
    merged[new_column] = merged[label_column] + ': [' + merged[value_column] + ']'

merged

Unnamed: 0,ID,Language_1,Lexeme_1,Meaning_1,Language_2,Lexeme_2,Meaning_2,Type,Direction,Lexemes_1,Meanings_1,Languages_2,Lexemes_2,Meanings_2
0,shift0007,Ancient Greek,μετατρέπω,to turn back,Modern Greek,μετατρέπω,to transform,Semantic evolution,→,Ancient Greek: [μετατρέπω],μετατρέπω: [to turn back],Ancient Greek: [Modern Greek],μετατρέπω: [μετατρέπω],μετατρέπω: [to transform]
1,shift0007,Irish Gaelic,iompaigh,"to turn, rotate (intr.)",Irish Gaelic,iompaigh,to become,Semantic evolution,→,Irish Gaelic: [iompaigh],"iompaigh: [to turn, rotate (intr.)]",Irish Gaelic: [Irish Gaelic],iompaigh: [iompaigh],iompaigh: [to become]
2,shift0008,Irish Gaelic,eolas,to know,Irish Gaelic,eolas,to know how,Semantic evolution,→,Irish Gaelic: [eolas],eolas: [to know],Irish Gaelic: [Irish Gaelic],eolas: [eolas],eolas: [to know how]
3,shift0013,Irish Gaelic,comhair,"to calculate, count",Irish Gaelic,comhair,to take into account,Semantic evolution,→,Irish Gaelic: [comhair],"comhair: [to calculate, count]",Irish Gaelic: [Irish Gaelic],comhair: [comhair],comhair: [to take into account]
4,shift0016,Irish Gaelic,milis,sweet (taste),Irish Gaelic,milis,"dear, darling",Semantic evolution,→,Irish Gaelic: [milis],milis: [sweet (taste)],Irish Gaelic: [Irish Gaelic],milis: [milis],"milis: [dear, darling]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,shift4646,Welsh,drych,to see/to look at,Welsh,drych,mirror,Semantic evolution,→,Welsh: [drych],drych: [to see/to look at],Welsh: [Welsh],drych: [drych],drych: [mirror]
280,shift4647,Portuguese,dona,"female owner, mistress",Brazilian Portuguese,dona,wife,Semantic evolution,→,Portuguese: [dona],"dona: [female owner, mistress]",Portuguese: [Brazilian Portuguese],dona: [dona],dona: [wife]
281,shift4663,Latin,libet (impers.),to want,Medieval Latin,quodlibet,anything,Semantic evolution,→,Latin: [libet (impers.)],libet (impers.): [to want],Latin: [Medieval Latin],libet (impers.): [quodlibet],quodlibet: [anything]
282,shift4720,Old East Slavic,дрѣмѫчии,sleeping,Old East Slavic,дремучий,dense (forest),Semantic evolution,→,Old East Slavic: [дрѣмѫчии],дрѣмѫчии: [sleeping],Old East Slavic: [Old East Slavic],дрѣмѫчии: [дремучий],дремучий: [dense (forest)]


In [5]:
def _unique_joined_by(separator):
    """Build an aggregator that joins the distinct values of a group with `separator`."""
    def aggregate(values):
        return separator.join(values.unique())
    return aggregate


#The labelled columns are joined with ', ' and the plain meaning columns with ','
spaced = _unique_joined_by(', ')
tight = _unique_joined_by(',')

#Collapsing all realizations of a shift into one row per shift ID.
#Type and Direction are taken from the first realization of each shift.
merged2 = (
    merged
    .groupby(['ID'])
    .agg({
        'Language_1':  spaced,
        'Lexemes_1':   spaced,
        'Meanings_1':  spaced,
        'Meaning_1':   tight,
        'Languages_2': spaced,
        'Lexemes_2':   spaced,
        'Meanings_2':  spaced,
        'Meaning_2':   tight,
        'Type':        'first',
        'Direction':   'first',
    })
    .reset_index()
)

merged2

Unnamed: 0,ID,Language_1,Lexemes_1,Meanings_1,Meaning_1,Languages_2,Lexemes_2,Meanings_2,Meaning_2,Type,Direction
0,shift0007,"Ancient Greek, Irish Gaelic","Ancient Greek: [μετατρέπω], Irish Gaelic: [iom...","μετατρέπω: [to turn back], iompaigh: [to turn,...","to turn back,to turn, rotate (intr.)","Ancient Greek: [Modern Greek], Irish Gaelic: [...","μετατρέπω: [μετατρέπω], iompaigh: [iompaigh]","μετατρέπω: [to transform], iompaigh: [to become]","to transform,to become",Semantic evolution,→
1,shift0008,Irish Gaelic,Irish Gaelic: [eolas],eolas: [to know],to know,Irish Gaelic: [Irish Gaelic],eolas: [eolas],eolas: [to know how],to know how,Semantic evolution,→
2,shift0013,Irish Gaelic,Irish Gaelic: [comhair],"comhair: [to calculate, count]","to calculate, count",Irish Gaelic: [Irish Gaelic],comhair: [comhair],comhair: [to take into account],to take into account,Semantic evolution,→
3,shift0016,Irish Gaelic,Irish Gaelic: [milis],milis: [sweet (taste)],sweet (taste),Irish Gaelic: [Irish Gaelic],milis: [milis],"milis: [dear, darling]","dear, darling",Semantic evolution,→
4,shift0035,"French, Irish Gaelic","French: [considérer], Irish Gaelic: [feic]","considérer: [to stare, to gaze], feic: [to see...","to stare, to gaze,to see/to look at","French: [French], Irish Gaelic: [Irish Gaelic]","considérer: [considérer], feic: [feic]","considérer: [to consider], feic: [to have opin...","to consider,to have opinion",Semantic evolution,→
...,...,...,...,...,...,...,...,...,...,...,...
225,shift4646,Welsh,Welsh: [drych],drych: [to see/to look at],to see/to look at,Welsh: [Welsh],drych: [drych],drych: [mirror],mirror,Semantic evolution,→
226,shift4647,Portuguese,Portuguese: [dona],"dona: [female owner, mistress]","female owner, mistress",Portuguese: [Brazilian Portuguese],dona: [dona],dona: [wife],wife,Semantic evolution,→
227,shift4663,Latin,Latin: [libet (impers.)],libet (impers.): [to want],to want,Latin: [Medieval Latin],libet (impers.): [quodlibet],quodlibet: [anything],anything,Semantic evolution,→
228,shift4720,Old East Slavic,Old East Slavic: [дрѣмѫчии],дрѣмѫчии: [sleeping],sleeping,Old East Slavic: [Old East Slavic],дрѣмѫчии: [дремучий],дремучий: [dense (forest)],dense (forest),Semantic evolution,→


In [6]:
#Rearranging the columns: Type goes right after ID, and Direction sits between the
#"_1" columns and the "_2" columns.
#The columns are selected by name rather than by position so that re-running this cell
#always yields the same order (a positional reorder applied to its own result would
#shuffle the columns again), and so that a missing column raises an error instead of
#silently picking the wrong one.
column_order = ['ID', 'Type',
                'Language_1', 'Lexemes_1', 'Meanings_1', 'Meaning_1',
                'Direction',
                'Languages_2', 'Lexemes_2', 'Meanings_2', 'Meaning_2']
merged2 = merged2[column_order]

#Saving the merged shifts; the row index is written out as the first, unnamed column
merged2.to_csv('MergedShifts.csv')

### Sampling the Data

Randomly sampling 200 rows from the merged shifts dataframe for annotation. 100 of these rows are saved with all of their information, and the other 100 rows are saved without any language information.

In [8]:
#Fixing the seed so that every run draws the same sample
random.seed(3)

#Loading the merged shifts
data = pd.read_csv('MergedShifts.csv')

#The 'Unnamed: 0' column holds the row number, which is unique to each row, so it is
#used as the sampling key. Selecting it with double brackets gives a list of one-item lists.
id_nums = data[['Unnamed: 0']].values.tolist()

#Flattening the one-item lists into a plain list of row numbers
ids = [row[0] for row in id_nums]

#Drawing 200 row numbers in total
sample_id = random.sample(ids, 200)

#Drawing 100 of those 200 row numbers for the sample that keeps all information
id_lang = random.sample(sample_id, 100)

#The sampled row numbers that were not drawn above form the second sample
id_nolang = list(set(sample_id) - set(id_lang))

#Columns kept in each sample. Neither list contains the row number column.
#The first sample keeps the language and lexeme information but not the plain meaning columns;
#the second sample keeps only the plain meaning columns, without language or lexeme information.
lang_columns = ['ID', 'Language_1', 'Lexemes_1', 'Meanings_1', 'Direction',
                'Languages_2', 'Lexemes_2', 'Meanings_2']
nolang_columns = ['ID', 'Meaning_1', 'Direction', 'Meaning_2']

#Picking the rows of each sample by row number, together with the columns to keep
row_number = data['Unnamed: 0']
lang = data.loc[row_number.isin(id_lang), lang_columns]
nolang = data.loc[row_number.isin(id_nolang), nolang_columns]

#Saving the sample with language information
lang.to_csv('RandomShiftsLang.csv')

#Saving the sample without language information
nolang.to_csv('RandomShifts.csv')
