In [195]:
import pandas as pd

## Figure out how to parse the different verb forms

In [196]:
df = pd.read_csv('diccionari.txt',
                 comment='#',
                 sep=' ', header=None,
                 names=['conj_verb', 'inf_verb', 'code'])

In [197]:
# keep only verbal forms
df = df.loc[df.code.str.startswith('V')]

In [198]:
# filter for only the verbs we care about
cat_sp_df = pd.read_csv('spanish_to_catalan_verbs.csv', sep=',',
                        encoding='utf-8')
cat_sp_df = cat_sp_df[['spanish_verb', 'catalan_verb']]
cat_sp_df.rename({'catalan_verb': 'inf_verb'}, axis=1, inplace=True)
print(len(cat_sp_df.index))
df = df.loc[df.inf_verb.isin(cat_sp_df.inf_verb.tolist())]
print(len(df.inf_verb.unique()))

67
64


In [199]:
# # try just fer for now
# df = df.loc[df.inf_verb=='fer']
# df.head()

In [200]:
# region
df['region'] = 'central'
df.loc[df.code.str.endswith('B'), 'region'] = 'balear'
df.loc[df.code.str.endswith('V'), 'region'] = 'valencia'
df.loc[df.code.str.endswith('6'), 'region'] = 'valencia,balear'

In [201]:
# mood
# I = indicative
# M = imperative
# P = participle
# G = gerund
# N = infinitive
# S = subjunctive

split_char = 'V'
split_offset=1

df['mood'] = df.code.str.split(split_char, n=1, expand=True)[1].str.slice(0+split_offset,1+split_offset)

In [202]:
# gender for participles
df['gender'] = df.code.str.split(split_char, n=1, expand=True)[1].str.slice(4+split_offset,5+split_offset)

In [203]:
# tense
# P = present
# I = past imperfect
# F = future
# S = simple past
# C = conditional
df['tense'] = df.code.str.split(split_char, n=1, expand=True)[1].str.slice(1+split_offset,2+split_offset)

In [204]:
# person (1st 2nd 3rd)
df['person'] = df.code.str.split(split_char, n=1, expand=True)[1].str.slice(2+split_offset,3+split_offset)

In [205]:
# singular or plural
df['sing_or_plural'] = df.code.str.split(split_char, n=1, expand=True)[1].str.slice(3+split_offset,4+split_offset)
df.sing_or_plural.unique()

array(['S', 'P', '0'], dtype=object)

In [206]:
# keep only central forms
df = df.loc[df.region == 'central']

# remove passat simple 
df = df.loc[df.tense != 'S']

# get rid of participles save for male single because the others serve only as adjectives
print(len(df.index))
df = df.loc[~((df.mood == 'P') &
              (df.gender == 'F'))]
df = df.loc[~((df.mood == 'P') &
              (df.sing_or_plural == 'P'))]
print(len(df.index))

# get rid of some weird exceptions

# entès, entés --> keep only central variant
df = df.loc[~((df.inf_verb == 'entendre') &
              (df.mood == 'P') &
              (df.gender == 'M') &
              (df.conj_verb == 'entés'))]

# estat, sigut for ser --> keep only sigut because you can practice estat w/ estar
df = df.loc[~((df.inf_verb == 'ser') &
              (df.mood == 'P') &
              (df.gender == 'M') &
              (df.conj_verb == 'estat'))]

# sent, essent for ser --> keep only sent because I don't like how essent looks
print(len(df.index))
df = df.loc[~((df.inf_verb == 'ser') &
              (df.mood == 'G') &
              (df.conj_verb == 'essent'))]

3931
3735
3733


In [207]:
# add perfet, passat perifrastic, negative command
# by looping through each infinitive verb
for infinitive in df.inf_verb.unique().tolist():
    
    # perfet
    haver = [['he', '1', 'S', 'I', 'perfet'],
             ['has', '2', 'S', 'I', 'perfet'],
             ['ha', '3', 'S', 'I', 'perfet'],
             ['hem', '1', 'P', 'I', 'perfet'],
             ['heu', '2', 'P', 'I', 'perfet'],
             ['han', '3', 'P', 'I', 'perfet']]
    haver_df = pd.DataFrame(data=haver,
                            columns=['haver_verb',
                                     'person', 
                                     'sing_or_plural', 
                                     'mood',
                                     'tense'])
    participle = df.loc[(df.inf_verb==infinitive)&\
                        (df.mood=='P')&\
                        (df.gender=='M')&\
                        (df.sing_or_plural=='S')].conj_verb.unique()
    try:
        assert len(participle) == 1
    except:
        print(infinitive)
        print(participle)
    participle = participle[0]
    
    haver_df['conj_verb'] = haver_df.haver_verb+' '+participle
    haver_df['inf_verb'] = infinitive
    df = pd.concat([df, haver_df], axis=0)
    
    # passat perifrastic
    anar = [['vaig', '1', 'S', 'I', 'passat_perifrastic'],
             ['vas', '2', 'S', 'I', 'passat_perifrastic'],
             ['va', '3', 'S', 'I', 'passat_perifrastic'],
             ['vam', '1', 'P', 'I', 'passat_perifrastic'],
             ['vau', '2', 'P', 'I', 'passat_perifrastic'],
             ['van', '3', 'P', 'I', 'passat_perifrastic']]
    anar_df = pd.DataFrame(data=anar,
                            columns=['anar_verb',
                                     'person', 
                                     'sing_or_plural', 
                                     'mood',
                                     'tense'])
    anar_df['conj_verb'] = anar_df.anar_verb+' '+infinitive
    anar_df['inf_verb'] = infinitive
    df = pd.concat([df, anar_df], axis=0)
    

In [208]:
# negative command -- just the subjunctive present
neg_cmd_df = df.loc[(df.mood=='S')&(df.tense=='P')]
neg_cmd_df['pos_neg_cmd'] = 'neg'
neg_cmd_df['mood'] = 'M'
neg_cmd_df['tense'] = '0'

# add positive / neg designations for commands
df['pos_neg_cmd'] = '0'
df.loc[df.mood=='M', 'pos_neg_cmd'] = 'pos'

# now add 
df = pd.concat([df, neg_cmd_df], axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [209]:
# other synthetic tenses?
# plusquamperfet, passat anterior, passat anterior preifrastic, 
# futur perfet... etc.

In [210]:
# df.loc[(df.mood=='S')&(df.tense=='P')].sort_values(by=['mood', 'person', 'tense', 'sing_or_plural'])

In [211]:
haver_df

Unnamed: 0,haver_verb,person,sing_or_plural,mood,tense,conj_verb,inf_verb
0,he,1,S,I,perfet,he volgut,voler
1,has,2,S,I,perfet,has volgut,voler
2,ha,3,S,I,perfet,ha volgut,voler
3,hem,1,P,I,perfet,hem volgut,voler
4,heu,2,P,I,perfet,heu volgut,voler
5,han,3,P,I,perfet,han volgut,voler


In [212]:
df.loc[(df.inf_verb=='actuar')&\
       (df.tense=='perfet')]

Unnamed: 0,conj_verb,inf_verb,code,region,mood,gender,tense,person,sing_or_plural,haver_verb,anar_verb,pos_neg_cmd
0,he actuat,actuar,,,I,,perfet,1,S,he,,0
1,has actuat,actuar,,,I,,perfet,2,S,has,,0
2,ha actuat,actuar,,,I,,perfet,3,S,ha,,0
3,hem actuat,actuar,,,I,,perfet,1,P,hem,,0
4,heu actuat,actuar,,,I,,perfet,2,P,heu,,0
5,han actuat,actuar,,,I,,perfet,3,P,han,,0


In [213]:
df.loc[(df.inf_verb=='fer')&\
       (df.tense=='perfet')]

Unnamed: 0,conj_verb,inf_verb,code,region,mood,gender,tense,person,sing_or_plural,haver_verb,anar_verb,pos_neg_cmd
0,he fet,fer,,,I,,perfet,1,S,he,,0
1,has fet,fer,,,I,,perfet,2,S,has,,0
2,ha fet,fer,,,I,,perfet,3,S,ha,,0
3,hem fet,fer,,,I,,perfet,1,P,hem,,0
4,heu fet,fer,,,I,,perfet,2,P,heu,,0
5,han fet,fer,,,I,,perfet,3,P,han,,0


In [214]:
df.to_csv('catalan_verbs_parsed.tsv', sep='\t', index=False)