In [1]:
import pandas as pd

## Figure out how to parse the different verb forms

In [19]:
df = pd.read_csv('diccionari.txt',
                 comment='#',
                 sep=' ', header=None,
                 names=['conj_verb', 'inf_verb', 'code'])

In [20]:
# keep only verbal forms
df = df.loc[df.code.str.startswith('VM')]

In [21]:
# try just fer for now
df = df.loc[df.inf_verb=='fer']
df.head()

Unnamed: 0,conj_verb,inf_verb,code
655777,fa,fer,VMIP3S00
656048,facem,fer,VMM01P0B
656049,facem,fer,VMSP1P0B
656050,facen,fer,VMM03P0V
656051,facen,fer,VMSP3P0V


In [22]:
# region
df['region'] = 'central'
df.loc[df.code.str.endswith('B'), 'region'] = 'balear'
df.loc[df.code.str.endswith('V'), 'region'] = 'valencia'

In [23]:
# mood
# I = indicative
# M = imperative
# P = participle
# G = gerund
# N = infinitive
# S = subjunctive

df['mood'] = df.code.str.split('VM', expand=True)[1].str.slice(0,1)

In [24]:
# gender for participles
df['gender'] = df.code.str.split('VM', expand=True)[1].str.slice(4,5)

In [25]:
# tense
# P = present
# I = past imperfect
# F = future
# S = simple past
# C = conditional
df['tense'] = df.code.str.split('VM', expand=True)[1].str.slice(1,2)

In [26]:
# person (1st 2nd 3rd)
df['person'] = df.code.str.split('VM', expand=True)[1].str.slice(2,3)

In [27]:
# singular or plural
df['sing_or_plural'] = df.code.str.split('VM', expand=True)[1].str.slice(3,4)
df.sing_or_plural.unique()

array(['S', 'P', '0'], dtype=object)

In [28]:
# keep only central forms
df = df.loc[df.region == 'central']

# remove passat simple 
df = df.loc[df.tense != 'S']

In [29]:
# add perfet, passat perifrastic, negative command

In [30]:
# perfet
haver = [['he', '1', 'S', 'I', 'perfet'],
         ['has', '2', 'S', 'I', 'perfet'],
         ['ha', '3', 'S', 'I', 'perfet'],
         ['hem', '1', 'P', 'I', 'perfet'],
         ['heu', '2', 'P', 'I', 'perfet'],
         ['han', '3', 'P', 'I', 'perfet']]
haver_df = pd.DataFrame(data=haver,
                        columns=['haver_verb',
                                 'person', 
                                 'sing_or_plural', 
                                 'mood',
                                 'tense'])
infinitive = df.loc[(df.mood=='N')].conj_verb.values[0]
participle = df.loc[(df.mood=='P')&\
                    (df.gender=='M')&\
                    (df.sing_or_plural=='S')].conj_verb.values[0]
haver_df['conj_verb'] = haver_df.haver_verb+' '+participle
haver_df['inf_verb'] = infinitive
df = pd.concat([df, haver_df], axis=0)

In [31]:
# passat perifrastic
anar = [['vaig', '1', 'S', 'I', 'passat_perifrastic'],
         ['vas', '2', 'S', 'I', 'passat_perifrastic'],
         ['va', '3', 'S', 'I', 'passat_perifrastic'],
         ['vam', '1', 'P', 'I', 'passat_perifrastic'],
         ['vau', '2', 'P', 'I', 'passat_perifrastic'],
         ['van', '3', 'P', 'I', 'passat_perifrastic']]
anar_df = pd.DataFrame(data=anar,
                        columns=['anar_verb',
                                 'person', 
                                 'sing_or_plural', 
                                 'mood',
                                 'tense'])
infinitive = df.loc[(df.mood=='N')].conj_verb.values[0]
anar_df['conj_verb'] = anar_df.anar_verb+' '+infinitive
anar_df['inf_verb'] = infinitive
df = pd.concat([df, anar_df], axis=0)

In [32]:
# negative command -- just the subjunctive present
neg_cmd_df = df.loc[(df.mood=='S')&(df.tense=='P')]
neg_cmd_df['pos_neg_cmd'] = 'neg'
neg_cmd_df['mood'] = 'M'
neg_cmd_df['tense'] = '0'

# add positive / neg designations for commands
df['pos_neg_cmd'] = '0'
df.loc[df.mood=='M', 'pos_neg_cmd'] = 'pos'

# now add 
df = pd.concat([df, neg_cmd_df], axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [33]:
# other synthetic tenses?
# plusquamperfet, passat anterior, passat anterior preifrastic, 
# futur perfet... etc.

In [34]:
df.loc[(df.mood=='S')&(df.tense=='P')].sort_values(by=['mood', 'person', 'tense', 'sing_or_plural'])

Unnamed: 0,conj_verb,inf_verb,code,region,mood,gender,tense,person,sing_or_plural,haver_verb,anar_verb,pos_neg_cmd
663519,fem,fer,VMSP1P00,central,S,0,P,1,P,,,0
656143,faci,fer,VMSP1S0Y,central,S,0,P,1,S,,,0
666836,feu,fer,VMSP2P00,central,S,0,P,2,P,,,0
656280,facis,fer,VMSP2S0Y,central,S,0,P,2,S,,,0
656253,facin,fer,VMSP3P0Y,central,S,0,P,3,P,,,0
656144,faci,fer,VMSP3S0Y,central,S,0,P,3,S,,,0


In [35]:
df.to_csv('fer_parsed.tsv', sep='\t', index=False)
