In [1]:
# Create a new Jupyter notebook and import pandas. 
import pandas as pd
import numpy as np

In [3]:
# Task 2: Load Basque UD corpus into a DataFrame
import pandas as pd

# Define CoNLL-U field names
field_names = ["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]

# Load the development set into a DataFrame, filtering out empty lines and comments
df = pd.read_csv('eu_bdt-ud-dev.conllu', sep='\t', names=field_names, comment='#', skip_blank_lines=True)

# Task 3: Convert underscore values to appropriate missing data type and remove columns with more than 80% empty values
df.replace('_', pd.NA, inplace=True)
df.dropna(axis=1, thresh=0.8*len(df), inplace=True)

# Task 4: Reduce the dataset to rows representing forms of the auxiliary "izan"
izan_forms = df[df['LEMMA'] == 'izan']

# Task 5: Apply vectorized string operations using regular expressions to convert morphological features
if 'FEATS' in df.columns:
    morphological_features = df['FEATS'].str.extract(r'Number\[([^\]]+)\]=(\w+)').dropna(axis=1, how='all')
    df = pd.concat([df, morphological_features], axis=1)

# Task 6: Query the database for syncretism in the paradigm of "izan"
if 'FEATS' in izan_forms.columns:
    syncretism_check = izan_forms.duplicated(subset=morphological_features.columns, keep=False)
    syncretic_forms = izan_forms[syncretism_check]

# Task 7: Count the occurrences of each form of "izan" in the development set
izan_counts = izan_forms['FORM'].value_counts().reset_index()
izan_counts.columns = ['Form', 'Count']
izan_forms_with_counts = pd.merge(izan_forms, izan_counts, left_on='FORM', right_on='Form')

# Task 8: Split the forms of "izan" into ten bins and remove outliers if necessary
izan_forms_with_counts['Frequency_Bin'] = pd.qcut(izan_forms_with_counts['Count'], q=10, labels=False, duplicates='drop')
izan_forms_no_outliers = izan_forms_with_counts[~izan_forms_with_counts['Count'].isin(izan_forms_with_counts['Count'].value_counts().index[:2])]

# Display the resulting DataFrame
izan_forms_no_outliers

Unnamed: 0,ID,FORM,LEMMA,UPOS,HEAD,DEPREL,Form,Count,Frequency_Bin
490,12,zen,izan,AUX,11,aux,zen,182,5
491,10,zen,izan,AUX,9,aux,zen,182,5
492,5,zen,izan,AUX,4,aux,zen,182,5
493,8,zen,izan,AUX,7,aux,zen,182,5
494,4,zen,izan,AUX,3,aux,zen,182,5
...,...,...,...,...,...,...,...,...,...
1426,9,zitzaien,izan,AUX,8,aux,zitzaien,1,0
1427,5,zinen,izan,AUX,4,aux,zinen,1,0
1428,13,bazara,izan,VERB,6,conj,bazara,1,0
1429,10,izandakoak,izan,AUX,9,cop,izandakoak,1,0
