# CONLL to Sketch

## Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive/ so the Drive paths used in the cells
# below can be opened like local files. The google.colab module is only
# importable when this notebook runs inside Google Colab.
from google.colab import drive
drive.mount('/content/drive/')

## Preview input CONLL file

In [None]:
# Google Drive path to the CoNLL-U file to inspect
conllu_file = '/content/drive/Shareddrives/AppLingLT/2024/SketchEngine/ari.conllu'

# Read the whole file into memory, then print it once the handle is closed
with open(conllu_file, encoding='utf-8') as file:
    content = file.read()
print(content)


# Columns: id, form, lemma, upos, xpos, feats, head, deprel, deps, misc

## Convert input CONLL to VERT and preview output


<!--
Reference: https://www.sketchengine.eu/documentation/building-sketches-from-parsed-corpora/

Transform, e.g.:

```
id  form  lemma upos  xpos  feats head  deprel
==============================================

# sent_id = 1
# text = Il gatto nero corre veloce.
1	Il	il	DET	_	Gender=Masc|Number=Sing	2	det	_	_
2	gatto	gatto	NOUN	_	Gender=Masc|Number=Sing	3	nsubj	_	_
3	nero	nero	ADJ	_	Gender=Masc|Number=Sing	2	amod	_	_
4	corre	correre	VERB	_	Number=Sing|Person=3|Tense=Pres|Mood=Ind	0	root	_	_
5	veloce	veloce	ADJ	_	Gender=Masc|Number=Sing	4	advmod	_	_
6	.	.	PUNCT	_	_	4	punct	_	_

# sent_id = 2
# text = Lei legge un libro interessante.
1	Lei	lei	PRON	_	Case=Nom|Number=Sing|Person=3	2	nsubj	_	_
2	legge	leggere	VERB	_	Number=Sing|Person=3|Tense=Pres|Mood=Ind	0	root	_	_
3	un	un	DET	_	Gender=Masc|Number=Sing	4	det	_	_
4	libro	libro	NOUN	_	Gender=Masc|Number=Sing	2	obj	_	_
5	interessante	interessante	ADJ	_	Gender=Masc|Number=Sing	4	amod	_	_
6	.	.	PUNCT	_	_	2	punct	_	_
```

Into, e.g.:

```
id   word(form)  lempos(lemma)  tag(upos)  fineTag(feats) head deprel
======================================================================

<s id="1">
1	Il	il	DET	Gender=Masc|Number=Sing	2	det
2	gatto	gatto	NOUN	Gender=Masc|Number=Sing	3	nsubj
3	nero	nero	ADJ	Gender=Masc|Number=Sing	2	amod
4	corre	correre	VERB	Number=Sing|Person=3|Tense=Pres|Mood=Ind	0	root
5	veloce	veloce	ADJ	Gender=Masc|Number=Sing	4	advmod
6	.	.	PUNCT	_	4	punct
</s>


<s id="2">
1	Lei	lei	PRON	Case=Nom|Number=Sing|Person=3	2	nsubj
2	legge	leggere	VERB	Number=Sing|Person=3|Tense=Pres|Mood=Ind	0	root
3	un	un	DET	Gender=Masc|Number=Sing	4	det
4	libro	libro	NOUN	Gender=Masc|Number=Sing	2	obj
5	interessante	interessante	ADJ	Gender=Masc|Number=Sing	4	amod
6	.	.	PUNCT	_	2	punct
</s>
```

The only columns that are mandatory for generating word sketches are:
- `id`: this represents the id/position of the current word. The name of this column can be overridden by setting the IDATTR configuration directive.
- One positional attribute (probably `word`, `lemma` or `lempos`) used for generating the sketches. The name of this attribute is set by WSATTR configuration directive.
- `head`: this represents the parent node id of the current word. The name of this column can be overridden by setting the HEADATTR configuration directive.
- `deprel`: this represents the relation by which the current node and parent node are connected. The name of this column can be overridden by setting the DEPRELATTR configuration directive.
-->


### Convert one file only

In [None]:
# Define the function to transform CoNLL-U to the specified format
def transform_conllu_to_vert(input_file, output_file):
    """Convert a CoNLL-U file into a vertical (.vert) file.

    The whole output is wrapped in one ``<doc>`` element. Every
    ``# sent_id = X`` comment opens ``<s id="X">``; that sentence is closed
    by the next blank line, by the next ``# sent_id`` comment, or at the end
    of the file. All other ``#`` comment lines are dropped. Every remaining
    non-empty line is split on tabs and written as five tab-separated
    columns: FORM, UPOS, LEMMA, FEATS, DEPREL.

    NOTE(review): the ID and HEAD columns are not written. The reference
    notes earlier in this notebook list id/head/deprel as mandatory for
    word sketches -- confirm this column layout matches the corpus
    configuration.

    Args:
        input_file: Path of the UTF-8 CoNLL-U file to read.
        output_file: Path of the vertical file to write; an existing file
            is overwritten.

    Raises:
        IndexError: If a token line has fewer than 8 tab-separated fields,
            or a ``# sent_id`` line contains no ``=``.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        # Track the open <s> explicitly instead of relying on the truthiness
        # of the ID string, so a sentence with an empty ID is still closed.
        sentence_open = False

        # Write the opening <doc> tag
        outfile.write('<doc>\n')  # Add metadata to <doc> if/as needed, e.g., '<doc id="N" date="2024">\n'

        for line in infile:  # iterate over each line, stripping surrounding whitespace before transformation
            line = line.strip()

            if line.startswith("# sent_id"):
                # Split on the first '=' only, so IDs that themselves contain
                # '=' are kept whole.
                sentence_id = line.split('=', 1)[1].strip()
                # A new sentence may start without a blank separator line;
                # close the previous one so <s> elements never nest.
                if sentence_open:
                    outfile.write("</s>\n")
                outfile.write(f'<s id="{sentence_id}">\n')
                sentence_open = True

            elif line.startswith("#"):
                # Skip all other lines starting with '#'
                continue

            elif line:
                # Token line. CoNLL-U columns by index:
                # 0 ID, 1 FORM, 2 LEMMA, 3 UPOS, 4 XPOS, 5 FEATS, 6 HEAD, 7 DEPREL
                fields = line.split('\t')
                word_data = [
                    fields[1],  # FORM
                    fields[3],  # UPOS
                    fields[2],  # LEMMA
                    fields[5],  # FEATS
                    fields[7],  # DEPREL
                ]
                outfile.write("\t".join(word_data) + "\n")

            elif sentence_open:
                # A blank line ends the current sentence
                outfile.write("</s>\n")
                sentence_open = False

        # Ensure the last sentence is closed if the file did not end with a blank line
        if sentence_open:
            outfile.write("</s>\n")

        # Write the closing </doc> tag
        outfile.write("</doc>\n")


# Function to preview the contents of the output file
def preview_file(file_path, num_lines=10):
    """Print a header followed by at most ``num_lines`` leading lines of a file.

    Args:
        file_path: Path of the UTF-8 text file to preview.
        num_lines: Maximum number of lines to print (default 10).
    """
    print(f"Previewing the first {num_lines} lines of {file_path}:\n")
    with open(file_path, 'r', encoding='utf-8') as handle:
        remaining = num_lines
        for text in handle:
            if remaining <= 0:
                break
            # Lines keep their own newline, so suppress the one print() adds
            print(text, end="")
            remaining -= 1

# Run the function with your file paths
input_file = '/content/drive/Shareddrives/AppLingLT/2024/SketchEngine/ari.conllu'
output_file = '/content/drive/Shareddrives/AppLingLT/2024/SketchEngine/ari.vert'

# Convert the file, then show up to the first 500 lines of the result
transform_conllu_to_vert(input_file=input_file, output_file=output_file)
preview_file(file_path=output_file, num_lines=500)

### Convert multiple files

In [None]:
#### UNTESTED!

import os

# Define the function to transform CoNLL-U to the specified format
def transform_conllu_to_vert(input_file, output_file):
    """Convert a CoNLL-U file into a vertical (.vert) file.

    The whole output is wrapped in one ``<doc>`` element. Every
    ``# sent_id = X`` comment opens ``<s id="X">``; that sentence is closed
    by the next blank line, by the next ``# sent_id`` comment, or at the end
    of the file. All other ``#`` comment lines are dropped. Every remaining
    non-empty line is split on tabs and written as five tab-separated
    columns: FORM, UPOS, LEMMA, FEATS, DEPREL.

    NOTE(review): the ID and HEAD columns are not written. The reference
    notes earlier in this notebook list id/head/deprel as mandatory for
    word sketches -- confirm this column layout matches the corpus
    configuration.

    Args:
        input_file: Path of the UTF-8 CoNLL-U file to read.
        output_file: Path of the vertical file to write; an existing file
            is overwritten.

    Raises:
        IndexError: If a token line has fewer than 8 tab-separated fields,
            or a ``# sent_id`` line contains no ``=``.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        # Track the open <s> explicitly instead of relying on the truthiness
        # of the ID string, so a sentence with an empty ID is still closed.
        sentence_open = False

        # Write the opening <doc> tag
        outfile.write("<doc>\n")

        for line in infile:  # iterate over each line, stripping surrounding whitespace before transformation
            line = line.strip()

            if line.startswith("# sent_id"):
                # Split on the first '=' only, so IDs that themselves contain
                # '=' are kept whole.
                sentence_id = line.split('=', 1)[1].strip()
                # A new sentence may start without a blank separator line;
                # close the previous one so <s> elements never nest.
                if sentence_open:
                    outfile.write("</s>\n")
                outfile.write(f'<s id="{sentence_id}">\n')
                sentence_open = True

            elif line.startswith("#"):
                # Skip all other lines starting with '#'
                continue

            elif line:
                # Token line. CoNLL-U columns by index:
                # 0 ID, 1 FORM, 2 LEMMA, 3 UPOS, 4 XPOS, 5 FEATS, 6 HEAD, 7 DEPREL
                fields = line.split('\t')
                word_data = [
                    fields[1],  # FORM
                    fields[3],  # UPOS
                    fields[2],  # LEMMA
                    fields[5],  # FEATS
                    fields[7],  # DEPREL
                ]
                outfile.write("\t".join(word_data) + "\n")

            elif sentence_open:
                # A blank line ends the current sentence
                outfile.write("</s>\n")
                sentence_open = False

        # Ensure the last sentence is closed if the file did not end with a blank line
        if sentence_open:
            outfile.write("</s>\n")

        # Write the closing </doc> tag
        outfile.write("</doc>\n")


# Function to preview the contents of the output file
def preview_file(file_path, num_lines=10):
    """Print a header followed by at most ``num_lines`` leading lines of a file.

    Args:
        file_path: Path of the UTF-8 text file to preview.
        num_lines: Maximum number of lines to print (default 10).
    """
    print(f"Previewing the first {num_lines} lines of {file_path}:\n")
    with open(file_path, 'r', encoding='utf-8') as handle:
        remaining = num_lines
        for text in handle:
            if remaining <= 0:
                break
            # Lines keep their own newline, so suppress the one print() adds
            print(text, end="")
            remaining -= 1

# Define a list of input files and corresponding output files
input_directory = '/content/drive/Shareddrives/AppLingLT/2024/SketchEngine/'
input_files = [
    'ari.conllu',
    'second_file.conllu',  # Add more files as needed
    'third_file.conllu'
]

# Process each file
for input_file in input_files:
    input_path = os.path.join(input_directory, input_file)
    # Swap only the final extension for '.vert'. A plain str.replace would
    # leave names without '.conllu' unchanged (output == input) and would
    # also rewrite any '.conllu' occurring earlier in the name.
    output_name = os.path.splitext(input_file)[0] + '.vert'
    output_path = os.path.join(input_directory, output_name)

    # The converter opens its output for writing, which would truncate the
    # input if both paths were the same file (e.g. an input already named
    # '*.vert'), so refuse to convert a file onto itself.
    if output_path == input_path:
        print(f"Skipping {input_path}: output path would overwrite the input file")
        continue

    # Transform the file
    transform_conllu_to_vert(input_path, output_path)

    # Preview the output file
    preview_file(output_path, num_lines=50)  # Preview the first 50 lines of each output