## Syllable count attempt using spaCy syllables

In [18]:
import spacy
from spacy_syllables import SpacySyllables

# Load the small English spaCy model and register the syllable counter right
# after the tagger.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("syllables", after="tagger")

# Sanity check: the pipeline must contain exactly these components in this order.
assert nlp.pipe_names == ["tok2vec", "tagger", "syllables", "parser",  "attribute_ruler", "lemmatizer", "ner"]

# Each line of sentences.txt is expected to look like "<sentence> (<N> syllables)".
# `count` accumulates the reference syllable totals given in the file;
# `text` is the concatenation of the bare sentences.
count = 0
sentence_parts = []
with open("sentences.txt", "r") as file:
    for line in file:
        # A blank line (typically the trailing newline at the end of the file)
        # carries no sentence; skip it instead of failing on the unpacking below.
        if not line.strip():
            continue
        # Split the sentence into the text and syllable count
        sentence, syllable_count = line.rsplit("(", 1)
        # Accept both the plural and the singular suffix, e.g. "(1 syllable)".
        syllable_count = syllable_count.replace(" syllables)", "").replace(" syllable)", "")
        sentence_parts.append(sentence)
        count += int(syllable_count)
# Join once instead of growing the string line by line.
text = "".join(sentence_parts)

# Print the results
print(text)

doc = nlp(text)
# One (text, syllable list, syllable count) tuple per spaCy token.
data = [(token.text, token._.syllables, token._.syllables_count) for token in doc]

# Total of spaCy's per-token syllable counts; tokens without a count are skipped.
# NOTE: the name `sum` shadows the builtin for the rest of the session. It is kept
# because the following cell reads the total under this name.
sum = 0
for el in data:
    if el[2] is not None:
        sum += el[2]

She enjoys reading books and watching movies in her free time. I like to eat pizza and pasta. She sings and plays the guitar beautifully. He ran, jumped, and landed gracefully. My favorite colors are blue, green, and purple. They laughed, talked, and danced all night long. The movie was funny, exciting, and romantic. She walked her dog and went for a jog in the park. The cake was moist, flavorful, and delicious. He worked hard, studied diligently, and earned his degree. The sun was setting, the sky was pink, and the air was warm. She wore a red dress and matching heels to the party. He wrote, directed, and produced the award-winning film. They swam, snorkeled, and sunbathed on the beach. We listened to music, chatted, and had a drink at the bar. She loves to read and write stories. He worked hard and earned a promotion. The dog barked and wagged its tail excitedly. We can go for a walk or watch a movie. She's talented and hardworking, a rare combination. They played soccer and basketba

In [19]:
# Report both totals side by side, then their ratio rounded to two decimals.
# `count` is the reference total read from the file; `sum` is the spaCy total
# (an int bound by the previous cell, not the builtin).
gpt_total = count
spacy_total = sum
print("Syllable count according to GPT: ", gpt_total)
print("Syllable count according to spaCy: ", spacy_total)
proportion = round(spacy_total / gpt_total, 2)
print("Proportion spaCy/GPT: ", proportion)

Syllable count according to GPT:  584
Syllable count according to spaCy:  461
Proportion spaCy/GPT:  0.79


---

## Parsing a batch of the COCA corpus

In [None]:
import re

# Take only the first five lines of the academic sample as a small batch.
with open("coca-samples/text_acad.txt", "r") as file:
    coca = "".join(file.readline() for _ in range(5))

# Markup removed from the raw text, in this order:
#   1. "@@" followed by exactly seven digits
#   2. a run of ten "@" characters, each optionally followed by one whitespace
#   3. the tags <p> and <h>
regex_patterns = [r"@@\d{7}\b", r"@\s?(@\s?){8}@", r"<[ph]>"]
clean_coca = coca
for pattern in regex_patterns:
    clean_coca = re.sub(pattern, "", clean_coca)

print(clean_coca)

In [None]:
from trankit import Pipeline

# Build the trankit pipeline for English with GPU use switched off.
p = Pipeline("english", gpu=False)

In [None]:
# Run the trankit pipeline over the cleaned COCA batch.
# NOTE: the name `all` shadows the Python builtin for the rest of the session;
# later cells read the parser output under this name.
all = p(clean_coca)

In [None]:
# Inspect the first parsed sentence.
all["sentences"][0]

In [None]:
# Dump every parsed sentence to a CoNLL-U style text file: two comment lines
# (sent_id, text), one tab-separated line per token, and a blank line after
# each sentence.
# The file is opened as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open("connllu.txt", "w", encoding="utf-8") as file:
    for sentence in all["sentences"]:
        # Dedicated names so the builtin `id` is not rebound.
        sentence_id = sentence["id"]
        sentence_text = sentence["text"]
        file.write(f"# sent_id = {sentence_id}\n")
        file.write(f"# text = {sentence_text}\n")

        for token in sentence["tokens"]:
            # FEATS, DEPS and MISC are always written as the "_" placeholder.
            columns = (
                token["id"],  # ID
                token["text"],  # FORM
                token["lemma"],  # LEMMA
                token["upos"],  # UPOS
                token["xpos"],  # XPOS
                "_",  # FEATS
                token["head"],  # HEAD
                token["deprel"],  # DEPREL
                "_",  # DEPS
                "_",  # MISC
            )
            file.write("\t".join(str(value) for value in columns) + "\n")

        file.write("\n")

## Export the parser output to XML and read it back

In [None]:
from dict2xml import dict2xml
# Serialise the whole parser output to XML, wrapped in a single <root> element.
xml = dict2xml(all, wrap='root')

# Write the file as UTF-8 explicitly: the cell that reads it back opens it with
# encoding='utf-8', so relying on the platform default here could produce a file
# that cannot be decoded (or fail to encode non-ASCII text) on some systems.
with open("coca_parsed/coca_sample_batch_1.xml", "w", encoding="utf-8") as file:
    file.write(xml)


In [None]:
import xmltodict

# Read the exported XML back in as UTF-8 text and parse it into a nested mapping.
with open("coca_parsed/coca_sample_batch_1.xml", mode="r", encoding="utf-8") as xml_file:
    xml = xml_file.read()

# NOTE: the name `dict` shadows the builtin for the rest of the session; it is
# kept because the next cell reads the parsed document under this name.
dict = xmltodict.parse(xml)

In [None]:
# `dict` here is the parsed XML document from the previous cell (not the builtin);
# "root" is the wrapper element the export added.
parsed = dict["root"]
# Display the sentences recovered from the XML.
parsed["sentences"]

## Build a per-sentence table of coordination info

In [None]:
import pandas as pd

# Load the column layout from a tab-separated file, then create an empty frame
# that has the same columns and no rows.
# NOTE: `data` rebinds the name used earlier for the list of spaCy token tuples.
data = pd.read_csv("columns.csv", sep="\t")
df = pd.DataFrame(columns=data.columns)

In [None]:
sentences = all["sentences"]


def _syllable_total(text):
    """Return the total of spaCy's per-token syllable counts for ``text``.

    Tokens whose ``syllables_count`` is None contribute nothing. A plain loop is
    used on purpose: the name ``sum`` is rebound to an int by an earlier cell.
    """
    total = 0
    for spacy_token in nlp(text):
        n_syllables = spacy_token._.syllables_count
        if n_syllables is not None:
            total += n_syllables
    return total


info = {"sentences": []}

for sentence_number, sentence in enumerate(sentences, start=1):
    # Key insertion order is kept as-is because it determines how the records
    # are displayed.
    coord = {
        "conj_id": [],
        "conj_word": [],
        "parts_syllables": [],
        "parts_tokens": [],
    }
    record = {
        "sent_id": sentence_number,  # 1-based position in the parser output
        "all_syllables": _syllable_total(sentence["text"]),
        "coordination": coord,
    }
    info["sentences"].append(record)

    text_parts = []      # sentence text split at every coordinating conjunction
    current_part = ""    # text collected since the previous conjunction
    tokens_in_part = 0   # tokens seen since the previous conjunction

    for token_id, token in enumerate(sentence["tokens"], start=1):
        tokens_in_part += 1

        # The token whose head is 0 is recorded as the sentence head.
        if token["head"] == 0:
            record["head_id"] = token_id
            record["head_word"] = token["text"]
            record["head_tag"] = token["upos"]
            # Tokens without a "feats" entry get a single space as placeholder.
            record["governor.ms"] = token["feats"] if "feats" in token else " "

        if token["upos"] == "CCONJ":
            # A conjunction closes the current part; the conjunction itself is
            # excluded from both the part's text and its token count.
            coord["conj_id"].append(token_id)
            coord["conj_word"].append(token["text"])
            text_parts.append(current_part)
            current_part = ""
            coord["parts_tokens"].append(tokens_in_part - 1)
            tokens_in_part = 0
        elif token["text"] == ".":
            # A full stop is appended without a trailing space.
            current_part += token["text"]
        else:
            current_part += token["text"] + " "

    # Whatever follows the last conjunction (or the whole sentence when there
    # was none) is the final part.
    text_parts.append(current_part)
    coord["parts_tokens"].append(tokens_in_part)
    coord["coordination_text_parts"] = text_parts

    # Syllable totals are computed per part from the re-joined token text.
    for part in text_parts:
        coord["parts_syllables"].append(_syllable_total(part))

In [20]:
# Display the per-sentence records built in the previous cell.
info["sentences"]

[{'sent_id': 1,
  'all_syllables': 81,
  'coordination': {'conj_id': [],
   'conj_word': [],
   'parts_syllables': [81],
   'parts_tokens': [70],
   'coordination_text_parts': ['I think it is safe to say that ours is the only dining room in West Los Angeles on whose table -- an eight - foot - long , two - hundred - pound behemoth on which I have taken my meals for many years -- rest piles of photocopies of articles on suicide , all of which were printed in the Encyclopaedia Britannica over the past 220 years .']},
  'head_id': 2,
  'head_word': 'think',
  'head_tag': 'VERB',
  'governor.ms': 'Mood=Ind|Tense=Pres|VerbForm=Fin'},
 {'sent_id': 2,
  'all_syllables': 21,
  'coordination': {'conj_id': [],
   'conj_word': [],
   'parts_syllables': [21],
   'parts_tokens': [14],
   'coordination_text_parts': ['They represent the convergence of two crucially important strands in my own life .']},
  'head_id': 2,
  'head_word': 'represent',
  'head_tag': 'VERB',
  'governor.ms': 'Mood=Ind|Tense=

In [None]:
# 1-based sentence ids of the sentences that contain exactly one coordinating
# conjunction.
conj_one_only = [
    entry["sent_id"]
    for entry in info["sentences"]
    if len(entry["coordination"]["conj_id"]) == 1
]

In [None]:
# Show the ids of the sentences with exactly one conjunction.
print(conj_one_only)

In [None]:
# Dump only the sentences with exactly one conjunction to a CoNLL-U style file:
# two comment lines (sent_id, text), one tab-separated line per token, and a
# blank line after each sentence.
# The file is opened as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open("connllu_conj_one.txt", "w", encoding="utf-8") as file:
    for id_conj in conj_one_only:
        # Ids in conj_one_only are 1-based, hence the -1 to index the list.
        sentence = all["sentences"][id_conj-1]
        # Dedicated names so the builtin `id` is not rebound.
        sentence_id = sentence["id"]
        sentence_text = sentence["text"]
        file.write(f"# sent_id = {sentence_id}\n")
        file.write(f"# text = {sentence_text}\n")

        for token in sentence["tokens"]:
            # FEATS, DEPS and MISC are always written as the "_" placeholder.
            columns = (
                token["id"],  # ID
                token["text"],  # FORM
                token["lemma"],  # LEMMA
                token["upos"],  # UPOS
                token["xpos"],  # XPOS
                "_",  # FEATS
                token["head"],  # HEAD
                token["deprel"],  # DEPREL
                "_",  # DEPS
                "_",  # MISC
            )
            file.write("\t".join(str(value) for value in columns) + "\n")

        file.write("\n")

---


In [None]:
def create_rows_from_parser(parsed_dict, columns_info):
    """Append one table row per coordinated sentence to ``columns_info``.

    Parameters
    ----------
    parsed_dict : dict
        Mapping with a ``"sentences"`` list. Each entry must provide
        ``"sent_id"`` and a ``"coordination"`` mapping holding the parallel
        lists ``"coordination_text_parts"`` and ``"parts_tokens"``.
    columns_info : pandas.DataFrame
        Frame to append to. It is modified in place and must have as many
        columns as a row has fields (31).

    Returns
    -------
    pandas.DataFrame
        The same ``columns_info`` object, after all rows have been appended.

    Notes
    -----
    Most fields are still placeholder strings named after the column they
    stand for; only the two conjunct texts, their token counts and the
    sentence id are filled from the parse.

    Sentences with fewer than two parts (no conjunction) are skipped, since
    they have no second conjunct to put in the row.
    """
    df = columns_info
    for sentence in parsed_dict["sentences"]:
        coordination = sentence["coordination"]
        text_parts = coordination["coordination_text_parts"]
        token_counts = coordination["parts_tokens"]

        # Without a second part there is no coordination to tabulate.
        if len(text_parts) < 2 or len(token_counts) < 2:
            continue

        row = ['governor.position', 'governor.word', 'governor.nkjp.tag',
               'governor.pos', 'governor.ms', 'conjunction.word',
               'conjunction.nkjp.tag', 'conjunction.pos', 'conjunction.ms',
               text_parts[0], '1.dep.label', '1.head.word', '1.head.nkjp.tag',
               '1.head.pos', '1.head.ms', '1.words', token_counts[0], '1.chars',
               text_parts[1], '2.dep.label', '2.head.word', '2.head.nkjp.tag',
               # The second conjunct uses its own token count (index 1).
               '2.head.pos', '2.head.ms', '2.words', token_counts[1], '2.chars', "text",
               sentence["sent_id"], 'genre', 'converted.from.file']
        df.loc[len(df)] = row

    # Return only after every sentence has been processed.
    return df

In [None]:
# Fill the table from the per-sentence records; `df` is passed as the frame to append to.
done = create_rows_from_parser(info, df)

In [None]:
# Display the result of create_rows_from_parser.
done

In [26]:
import pandas as pd
class CSVInfo:
    """Holds a template frame and an initially empty frame with the same columns.

    Attributes
    ----------
    template
        The object passed to the constructor, stored unchanged.
    csv : pandas.DataFrame
        Frame with ``template.columns`` as its columns; rows are added through
        ``add_info_row``.
    """

    def __init__(self, template):
        """Store ``template`` and create the empty ``csv`` frame from its columns."""
        self.template = template
        self.csv = pd.DataFrame(columns=template.columns)

    def add_info_row(self, info):
        """Append ``info`` as a new row of ``csv``.

        The row is written at the label equal to the current number of rows.
        """
        next_label = len(self.csv)
        self.csv.loc[next_label] = info



In [31]:
def create_csv_template(columns_info_path):
    """Read the CSV file at ``columns_info_path`` with pandas' defaults and return the frame."""
    return pd.read_csv(columns_info_path)

In [33]:
# Load the template CSV and wrap it in a CSVInfo holder, which starts with no rows.
# NOTE(review): the path is an absolute, machine-specific location; the cell will
# not run on another machine without editing it.
template = create_csv_template("/home/adglo/nlp/csv/UD_Polish-LFG.csv")
info_csv = CSVInfo(template)

In [34]:
# Display the holder's frame (columns only, before any row is added).
info_csv.csv

Unnamed: 0,governor.position,governor.word,governor.tag,governor.pos,governor.ms,conjunction.word,conjunction.tag,conjunction.pos,conjunction.ms,no.conjuncts,...,R.head.pos,R.head.ms,R.words,R.tokens,R.syllables,R.chars,sentence,sent_id,genre,converted.from.file


In [35]:
# Placeholder row with 34 fields: every field is a single space except
# position 5, which holds a list of conjunction words.
# NOTE(review): position 5 is the sixth column, which appears to be `governor.ms`
# rather than `conjunction.word` if the template keeps its leading `Unnamed: 0`
# column — TODO confirm the intended position.
line = [" "] * 34
line[5] = ["and", "and"]


In [38]:
# Append the placeholder row, then show how many columns the frame has.
# NOTE: re-running this cell appends another row each time, because
# add_info_row always writes at the label len(csv).
info_csv.add_info_row(line)
len(info_csv.csv.columns)

34

In [40]:
# Display the holder's frame after the placeholder row has been added.
info_csv.csv

Unnamed: 0,governor.position,governor.word,governor.tag,governor.pos,governor.ms,conjunction.word,conjunction.tag,conjunction.pos,conjunction.ms,no.conjuncts,...,R.head.pos,R.head.ms,R.words,R.tokens,R.syllables,R.chars,sentence,sent_id,genre,converted.from.file
0,,,,,,"[and, and]",,,,,...,,,,,,,,,,


271, 385, 418, 465, 532, 59