In [7]:
import nltk 
from nltk import Tree

In [8]:
def tokenize(sentence):
    """
    Split ``sentence`` into whitespace-delimited tokens.

    Splitting happens on runs of whitespace only, so punctuation stays
    attached to the neighbouring word (``"dog."`` is one token).

    Args:
        sentence: The text to split.

    Returns:
        The tokens in their original order; an empty list when the text
        is empty or contains only whitespace.
    """
    words = sentence.split()
    return words

In [9]:
def build_parse_tree(tokens):
    """Build a tree over ``tokens`` by recursively splitting at the middle.

    No grammar is involved: the token at index ``len(tokens) // 2`` labels
    the node, the tokens before it form the left subtree and the tokens
    after it form the right subtree.

    Args:
        tokens: Sequence of tokens (must support ``len`` and slicing).

    Returns:
        ``None`` for an empty sequence, a childless ``Tree`` for a single
        token, otherwise a ``Tree`` with one or two child subtrees.
    """
    if not tokens:
        return None
    if len(tokens) == 1:
        return Tree(tokens[0], [])

    middle = len(tokens) // 2
    halves = (tokens[:middle], tokens[middle + 1:])

    # An empty half yields None and is left out, so a node may have one child.
    subtrees = []
    for half in halves:
        subtree = build_parse_tree(half)
        if subtree is not None:
            subtrees.append(subtree)
    return Tree(tokens[middle], subtrees)


In [10]:
# Example sentence used by the tree, role-mapping and index-table cells.
sentence = "The quick brown fox jumps over the lazy dog"
tokens = tokenize(sentence)
print(tokens)

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']


In [11]:
# Build the midpoint-split tree for the example tokens, then draw it as ASCII art
# under its heading.
syntax_tree = build_parse_tree(tokens)
print("The manual syntax tree")
syntax_tree.pretty_print()

The manual syntax tree
                jumps              
         _________|_________        
      brown                lazy    
   _____|____           ____|____   
quick        |        the        | 
  |          |         |         |  
 The        fox       over      dog
  |          |         |         |  
 ...        ...       ...       ...



In [12]:

# Word roles in the sentence (first step of the transformation process).
# The labels are hand-written and positional: they are paired with the first
# nine tokens in order. A shorter token list gets "unknown" for every word.
if len(tokens) < 9:
    roles = {token: "unknown" for token in tokens}
else:
    roles = dict(
        zip(
            tokens,
            ("det", "adj", "adj", "nsub", "verb", "prep", "det", "adj", "obj"),
        )
    )

print("\nWord role mapping:")
for word, role in roles.items():
    print(f"{word}: {role}")


Word role mapping:
The: det
quick: adj
brown: adj
fox: nsub
jumps: verb
over: prep
the: det
lazy: adj
dog: obj


In [19]:
# Index-based view of the sentence: position -> word.
# `word_list` is another name for the same list object as `tokens`, not a copy.
word_list = tokens
word_dict = dict(enumerate(tokens))

# The bare `word_dict` expression that used to sit here was removed: only the
# last expression of a cell is displayed, so it produced no output.
print("Array (index based representation):")
for key, value in word_dict.items():
    print(key, value)

Array (index based representation):
0 The
1 quick
2 brown
3 fox
4 jumps
5 over
6 the
7 lazy
8 dog


In [24]:
import pandas as pd

# One row per token: its position, its text, and its role from `roles`
# ("unknown" when the word has no entry there).
df = pd.DataFrame(
    {
        "Index": list(word_dict),
        "Word": list(word_dict.values()),
        "Role": [roles.get(token, "unknown") for token in word_dict.values()],
    }
)
df.head()

Unnamed: 0,Index,Word,Role
0,0,The,det
1,1,quick,adj
2,2,brown,adj
3,3,fox,nsub
4,4,jumps,verb


In [25]:
import stanza

# English Stanza pipeline restricted to tokenization and POS tagging.
# It is created once here and reused through the notebook-level name `nlp`.
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize,pos",
)



  from .autonotebook import tqdm as notebook_tqdm
2025-03-24 15:41:56 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 424kB [00:00, 13.4MB/s]                    
2025-03-24 15:41:56 INFO: Downloaded file to /Users/dimtriospanagoulias/stanza_resources/resources.json
2025-03-24 15:41:57 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

2025-03-24 15:41:57 INFO: Using device: cpu
2025-03-24 15:41:57 INFO: Loading: tokenize
2025-03-24 15:41:58 INFO: Loading: mwt
2025-03-24 15:41:58 INFO: Loading: pos
2025-03-24 15:41:59 INFO: Done loading processors!


In [33]:
def stanza_pos_parser(sentence):
    """Tokenize and POS-tag ``sentence`` with the notebook-level ``nlp`` pipeline.

    Args:
        sentence: Text passed unchanged to ``nlp``.

    Returns:
        A ``(tokens, pos_tags)`` pair. ``tokens`` holds each word's ``text``
        and ``pos_tags`` holds ``(text, upos)`` tuples, both in document
        order across every sentence of the parsed document.

    Side effects:
        Prints a one-line banner. The tags themselves are only returned,
        not printed.
    """
    pos_tags = []
    for sent in nlp(sentence).sentences:
        for word in sent.words:
            pos_tags.append((word.text, word.upos))

    # The token list is simply the first element of every (text, upos) pair.
    tokens = [text for text, _ in pos_tags]

    print("Stanza pipeline tokenization And Part Of Speech Tagging")
    return tokens, pos_tags
        

In [34]:
# Sample passage for the Stanza pipeline. The triple-quoted literal begins and
# ends with a newline, and both are part of the string handed to the parser.
sentence = '''
Building upon Mistral Small 3 (2501), Mistral Small 3.1 (2503) adds state-of-the-art vision understanding and enhances long context capabilities up to 128k tokens without compromising text performance. With 24 billion parameters, this model achieves top-tier capabilities in both text and vision tasks
'''
# NOTE(review): this rebinds `sentence` and `tokens`, which an earlier cell
# already assigned for the "quick brown fox" example. Any cell that reads
# those names will see these new values if it is (re)run after this one.
tokens, pos_tags = stanza_pos_parser(sentence)

Stanza pipeline tokenization And Part Of Speech Tagging


In [35]:
# Tabulate the tagger output: each (text, upos) pair in `pos_tags` becomes one
# row, with the pair's two elements landing in the "Token" and "POS" columns.
stanza_dataframe = pd.DataFrame(pos_tags, columns=["Token", "POS"])
stanza_dataframe.head()

Unnamed: 0,Token,POS
0,Building,VERB
1,upon,ADP
2,Mistral,PROPN
3,Small,ADJ
4,3,NUM
