In [6]:
import pandas as pd

### 1a. Rules

In [67]:
input_path = "data/"
output_path = "output/grammar.txt"

In [69]:
df_grammar = pd.read_excel(f"{input_path}/sample_grammar_rules.xlsx")
df_grammar.head()

Unnamed: 0.1,Unnamed: 0
0,NP -> ART N
1,NP -> ART ADJ N
2,NP -> ART N PP
3,NP -> ART ADJ ADJ N
4,NP -> DET N


In [70]:
import os
print("Thư mục hiện tại:", os.getcwd())

Thư mục hiện tại: d:\HCMUT\241\CO3085_NLP\assignment1


In [71]:
output_directory = 'output'
os.makedirs(output_directory, exist_ok=True)

output_path = os.path.join(output_directory, 'grammar.txt')

with open(output_path, 'w', encoding='utf-8') as f:
        for index, row in df_grammar.iterrows():
            line = ', '.join(str(value) for value in row) 
            f.write(line + '\n')  

In [72]:
def convert_grammar_format(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    converted_rules = {}
    for line in lines:
        line = line.strip()
        lhs, rhs = line.split('->')
        lhs = lhs.strip()
        rhs_parts = rhs.strip().split()

        if lhs in converted_rules:
            converted_rules[lhs].append(rhs_parts)
        else:
            converted_rules[lhs] = [rhs_parts]

    with open(output_file, 'w', encoding='utf-8') as f:
        for key, value in converted_rules.items():
            f.write(f"'{key}': {value},\n")

input_file = "output/grammar.txt"  # Tên file đầu vào
output_file = "output/output.txt"  # Tên file đầu ra
convert_grammar_format(input_file, output_file)


### lexicon 

In [73]:
df_lexicon = pd.read_excel(f"data/sample_lexicon.xlsx")
df_lexicon.head()

Unnamed: 0.1,Unnamed: 0
0,ART -> the | a | an
1,ADJ -> happy | busy | interested | great | neg...
2,N -> customer | service | product | call | res...
3,V -> call | offer | want | need | say | respon...
4,P -> to | for | in | on | with | about | at


In [74]:
output_path = os.path.join(output_directory, 'lexicon.txt')

with open(output_path, 'w', encoding='utf-8') as f:
        for index, row in df_lexicon.iterrows():
            line = ', '.join(str(value) for value in row) 
            f.write(line + '\n')  

In [75]:
def convert_lexicon_format(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        rules = f.readlines()
    
    converted_lexicons = {}
    for rule in rules:
        lhs, rhs = rule.split('->')
        lhs = lhs.strip()
        rhs = rhs.strip()

        rhs_dict = [item.strip() for item in rhs.split('|')]
        
        if lhs not in converted_lexicons:
                    converted_lexicons[lhs] = []
        
        # Bao bọc mỗi item trong một danh sách
        for item in rhs_dict:
            item = [item]  # Bao bọc item trong một danh sách
            converted_lexicons[lhs].append(item)
        
        
    
    with open(output_file, 'a', encoding='utf-8') as f:
        for key, value in converted_lexicons.items():
            f.write(f"'{key}': {value},\n")


input_file = "output/lexicon.txt"  # Tên file đầu vào
output_file = "output/output.txt"  # Tên file đầu ra
convert_lexicon_format(input_file, output_file)

In [58]:
import random

# Định nghĩa ngữ pháp bằng từ điển
grammar = {
    'S': [['NP', 'VP']],
    'NP': [['Det', 'N'], ['Det', 'N', 'PP']],
    'VP': [['V', 'NP'], ['VP', 'PP']],
    'PP': [['P', 'NP']],
    'Det': [['the'], ['a']],
    'N': [['dog'], ['cat'], ['treat']],
    'V': [['chased'], ['saw'], ['liked']],
    'P': [['in'], ['on'], ['by']],
}

# Hàm sinh câu từ ngữ pháp
def generate_sentence(symbol):
    if symbol not in grammar:
        return symbol  # Nếu là ký hiệu terminal, trả về ngay
    
    production = random.choice(grammar[symbol])  # Chọn một quy tắc sản xuất ngẫu nhiên
    result = []
    for part in production:
        result.append(generate_sentence(part))  # Đệ quy cho mỗi phần
    return ' '.join(result)  # Kết hợp các phần lại thành một câu

# Hàm phân tích cú pháp
def parse_sentence(sentence):
    words = sentence.split()
    return parse_helper(words, 'S')

def parse_helper(words, symbol):
    if not words:
        return False
    
    if symbol not in grammar:
        if words[0] == symbol:
            return words[1:]  # Trả về danh sách từ còn lại
        else:
            return False

    for production in grammar[symbol]:
        remaining_words = words
        for part in production:
            remaining_words = parse_helper(remaining_words, part)
            if remaining_words is False:
                break
        if remaining_words is not False:
            return remaining_words  # Trả về danh sách từ còn lại

    return False

# Ví dụ sử dụng
print("Sinh câu:")
for _ in range(5):
    print(generate_sentence('S'))

print("\nPhân tích cú pháp câu:")
sentence = "the dog chased a cat"
if parse_sentence(sentence):
    print(f"Câu '{sentence}' được phân tích cú pháp thành công.")
else:
    print(f"Câu '{sentence}' không hợp lệ.")


Sinh câu:
the cat liked the treat in the dog by the treat in a cat on a cat on a dog in a cat on the treat in a cat in a treat by the cat on a treat on a cat in the cat on the cat in a cat in the dog by a dog in a cat in a dog in a dog on a dog by a cat on the treat on a cat by the cat in a cat on a cat on the dog in the cat in a treat
a treat saw the cat in the cat
a cat saw a treat
the dog in the treat saw a cat
a cat chased a dog

Phân tích cú pháp câu:
Câu 'the dog chased a cat' không hợp lệ.


### 1b. Sentences generation algorithms

In [76]:
import random
import ast

In [77]:
grammar_rule = {}

In [78]:
with open("output/output.txt", 'r', encoding='utf-8') as file:
    for line in file:
        key, value = line.strip().split(': ')
        key = key.strip("'")  
        value = ast.literal_eval(value.strip(','))
        
        grammar_rule[key] = value

print(grammar_rule)

{'NP': [['ART', 'N'], ['ART', 'ADJ', 'N'], ['ART', 'N', 'PP'], ['ART', 'ADJ', 'ADJ', 'N'], ['DET', 'N'], ['PRON'], ['QUANT', 'N'], ['QUANT', 'ADJ', 'N'], ['NP', 'P', 'NP'], ['NP', 'CONJ', 'NP'], ['NP', 'PP'], ['PRON', 'P', 'NP'], ['DET', 'ADJ', 'N', 'PP'], ['DET', 'N', 'POS', 'N'], ['PRON', 'POS', 'N'], ['NUM', 'N'], ['NP', 'REL', 'S'], ['ART', 'ADV', 'ADJ', 'N'], ['ADV', 'ADJ', 'N'], ['DET', 'ADV', 'ADJ', 'N']], 'VP': [['V'], ['V', 'NP'], ['V', 'PP'], ['V', 'ADV'], ['MOD', 'V'], ['AUX', 'NEG', 'V'], ['V', 'CONJ', 'V'], ['V', 'S'], ['ADV', 'VP']], 'ADJP': [['ADJ'], ['ADV', 'ADJ'], ['ADJ', 'CONJ', 'ADJ'], ['ADJ', 'PP']], 'ADV': [['quickly'], ['eagerly'], ['positively'], ['negatively'], ['always'], ['never']], 'PP': [['P', 'NP'], ['P', 'PRON'], ['P', 'ADJP'], ['PP', 'CONJ', 'PP'], ['P', 'ADV']], 'S': [['NP', 'VP'], ['NP', 'VP', 'PP'], ['NP', 'VP', 'ADJP'], ['NP', 'VP', 'CONJ', 'S'], ['S', 'CONJ', 'S'], ['S', 'ADJP']], 'QP': [['Q', 'AUX', 'NP'], ['Q', 'V', 'NP']], 'ART': [['the'], ['a'], 

In [80]:
def generate_sentence(symbol):
    if symbol not in grammar_rule:
        return symbol
    
    result = []
    prods = random.choice(grammar_rule[symbol])
    for prod in prods:
        result.append(generate_sentence(prod))
    return ' '.join(result)

In [88]:
output_directory = 'output'
os.makedirs(output_directory, exist_ok=True)

output_path = os.path.join(output_directory, 'samples.txt')
with open("output/sample.txt", 'w', encoding='utf-8') as file:
    for _ in range(10000):
        sentence = generate_sentence('S')
        file.write(f"{sentence}\n")

