### Assignment-01
Jiaying Yao

### Question 1 & 2
#### A rule-based chatbot supports both Chinese and English.

In [1]:
import random
import jieba
import re

In [2]:
# general utility functions
def is_variable(pattern):
    """Check if the pattern is a single variable"""
    return pattern.startswith('?') and all(a.isalpha() for a in pattern[1:])

def is_pattern_segment(pattern):
    """Check if the pattern is a segment variable"""
    return pattern.startswith('?*') and all(a.isalpha() for a in pattern[2:]) 

def pat_to_dict(patterns):
    """Transform patterns into dictionarys.
    Args:
        patterns: List[Tuple]
    Returns:
        Dict: key is the variable, value is the corresponding segments.
    """
    return {k: ' '.join(v) if isinstance(v, list) else v for k, v in patterns}

def substitute(rule, parsed_rules):
    """Substitute the variables using `parsed_rules`.
    Args:
        rule: str, containing variables
        parsed_rules: Dict
    Returns:
        str
    """
    if not rule: return []
    return [parsed_rules.get(rule[0], rule[0])] + substitute(rule[1:], parsed_rules)

# special utility functions for Chinese
from zhon.hanzi import punctuation
def is_chinese(uchar):
    """Check if an unicode is a Chinese character"""
    if (uchar >= u'\u4e00' and uchar<=u'\u9fff') or (uchar in punctuation):
        return True
    else:
        return False
    
def tokenize_sentence(s):
    """Cut a Chinese sentence into words separated by single space."""
    s_cut = " ".join(jieba.cut(s))
    return concat_wildcard(s_cut)

def concat_wildcard(s):
    """Remove the whitespace between wildcard characters after word segmentation."""
    pattern_seg = r"\? \* "
    pattern_whole = r"\? "
    return re.sub(pattern_whole, "?", re.sub(pattern_seg, "?*", s))

#### Segment Match

In [3]:
fail = [False]
def pat_match_with_seg(pattern, saying):
    
    if not pattern:
        if not saying:
            return []
        else:
            return fail
    elif not all(is_pattern_segment(pat) for pat in pattern) and not saying:
        return fail
    
    pat = pattern[0]
    
    if is_variable(pat):
        return [(pat, saying[0])] + pat_match_with_seg(pattern[1:], saying[1:])
    elif is_pattern_segment(pat):
        match, index = segment_match(pattern, saying)
        if index == -1:
            return fail
        else:            
            return [match] + pat_match_with_seg(pattern[1:], saying[index:])
    elif pat == saying[0]:
        return pat_match_with_seg(pattern[1:], saying[1:])
    else:
        return fail # fail
    
def segment_match(pattern, saying):
    """Find the maximum segment in `saying` that matches pattern."""
    seg_pat, rest = pattern[0], pattern[1:]
    seg_pat = seg_pat.replace('?*', '?')
    
    if not rest: return (seg_pat, saying), len(saying)    
    
    for i, token in enumerate(saying):
        if rest[0] == token:
            return (seg_pat, saying[:i]), i
    
    return (), -1 # 

def is_match(rest, saying):
    if not rest and not saying:
        return True
    # what's the point of this condition? Fuzzy matching?
    if not all(a.isalpha() for a in rest[0]):
        return True
    if rest[0] != saying[0]:
        return False
    return is_match(rest[1:], saying[1:])

#### English

In [4]:
def get_response_en(saying, rules):
    """English chatbot
    Input: 
        saying: str
        rules: dict, if not specified, the response
               would always be "Thanks!"
    Output: str
    """
    for pat in rules:
        parsed_pattern = pat_match_with_seg(pat.split(), saying.split())
        if False not in parsed_pattern:
            ans_pattern = random.choice(rules[pat])
            dic = pat_to_dict(parsed_pattern)
            return " ".join(substitute(ans_pattern.split(), dic))
    else:
        return "Sorry, I don't understand what you are talking about!"

#### Chinese

In [5]:
def get_response_ch(saying, rules):
    """"Chinese chatbot
    Args: 
        saying: str
        rules: dict, if not specified, the response
               would always be "Thanks!"
    Returns: 
        str
    """
    # tokenize saying
    saying_token = tokenize_sentence(saying).split()
    for pat in rules:
        pat_token = tokenize_sentence(pat).split()
        parsed_pattern = pat_match_with_seg(pat_token, saying_token)
        if False not in parsed_pattern:
            ans_pattern = random.choice(rules[pat])
            ans_pattern_token = tokenize_sentence(ans_pattern).split()
            dic = pat_to_dict(parsed_pattern)
            return "".join(substitute(ans_pattern_token, dic))
    else:
        return "抱歉，我不明白你在说什么。"

#### Interface

In [6]:
def get_response(saying, rules={"?*x": ["Thanks!"]}, lang='ch'):
    """Interface for a chatbot supporting both English and Chinese."""
    if lang == 'ch':
        return get_response_ch(saying, rules=rules)
    elif lang == 'en':
        return get_response_en(saying, rules=rules)

#### Test cases

In [7]:
rules = {
    '?*x hello ?*y': ['How do you do', 'Please state your problem'],
    '?*x I want ?*y': ['what would it mean if you got ?y', 'Why do you want ?y', 'Suppose you got ?y soon'],
    '?*x if ?*y': ['Do you really think its likely that ?y', 'Do you wish that ?y', 'What do you think about ?y', 'Really-- if ?y'],
    '?*x no ?*y': ['why not?', 'You are being a negative', 'Are you saying \'No\' just to be negative?'],
    '?*x I was ?*y': ['Were you really', 'Perhaps I already knew you were ?y', 'Why do you tell me you were ?y now?'],
    '?*x I feel ?*y': ['Do you often feel ?y ?', 'What other feelings do you have?'],
    '?*x你好?*y': ['你好呀', '请告诉我你的问题'],
    '?*x我想?*y': ['你觉得?y有什么意义呢？', '为什么你想?y', '你可以想想你很快就可以?y了'],
    '?*x我想要?*y': ['?x想问你，你觉得?y有什么意义呢？', '为什么你想?y', '?x觉得，你可以想想你很快就可以有?y了', '你看?x像?y不', '我看你就像?y'],
    '?*x喜欢?*y': ['喜欢?y的哪里？', '?y有什么好的呢？', '你想要?y吗？'],
    '?*x讨厌?*y': ['?y怎么会那么讨厌呢？', '讨厌?y的哪里？', '?y有什么不好呢？', '你不想要?y吗？'],
    '?*xAI?*y': ['你为什么要提AI的事情？', '你为什么觉得AI要解决你的问题？'],
    '?*x机器人?*y': ['你为什么要提机器人的事情？', '你为什么觉得机器人要解决你的问题？'],
    '?*x对不起?*y': ['不用道歉', '你为什么觉得你需要道歉呢？'],
    '?*x我记得?*y': ['你经常会想起这个吗？', '除了?y你还会想起什么吗？', '你为什么和我提起?y'],
    '?*x如果?*y': ['你真的觉得?y会发生吗？', '你希望?y吗？', '真的吗？如果?y的话', '关于?y你怎么想？'],
    '?*x我?*z梦见?*y':['真的吗？ --- ?y', '你在醒着的时候，以前想象过?y吗？', '你以前梦见过?y吗'],
    '?*x妈妈?*y': ['你家里除了?y还有谁？', '嗯嗯，多说一点和你家里有关系的', '她对你影响很大吗？'],
    '?*x爸爸?*y': ['你家里除了?y还有谁？', '嗯嗯，多说一点和你家里有关系的', '他对你影响很大吗？', '每当你想起你爸爸的时候，你还会想起其他的吗？'],
    '?*x我愿意?*y': ['我可以帮你?y吗？', '你可以解释一下，为什么想?y'],
    '?*x我很难过，因为?*y': ['我听到你这么说，也很难过', '?y不应该让你这么难过的'],
    '?*x难过?*y': ['我听到你这么说，也很难过',
                 '不应该让你这么难过的，你觉得你拥有什么，就会不难过？',
                 '你觉得事情变成什么样，你就不难过了？'],
    '?*x就像?*y': ['你觉得?x和?y有什么相似性？', '?x和?y真的有关系吗？', '怎么说？'],
    '?*x和?*y都?*z': ['你觉得?z有什么问题吗？', '?z会对你有什么影响呢？'],
    '?*x和?*y一样?*z': ['你觉得?z有什么问题吗？', '?z会对你有什么影响呢？'],
    '?*x我是?*y': ['真的吗？', '?x想告诉你，或许我早就知道你是?y', '你为什么现在才告诉我你是?y'],
    '?*x我是?*y吗': ['如果你是?y会怎么样呢？', '你觉得你是?y吗', '如果你是?y，那一位着什么？'],
    '?*x你是?*y吗':  ['你为什么会对我是不是?y感兴趣？', '那你希望我是?y吗', '你要是喜欢，我就会是?y'],
    '?*x你是?*y' : ['为什么你觉得我是?y'],
    '?*x因为?*y' : ['?y是真正的原因吗？', '你觉得会有其他原因吗？'],
    '?*x我不能?*y': ['你或许现在就能?y', '如果你能?y,会怎样呢？'],
    '?*x我觉得?*y': ['你经常这样感觉吗？', '除了到这个，你还有什么其他的感觉吗？'],
    '?*x我?*y你?*z': ['其实很有可能我们互相?y'],
    '?*x你为什么不?*y': ['你自己为什么不?y', '你觉得我不会?y', '等我心情好了，我就?y'],
    '?*x好的?*y': ['好的', '你是一个很正能量的人'],
    '?*x嗯嗯?*y': ['好的', '你是一个很正能量的人'],
    '?*x不嘛?*y': ['为什么不？', '你有一点负能量', '你说 不，是想表达不想的意思吗？'],
    '?*x不要?*y': ['为什么不？', '你有一点负能量', '你说 不，是想表达不想的意思吗？'],
    '?*x有些人?*y': ['具体是哪些人呢？'],
    '?*x有的人?*y': ['具体是哪些人呢？'],
    '?*x某些人?*y': ['具体是哪些人呢？'],
    '?*x每个人?*y': ['我确定不是人人都是', '你能想到一点特殊情况吗？', '例如谁？', '你看到的其实只是一小部分人'],
    '?*x所有人?*y': ['我确定不是人人都是', '你能想到一点特殊情况吗？', '例如谁？', '你看到的其实只是一小部分人'],
    '?*x总是?*y': ['你能想到一些其他情况吗？', '例如什么时候？', '你具体是说哪一次？', '真的---总是吗？'],
    '?*x一直?*y': ['你能想到一些其他情况吗？', '例如什么时候？', '你具体是说哪一次？', '真的---总是吗？'],
    '?*x或许?*y': ['你看起来不太确定'],
    '?*x可能?*y': ['你看起来不太确定'],
    '?*x他们是?*y吗？': ['你觉得他们可能不是?y？'],
    # '?*x': ['很有趣', '请继续', '我不太确定我很理解你说的, 能稍微详细解释一下吗?']
}

In [8]:
print(get_response("你好", rules))
print(get_response("我想要娃娃", rules))
print(get_response("我是人吗", rules))
print(get_response("我和你一样吗", rules))
print(get_response("hello", rules, lang='en'))
print(get_response("I feel frustrated", rules, lang='en'))

print()
# rule not exist
print(get_response("再见", rules))
print(get_response("I need an   iPhone", rules, lang='en'))

print()
# default rule
print(get_response("excuse me?", lang='en'))

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/vx/ldqqvh1x50b66d17k1rynyn00000gn/T/jieba.cache
Loading model cost 0.591 seconds.
Prefix dict has been built succesfully.


请告诉我你的问题
想问你，你觉得娃娃有什么意义呢？
你为什么现在才告诉我你是人 吗
你觉得吗有什么问题吗？
Please state your problem
Do you often feel frustrated ?

抱歉，我不明白你在说什么。
Sorry, I don't understand what you are talking about!

Thanks!


### Question 4

1. 这样的程序有什么优点？有什么缺点？你有什么可以改进的方法吗？ 
    * Advantages：
       * Rule-based, guaranteed to give meaningful answers. 
       * Simple to prototype, don't have "cold start" problem, easy to add/delete rules.
       * Such chatbots can have buttons, carousels and other interactive functions, not restricted to text     interactions. 
    * Disadvantages:
       * Can only handle predefined quetion patterns, can't learn from interactions by itself, not flexible.
       * When the application scenario is complicated, the rules can be difficult, if not impossible, to design.
       * For this specific implementation, it's not efficient, need to search the entire rule space for pattern-matching. 
       * For the same question, it randomly chooese an answer which may not be the most appropriate one.
    * Improvements:
       * Use a syntax tree to store the rules, extract keywords from saying, speed up pattern-matching and search.
   
   
2. 什么是数据驱动？数据驱动在这个程序里如何体现？  
Data driven is a way of thinking, it refers to a process or activity that is spurred on by data, as opposed to being driven by mere intuition or personal experience. From the perspective of programming, data-driven is about writing as little fixed code as possible. Take our rule-based chatbot for example, in general, it does not follow the principle of data-driven because all the rules are predefined. But if we adjust the rules based on interactions with users, the data, then this process can be considered as data-driven.


3. 数据驱动与 AI 的关系是什么？   
First we need to know what is AI? There are a lot of different definitions of AI, in [Merriam-Webster Dictionary](https://www.merriam-webster.com/dictionary/artificial%20intelligence), AI is defined as the capability of a machine to imitate intelligent human behavior. Generally, there are two kinds of AI, model-driven AI and data-driven AI. 
Model-driven AI attempts to capture knowledge and derive decisions through explicit representation and rules. The data-driven way focusses on building a system that can identify what is the right answer based on having “seen” a large number of examples of question / answer pairs and “training” it to get to the right answer. Machine learning belongs to data-driven AI and data-driven AI has become the trend of AI due to the rise of Big Data and computional power. 

Reference: [Building AI software: Data-driven vs model-driven AI and why we need an AI-specific software development paradigm](https://hackernoon.com/building-ai-software-data-driven-vs-model-driven-ai-and-why-we-need-an-ai-specific-software-640f74aaf78f)