In [None]:

import json
import spacy
import nltk
import string

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from typing import List
from string import punctuation
import matplotlib.pylab as plt
from scipy.stats import wilcoxon
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize


# Text Processing

## TF-IDF

1. Test the script tfidf_demo.ipynb in a Jupyter notebook and make sure it works.

2. Replace the movie review data "texts" in the script file with your own defined document and test it.

3.  Given the below documents:  
texts = [
    "good movie", "not a good movie", "did not like",
    "i like it", "good one"
]

Given the definition of TF and IDF, what is the sum of TF-IDF values for 1-grams in "good movie" text? Enter a math expression as an answer.

In [None]:

def gen_tfidf(texts, min_df=1, max_df=1.0, ngram_range=(1, 1)):
    """Fit a TfidfVectorizer on `texts` and return the weights as a DataFrame.

    :param texts: a list of strings, one per document
    :param min_df: forwarded to TfidfVectorizer; an int is an absolute document
        count, a float is a proportion of documents
    :param max_df: forwarded to TfidfVectorizer with the same int/float meaning.
        The default is the float 1.0 ("no upper limit"); the integer 1 would
        discard every term that occurs in more than one document.
    :param ngram_range: (min_n, max_n) n-gram sizes to extract as features

    :return pandas.DataFrame: one row per document, one column per retained term
    """

    # using default tokenizer in TfidfVectorizer
    tfidf = TfidfVectorizer(min_df=min_df, max_df=max_df, ngram_range=ngram_range)

    features = tfidf.fit_transform(texts)

    # toarray() yields a plain ndarray (todense() returns the legacy np.matrix type)
    df = pd.DataFrame(
        features.toarray(),
        columns=tfidf.get_feature_names_out(),
    )
    return df


In [None]:

# Toy corpus from the exercise statement above.
texts = [
    "good movie", "not a good movie", "did not like",
    "i like it", "good one"
]

# ngram_range=(1, 2) asks the vectorizer for both single words and word pairs.
df = gen_tfidf(texts, min_df=2, max_df=0.5, ngram_range=(1, 2))

# NOTE: "good movie" is a two-word (bigram) feature column, even though the
# message printed below labels the column total as a "1-gram sum".
k = "good movie"
print(f"\n1-gram sum('{k}'): {df[k].sum():.4}\n")


df



1-gram sum('good movie'): 1.284



Unnamed: 0,good movie,like,movie,not
0,0.707107,0.0,0.707107,0.0
1,0.57735,0.0,0.57735,0.57735
2,0.0,0.707107,0.0,0.707107
3,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0


In [None]:

# Self-defined corpus (step 2 of the exercise), run through the same settings
# as the movie-review example. Note that this rebinds the global `df`.
test = [
    'a great film', 'great cast', 'a pleasure to watch',
    'not good', 'hard to watch', 'boring film'
]

df = gen_tfidf(test, min_df=2, max_df=0.5, ngram_range=(1, 2))
df


Unnamed: 0,film,great,to,to watch,watch
0,0.707107,0.707107,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.57735,0.57735,0.57735
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.57735,0.57735,0.57735
5,1.0,0.0,0.0,0.0,0.0


To calculate the sum of TF-IDF values for the term "good movie" (treated as a single feature) across the corpus:  
t = term, d = set of n docs. containing t, D = set of N total documents  

For a term t_i in T, for a given document d_j in corpus D:   
t_i = "good movie", n = 2 (documents containing t_i: d_0 and d_1), N = 5 (total documents)   
  
TFIDF(t_i,d_j,D) = TF(t_i,d_j)*IDF(t_i,D)  

TF(t_i,d_j) = 1-gram frequency of t_i in d_j, ex.:    
TF(t_i,d_j) =  n(t) / n(d_j)  

IDF(t_i,D) = log(N / n), where n = |{d_j in D : t_i in d_j}| is the number of documents containing t_i  

If D = [["good movie", "not a good movie", "did not like",
"i like it", "good one"]],   
D = [[d_0, ... ,d_j]] = [[d_0, d_1, d_2, d_3, d_4]]   
  
For a set of terms T = ["good movie", "like", "movie", "not"],  
if i = 0 and t_i = "good movie",  
  
Using the l2 norm, for a given document d_j (summing over every term t_k in T):  
$$
\|TFIDF(\cdot,d_{j},D)\|_2 = \sqrt{\sum_{t_k \in T} TFIDF(t_{k},d_{j},D)^2}
$$
The normalized TFIDF' for the document for a given term, t:  
$$
TFIDF'(t_{i},d_{j},D) = \frac{TFIDF(t_{i},d_{j},D)}{\|TFIDF(\cdot,d_{j},D)\|_2}
$$
  
Since term t does not appear in documents 2,3, or 4:
$$
TFIDF(d_{j} | j = 2,3,4) = 0
$$
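The sum of normalized TFIDF values for "good movie" over all documents (the column total printed above, 0.707107 + 0.57735 = 1.284):  
$$
\sum_{j=0}^{4} TFIDF'(t_{i},d_{j},D) = \frac{TFIDF(t_{i},d_{0},D)}{\|TFIDF(\cdot,d_{0},D)\|_2} + \frac{TFIDF(t_{i},d_{1},D)}{\|TFIDF(\cdot,d_{1},D)\|_2}
$$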

If we wanted to find the TF-IDF values of the individual 1-grams "good" and "movie" instead, we can use the same formulas with a new set of terms in which each term is only one word long, i.e. with the default ngram_range=(1, 1) of gen_tfidf (min_df and max_df are still set explicitly below):

In [None]:

# Same toy corpus as before, this time restricted to single-word features
# (ngram_range is left at gen_tfidf's default of (1, 1)).
texts = [
    "good movie", "not a good movie", "did not like",
    "i like it", "good one"
]

# max_df=3 is an integer, so scikit-learn treats it as an absolute document
# count (keep terms found in at most 3 documents), not as a proportion.
df = gen_tfidf(texts, min_df=2, max_df=3)



# Column totals: the summed TF-IDF weight of each word over all five documents.
print(f"\n1-gram sum(good) = {df['good'].sum():.4}")
print(f"\n1-gram sum(movie) = {df['movie'].sum():.4}\n")

df



1-gram sum(good) = 2.145

1-gram sum(movie) = 1.379



Unnamed: 0,good,like,movie,not
0,0.638711,0.0,0.769447,0.0
1,0.506204,0.0,0.609818,0.609818
2,0.0,0.707107,0.0,0.707107
3,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0


## Tokenizer

In [None]:

def tokenize(text):
    """Count the cleaned, lower-cased tokens of a piece of text.

    Processing steps:
    * split `text` on any run of whitespace (spaces, tabs, new lines)
    * strip punctuation from both ends of every piece (`string.punctuation`
      plus the curly single/double quotes); punctuation inside a token is
      kept, e.g. `world!!! -> world` while `it's` is unchanged
    * discard pieces that are shorter than 2 characters after stripping
    * lower-case what remains and tally the occurrences

    :param str text: sentence (corpus)

    :return dict: token -> number of occurrences (stored as float)
    """

    strip_chars = punctuation + '\u201c\u201d\u2018\u2019'

    counts = {}
    for piece in text.split():
        word = piece.strip(strip_chars)

        # length is checked on the stripped token, before lower-casing
        if len(word) <= 1:
            continue

        word = word.lower()
        counts[word] = counts.get(word, 0.0) + 1.0

    return counts

def tokenize_a_doc(doc, nlp, lemmatized=True, remove_stopword=True, remove_punct=True):
    """Turn one document into a flat list of lower-cased tokens.

    :param str doc: raw document text
    :param nlp: callable that maps a string to an iterable of tokens exposing
        `text`, `lemma_`, `is_stop` and `is_punct` (e.g. a spaCy pipeline)
    :param bool lemmatized: emit `token.lemma_` when True, `token.text` otherwise
    :param bool remove_stopword: drop tokens whose `is_stop` flag is set
    :param bool remove_punct: drop tokens whose `is_punct` flag is set

    :return list[str]: the surviving tokens, lower-cased, in document order
    """
    # which token attribute supplies the surface form
    form_attr = "lemma_" if lemmatized else "text"

    kept = []
    # NOTE(review): the separator is the two-character sequence backslash + "n",
    # not a real newline character -- presumably the data stores line breaks in
    # escaped form; confirm against the input files.
    for segment in doc.split("\\n"):
        for token in nlp(segment):
            if remove_stopword and token.is_stop:
                continue
            if remove_punct and token.is_punct:
                continue

            form = getattr(token, form_attr)
            # whitespace-only tokens are always discarded
            if form.isspace():
                continue

            kept.append(form.lower())

    return kept

def tokenize_spacy(docs, lemmatized=True, remove_stopword=True, remove_punct=True, nlp=None):
    """Tokenize every document in `docs` with spaCy.

       - `docs`: a list of documents (e.g. questions)
       - `lemmatized`: an optional boolean parameter to indicate if tokens are lemmatized. The default value is True (i.e. tokens are lemmatized).
       - `remove_stopword`: an optional boolean parameter to remove stop words. The default value is True (i.e. remove stop words).
       - `remove_punct`: an optional boolean parameter to remove punctuation tokens. The default value is True (i.e. remove punctuation).
       - `nlp`: an optional, already-loaded pipeline to use. When omitted, the
         "en_core_web_sm" model is loaded on first use and reused afterwards.

   - Split each input document into unigrams and also clean up tokens as follows:
       - if `lemmatized` is turned on, lemmatize all unigrams.
       - if `remove_stopword` is set to True, remove all stop words.
       - if `remove_punct` is set to True, remove all punctuation tokens.
       - remove all whitespace-only tokens and lowercase all the tokens.
   - Return the list of tokens obtained for each document after all the processing.

    (For reference on the token attributes used, check https://spacy.io/api/token#attributes)
    """

    if nlp is None:
        # Loading the model is slow and the notebook calls this function many
        # times, so the default pipeline is built once and cached on the function.
        nlp = getattr(tokenize_spacy, "_default_nlp", None)
        if nlp is None:
            # disable unused pipelines to reduce processing time/memory space
            nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
            nlp.add_pipe("sentencizer")
            tokenize_spacy._default_nlp = nlp

    # tokenize each doc in the corpus using specified params for lemmatization and removal conditions
    tokens = [tokenize_a_doc(doc, nlp, lemmatized, remove_stopword, remove_punct) for doc in docs]

    return tokens



In [None]:

# Smoke test for tokenize(): mixed case, trailing punctuation, a one-letter
# word ("a") and a line break with leading indentation.
text = """it's a Hello World!!!
           it is Hello World again."""

tokenize(text)


{"it's": 1.0, 'hello': 2.0, 'world': 2.0, 'it': 1.0, 'is': 1.0, 'again': 1.0}

In [None]:

# Load the question / ChatGPT answer / human answer table used by the rest of the notebook.
data = pd.read_csv("qa.csv")
data.head()
# For simplicity, we test on a single document (the first question)

print(data["question"].iloc[0] + "\n")

# The same question is tokenized under four flag combinations; iloc[0:1]
# keeps the input a one-element Series, so each result is a list holding one token list.
print(f"1.lemmatized=True, remove_stopword=False, remove_punct = True:\n \
{tokenize_spacy(data['question'].iloc[0:1], lemmatized=True, remove_stopword=False, remove_punct = True)}\n")

print(f"2.lemmatized=True, remove_stopword=True, remove_punct = True:\n \
{tokenize_spacy(data['question'].iloc[0:1], lemmatized=True, remove_stopword=True, remove_punct = True)}\n")

print(f"3.lemmatized=False, remove_stopword=False, remove_punct = True:\n \
{tokenize_spacy(data['question'].iloc[0:1], lemmatized=False, remove_stopword=False, remove_punct = True)}\n")

print(f"4.lemmatized=False, remove_stopword=False, remove_punct = False:\n \
{tokenize_spacy(data['question'].iloc[0:1], lemmatized=False, remove_stopword=False, remove_punct = False)}\n")



Unnamed: 0,question,chatgpt_answer,human_answer
0,What happens if a parking ticket is lost / des...,If a parking ticket is lost or destroyed befor...,In my city you also get something by mail to t...
1,"why the waves do n't interfere ? first , I 'm ...",Interference is the phenomenon that occurs whe...,They do actually . That 's why a microwave ove...
2,Is it possible to influence a company's action...,"Yes, it is possible to influence a company's a...",Yes and no. This really should be taught at ju...
3,Why do taxpayers front the bill for sports sta...,Sports stadiums are usually built with public ...,That 's the bargaining chip that team owners u...
4,Why do clothing stores generally have a ton of...,There are a few reasons why clothing stores ma...,Your observation is almost certainly a matter ...


What happens if a parking ticket is lost / destroyed before the owner is aware of the ticket , and it goes unpaid ? I 've always been curious . Please explain like I'm five.

1.lemmatized=True, remove_stopword=False, remove_punct = True:
 [['what', 'happen', 'if', 'a', 'parking', 'ticket', 'be', 'lose', 'destroy', 'before', 'the', 'owner', 'be', 'aware', 'of', 'the', 'ticket', 'and', 'it', 'go', 'unpaid', 'i', 've', 'always', 'be', 'curious', 'please', 'explain', 'like', 'i', 'be', 'five']]

2.lemmatized=True, remove_stopword=True, remove_punct = True:
 [['happen', 'parking', 'ticket', 'lose', 'destroy', 'owner', 'aware', 'ticket', 'go', 'unpaid', 've', 'curious', 'explain', 'like']]

3.lemmatized=False, remove_stopword=False, remove_punct = True:
 [['what', 'happens', 'if', 'a', 'parking', 'ticket', 'is', 'lost', 'destroyed', 'before', 'the', 'owner', 'is', 'aware', 'of', 'the', 'ticket', 'and', 'it', 'goes', 'unpaid', 'i', 've', 'always', 'been', 'curious', 'please', 'explain', 'like

## Determine Sentiment

In [None]:

def sent(target, pos, neg):
    """Score the polarity of a token sequence against two word lists.

    :param target: tokens of one document
    :param pos: container of positive words (anything supporting `in`)
    :param neg: container of negative words (anything supporting `in`)

    :return: (p - n) / (p + n), where p and n are the numbers of tokens found
        in `pos` and `neg`; 0 when no token matches either list
    """
    hits_pos = len([word for word in target if word in pos])
    hits_neg = len([word for word in target if word in neg])
    matched = hits_pos + hits_neg

    # no lexicon word at all -> neutral
    if matched == 0:
        return 0
    return (hits_pos - hits_neg) / matched

def compute_sentiment(gen_tokens, ref_tokens, pos, neg):
    """Score paired documents with `sent` and test whether the first set is more positive.

    :param gen_tokens: list of token lists, one per generated answer
    :param ref_tokens: list of token lists, one per reference answer
        (paired by position with `gen_tokens`)
    :param pos: iterable of positive words
    :param neg: iterable of negative words

    Prints the mean of the per-pair difference (gen - ref) -- labelled
    "Average Sentiment" in the output -- and the statistic / p-value of a
    one-sided Wilcoxon signed-rank test (alternative='greater') on those
    differences.

    :return pandas.DataFrame: columns `gen_sentiment` and `ref_sentiment`
    """
    # The lexicons may arrive as NumPy arrays, where every `in` test scans the
    # whole array; converting once to sets makes each lookup constant-time.
    pos_set, neg_set = set(pos), set(neg)

    result = pd.DataFrame({
        'gen_sentiment': [sent(tokens, pos_set, neg_set) for tokens in gen_tokens],
        'ref_sentiment': [sent(tokens, pos_set, neg_set) for tokens in ref_tokens],
    })

    # per-pair difference, shared by the mean and the significance test
    diff = result['gen_sentiment'] - result['ref_sentiment']
    avg = diff.mean()

    res = wilcoxon(diff, alternative='greater')

    print(f"Average Sentiment: {avg}\n")
    print(f"Stat: {res.statistic}\nP-Value: {res.pvalue}\n")

    return result


In [None]:

# Load the sentiment lexicons. The files are read without a header row, so the
# words end up in column 0; `pos` and `neg` are the NumPy arrays of those words.
pos_words = pd.read_csv("positive-words.txt", header = None)
pos_words.head()
pos = pos_words[0].values

neg_words = pd.read_csv("negative-words.txt", header = None)
neg_words.head()
neg = neg_words[0].values


Unnamed: 0,0
0,a+
1,abound
2,abounds
3,abundance
4,abundant


Unnamed: 0,0
0,2-faced
1,2-faces
2,abnormal
3,abolish
4,abominable


In [None]:

# (lemmatized, remove_stopword, remove_punct) settings to compare
combos = [(True, False, False),
            (True, True, False),
            (True, False, True),
            (True, True, True),
            (False, False, True),
            (False, False, False)
        ]

for lemmatized, stopword, punct in combos:
    # gen = ChatGPT answers, ref = human answers
    gen_tokens = tokenize_spacy(data["chatgpt_answer"], lemmatized=lemmatized, remove_stopword=stopword, remove_punct=punct)
    ref_tokens = tokenize_spacy(data["human_answer"], lemmatized=lemmatized, remove_stopword=stopword, remove_punct=punct)

    print(f"lemmatized={lemmatized}, remove_stopword={stopword}, remove_punct={punct}")

    result = compute_sentiment(gen_tokens, ref_tokens, pos, neg)
    # A bare expression inside a loop body is not rendered by Jupyter, so the
    # summary table is shown explicitly (display is provided by the IPython kernel).
    display(result.describe())


lemmatized=True, remove_stopword=False, remove_punct=False
Average Sentiment: 0.13475198986800146

Stat: 10574.5
P-Value: 0.002158648693607304



Unnamed: 0,gen_sentiment,ref_sentiment
count,200.0,200.0
mean,0.187587,0.052835
std,0.587857,0.566326
min,-1.0,-1.0
25%,-0.272727,-0.297619
50%,0.142857,0.0
75%,0.666667,0.333333
max,1.0,1.0


lemmatized=True, remove_stopword=True, remove_punct=False
Average Sentiment: 0.14737597272496075

Stat: 10677.0
P-Value: 0.0008208850064502941



Unnamed: 0,gen_sentiment,ref_sentiment
count,200.0,200.0
mean,0.165322,0.017947
std,0.584721,0.567027
min,-1.0,-1.0
25%,-0.287815,-0.333333
50%,0.107955,0.0
75%,0.666667,0.333333
max,1.0,1.0


lemmatized=True, remove_stopword=False, remove_punct=True
Average Sentiment: 0.13475198986800146

Stat: 10574.5
P-Value: 0.002158648693607304



Unnamed: 0,gen_sentiment,ref_sentiment
count,200.0,200.0
mean,0.187587,0.052835
std,0.587857,0.566326
min,-1.0,-1.0
25%,-0.272727,-0.297619
50%,0.142857,0.0
75%,0.666667,0.333333
max,1.0,1.0


lemmatized=True, remove_stopword=True, remove_punct=True
Average Sentiment: 0.14737597272496075

Stat: 10677.0
P-Value: 0.0008208850064502941



Unnamed: 0,gen_sentiment,ref_sentiment
count,200.0,200.0
mean,0.165322,0.017947
std,0.584721,0.567027
min,-1.0,-1.0
25%,-0.287815,-0.333333
50%,0.107955,0.0
75%,0.666667,0.333333
max,1.0,1.0


lemmatized=False, remove_stopword=False, remove_punct=True
Average Sentiment: 0.1462586239453829

Stat: 10279.5
P-Value: 0.0011456573663914912



Unnamed: 0,gen_sentiment,ref_sentiment
count,200.0,200.0
mean,0.232919,0.08666
std,0.593347,0.563039
min,-1.0,-1.0
25%,-0.255682,-0.253289
50%,0.261364,0.0
75%,0.753676,0.446429
max,1.0,1.0


lemmatized=False, remove_stopword=False, remove_punct=False
Average Sentiment: 0.1462586239453829

Stat: 10279.5
P-Value: 0.0011456573663914912



Unnamed: 0,gen_sentiment,ref_sentiment
count,200.0,200.0
mean,0.232919,0.08666
std,0.593347,0.563039
min,-1.0,-1.0
25%,-0.255682,-0.253289
50%,0.261364,0.0
75%,0.753676,0.446429
max,1.0,1.0


#### In general, which tokenization configuration should be used? Why does this combination make the most sense?
The configuration that had the best results was tokenize_spacy(data, lemmatized=True, remove_stopword=True, remove_punct=True). This combination makes the most sense because lemmatization helps normalize the data by reducing the number of unique words in the dataset while still preserving the semantic meaning of a given document in the corpus. Removing stop words and punctuation also improves tokenization results, since these tokens don't add to the semantic meaning of the text. Stop words and punctuation also tend to have a high frequency within a text, so removing this noise makes it much easier to extract the desired text characteristics.

#### Do you think, overall, ChatGPT-generated answers are more positive or negative than human-generated ones? Use data to support your conclusion.

As seen in the statistics from result.describe() (gen_sentiment = ChatGPT, ref_sentiment = human), ChatGPT answers have the higher mean sentiment: with lemmatized=False, the average ChatGPT answer (0.233) is roughly 2.7x more positive than the average human answer (0.087). The median human answer has a sentiment score of 0, while the median ChatGPT answer scores 0.26. The two groups have similar scores at the 25th percentile (about -0.25), but at the 75th percentile ChatGPT answers score 0.75 versus 0.45 for human answers. The one-sided Wilcoxon signed-rank test on the paired differences (ChatGPT minus human, alternative='greater') gives p ≈ 0.001, and the other tokenization configurations show the same pattern, so overall the ChatGPT-generated answers are significantly more positive than the human-generated ones.

## Generate Vocabulary

In [None]:

def generate_vocab(sents: List[str]):
    """Build the corpus-wide vocabulary from a collection of sentences.

    Each sentence is passed through `tokenize`, the per-sentence counts are
    added up, and the merged counts are ordered from most to least frequent
    (words with equal counts keep their first-seen order).

    :param list[str] sents: list of sentences (corpus)

    :return dict: word -> total count over the corpus, in descending count order
    """

    totals = {}

    # merge every sentence's count dictionary into the running totals
    for sentence in sents:
        for word, count in tokenize(sentence).items():
            totals[word] = totals.get(word, 0) + count

    # rank the (word, count) pairs by count, highest first, and rebuild the dict
    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)

    return dict(ranked)


In [None]:
# A test document. This document can be found at https://hbr.org/2022/04/the-power-of-natural-language-processing
sents = pd.read_csv("/content/sents.csv")
sents.head()

# Pass the sentence column, not the DataFrame itself: iterating a DataFrame
# yields its column labels, so the vocabulary would be built from the header.
generate_vocab(sents["text"])


## Document term matrix (DTM)

In [None]:
def get_dtm(sents):
    """Build a document-term count matrix for a collection of sentences.

    - accepts an iterable of sentences, i.e., `sents`, as an input
    - calls `tokenize` to get the count dictionary of each sentence
    - calls `generate_vocab` to get the vocabulary of all sentences; its keys
      (ordered by descending corpus count) define the columns
    - creates a numpy array `dtm` with shape (# of docs x # of unique words),
      initialised to 0
    - fills cell `dtm[i,j]` with the count of the `j`th word in the `i`th sentence
    - returns `dtm` and the list of unique words
    """

    # materialise once so a one-shot iterable can safely be traversed twice
    sentences = list(sents)

    all_docs = [tokenize(s) for s in sentences]

    all_words = list(generate_vocab(sentences).keys())

    # word -> column index, so each sentence only touches the words it contains
    # instead of probing every vocabulary word for every sentence
    column_of = {word: j for j, word in enumerate(all_words)}

    dtm = np.zeros((len(all_docs), len(all_words)))

    for i, counts in enumerate(all_docs):
        for word, count in counts.items():
            dtm[i, column_of[word]] = count

    return dtm, all_words

def analyze_dtm(dtm, words, sents):
    r"""Compute df, tf and tf-idf from a document-term matrix and print a summary.

    * `dtm` is an array of word counts with shape $(m \times n)$ (sentences x words),
      `words` holds the $n$ words matching the columns of `dtm`, and `sents`
      holds the $m$ original sentences matching its rows.
    * `df[j]` is the sentence frequency of word $j$: how many sentences contain it.
    * `tf[i, j]` is the count `dtm[i, j]` divided by the total number of words
      in sentence $i$. A sentence with no words keeps an all-zero row.
    * `tf_idf[i, j] = tf[i, j] / df[j]`: a word that occurs in most sentences has
      little discriminative power, so dividing by `df` downgrades its weight.
    * prints out the following:

        - the total number of words in the document represented by `dtm`
        - the 10 most frequent words and their counts
        - the 10 words with the largest `df` values (words and values)
        - the longest sentence (i.e., the one with the most words)
        - the 10 words with the largest `tf_idf` values in the longest sentence

    * returns the `tf_idf` array, shape $(m, n)$.

    No Python loops are used; everything is done with array functions and broadcasting.
    """

    dtm = np.asarray(dtm)
    # fancy indexing below needs an array; plain lists are accepted too
    words = np.asarray(words)

    n_top = 10

    df = np.count_nonzero(dtm, axis=0)

    sent_len = dtm.sum(axis=1)
    total = sent_len[:, np.newaxis]
    # where= skips rows whose total is 0, leaving the zeros from `out` instead of NaN
    tf = np.divide(dtm, total, out=np.zeros(dtm.shape, dtype=float), where=total != 0)

    df_row = df[np.newaxis, :]
    # likewise guard columns of words that occur in no sentence
    tfidf = np.divide(tf, df_row, out=np.zeros(dtm.shape, dtype=float), where=df_row != 0)

    words_freq = dtm.sum(axis=0)
    words_most = words_freq.argsort()[::-1][:n_top]
    top_words = list(zip(words[words_most], words_freq[words_most]))

    hi_df = df.argsort()[::-1][:n_top]
    top_df = list(zip(words[hi_df], df[hi_df]))

    longest = sent_len.argmax()
    # rows of dtm are positional, so look the sentence up by position even
    # when `sents` is a pandas object with a non-default index
    longest_sentence = sents.iloc[longest] if hasattr(sents, "iloc") else sents[longest]

    longest_tfidf = tfidf[longest].argsort()[::-1][:n_top]
    top_tfidf = list(zip(words[longest_tfidf], tfidf[longest][longest_tfidf]))

    print(f'The total number of words:\n{dtm.sum()}\n')
    print(f'The top 10 frequent words:\n{top_words}\n')
    print(f'The top 10 words with highest df values:\n{top_df}\n')
    print(f'The longest sentence:\n{longest_sentence}\n')
    print(f'The top 10 words with highest tf-idf values in the longest sentence:\n{top_tfidf}')

    return tfidf


In [None]:

# Build the document-term matrix from the sentence column.
dtm, all_words = get_dtm(sents.text)

# spot-check one sentence (a fixed row, not a random one)
idx = 3

# get the dictionary using the tokenizer
vocab = tokenize(sents["text"].loc[idx])

# get all non-zero entries in dtm[idx] and create a dictionary
vocab_dtm ={all_words[j]: dtm[idx][j] for j in np.where(dtm[idx]>0)[0]}

# sort both by word so the two (word, count) lists are comparable
a = sorted(vocab.items(), key = lambda item: item[0])
b = sorted(vocab_dtm.items(), key = lambda item: item[0])
# The dtm row must reproduce exactly the counts that tokenize() returns
assert a == b, "Dicts don't match!"

sents.loc[idx]

print(a)
print(b)

# analyze dtm array (analyze_dtm indexes `words` with arrays, hence np.array)
analyze_dtm(dtm, np.array(all_words), sents.text)


## Performance Evaluation (Precision and Recall)

In [None]:

def evaluate_performance(prob, truth, th):
    """Compute precision and recall of thresholded probabilities.

    - Given a threshold `th`, a probability > `th` is a positive prediction (1);
      otherwise the prediction is negative (0)
    - Predictions are compared with the ground truth labels to count:
        * True Positives (TP): the number of correct positive predictions
        * False Positives (FP): the number of positive predictions which actually are negatives
        * False Negatives (FN): the number of negative predictions which actually are positives
      (True Negatives are not needed for either metric.)
    - **precision** is TP/(TP+FP) and **recall** is TP/(TP+FN); a metric whose
      denominator is 0 (nothing predicted positive, or no true positives and
      no false negatives) is reported as 0.0 instead of raising ZeroDivisionError

    :param prob: iterable of predicted probabilities
    :param truth: iterable of ground-truth labels (0 or 1), paired with `prob`
    :param th: decision threshold

    :return dict: {"R0": precision, "R1": recall}
    """
    tp = fp = fn = 0

    for p, t in zip(prob, truth):
        guess = 1 if p > th else 0

        if guess == t:
            # correct prediction: only the positive ones matter here
            if guess == 1:
                tp += 1
        elif guess == 1:
            fp += 1
        else:
            fn += 1

    prec = tp / (tp + fp) if (tp + fp) else 0.0
    rec = tp / (tp + fn) if (tp + fn) else 0.0

    performance = {
        "R0": prec,
        "R1": rec,
    }

    return performance

def bigram_precision_recall(gen_tokens, ref_tokens):
    """Compare generated and reference documents by their shared bigrams.

    For each (generated, reference) pair, taken by position:
    - `overlapping`: the distinct bigrams that occur in both documents
    - `precision`: matched bigrams / number of bigrams in the generated document
    - `recall`: matched bigrams / number of bigrams in the reference document

    A bigram that is repeated counts as matched at most as often as it occurs
    in the other document (clipped counts), which keeps both metrics within
    [0, 1]. A document with no bigrams scores 0.0.

    :param gen_tokens: list of token lists for the generated documents
    :param ref_tokens: list of token lists for the reference documents

    :return pandas.DataFrame: columns `overlapping`, `precision`, `recall`,
        one row per document pair
    """
    from collections import Counter  # local import: only needed here

    overlapping = []
    precision = []
    recall = []

    for gen_doc, ref_doc in zip(gen_tokens, ref_tokens):
        gen_doc, ref_doc = list(gen_doc), list(ref_doc)

        # consecutive token pairs
        gen = list(zip(gen_doc, gen_doc[1:]))
        ref = list(zip(ref_doc, ref_doc[1:]))

        # Counter intersection keeps min(count in gen, count in ref) per bigram
        shared = Counter(gen) & Counter(ref)
        matched = sum(shared.values())

        overlapping.append(list(shared))
        precision.append(matched / len(gen) if gen else 0.0)
        recall.append(matched / len(ref) if ref else 0.0)

    result = pd.DataFrame(
        {'overlapping': overlapping, 'precision': precision, 'recall': recall},
        columns=['overlapping', 'precision', 'recall'],
    )

    return result



In [None]:

# NOTE: gen_tokens / ref_tokens are not created in this cell; they are whatever
# the most recently executed tokenization loop left in the kernel (its last
# combination when the notebook is run top to bottom).
result = bigram_precision_recall(gen_tokens,
                                 ref_tokens)
result.head()

# average precision and recall over all document pairs
result[["precision", "recall"]].mean(axis = 0)


In [None]:

# (lemmatized, remove_stopword, remove_punct) settings to compare
combos = [(True, False, False),
            (True, True, False),
            (True, False, True),
            (True, True, True),
            (False, False, True),
            (False, False, False)
        ]

for lemmatized, stopword, punct in combos:
    # gen = ChatGPT answers, ref = human answers
    gen_tokens = tokenize_spacy(data["chatgpt_answer"], lemmatized=lemmatized, remove_stopword=stopword, remove_punct=punct)
    ref_tokens = tokenize_spacy(data["human_answer"], lemmatized=lemmatized, remove_stopword=stopword, remove_punct=punct)

    print(f"lemmatized={lemmatized}, remove_stopword={stopword}, remove_punct={punct}")

    result = bigram_precision_recall(gen_tokens, ref_tokens)
    # A bare expression inside a loop body is not rendered by Jupyter, so the
    # averages are shown explicitly (display is provided by the IPython kernel).
    display(result[["precision", "recall"]].mean(axis = 0))



**Do you think, in general, which tokenization configuration should be used? Why does this combination make the most sense?**
The best configuration in this case was tokenize_spacy(data, lemmatized=True, remove_stopword=False, remove_punct=True). This combination makes sense because stop words can provide context for certain words and their meaning or connections to other unique tokens. They also help identify phrases, since pairing a content word with an adjacent stop word in a bigram makes it easier to match tokens between the two answers.

**Do you think, overall, ChatGPT is able to mimic human in answering these questions?**
I think ChatGPT is not mimicking humans very well, as it still has low precision and recall scores overall, regardless of the tokenization configuration. ChatGPT tends to repeat a lot of the words from the question, or rephrase the same information with some new text. The human answers have more variability in the text and more of a unique voice than the ones generated by ChatGPT.



In [None]:

# Predicted probabilities for 50 samples ...
prob =np.array([0.28997326, 0.10166073, 0.10759583, 0.0694934 , 0.6767239 ,
       0.01446897, 0.15268748, 0.15570522, 0.12159665, 0.22593857,
       0.98162019, 0.47418329, 0.09376987, 0.80440782, 0.88361167,
       0.21579844, 0.72343069, 0.06605903, 0.15447797, 0.10967575,
       0.93020135, 0.06570391, 0.05283854, 0.09668829, 0.05974545,
       0.04874688, 0.07562255, 0.11103822, 0.71674525, 0.08507381,
       0.630128  , 0.16447478, 0.16914903, 0.1715767 , 0.08040751,
       0.7001173 , 0.04428363, 0.19469664, 0.12247959, 0.14000294,
       0.02411263, 0.26276603, 0.11377073, 0.07055441, 0.2021157 ,
       0.11636899, 0.90348488, 0.10191679, 0.88744523, 0.18938904])

# ... and the matching ground-truth labels (1 = positive, 0 = negative)
truth = np.array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0])



# Test with a threshold grid varying from 0.05 to 0.95 in steps of 0.05
th_list = np.arange(0.05, 1.00, 0.05)

# evaluate_performance returns a dict; its values, in insertion order, are (precision, recall)
vals = [tuple(evaluate_performance(prob, truth, th).values()) for th in th_list]
prec, rec = zip(*vals)
# one row per threshold, indexed by the threshold value
vals_df = pd.DataFrame(vals, columns = ['prec', 'rec'], index = th_list)

print(f"Precision/Recall\n\nOne Value Only:\n{json.dumps(evaluate_performance(prob, truth, 0.05), indent=2)}\n")
print(f"All Values:")
vals_df


In [None]:

# Plot both columns of vals_df (precision and recall) against the threshold grid;
# the two labels are given in the same order as the DataFrame columns.
plt.plot(th_list, vals_df, label = ['prec','rec'])

plt.legend()
plt.title('Precision and Recall vs. Threshold')
plt.xlabel('Threshold')
plt.ylabel('Precision and Recall')
plt.show()
