### 0. Initialize Packages

In [1]:
#!pip install scipy,seaborn
import seaborn as sns
from scipy.stats import kstest

import pandas as pd
import numpy as np

#!pip install -U plotly
pd.options.plotting.backend = "plotly"

#!pip3 install nltk
import nltk
from nltk import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from scipy.stats import chi2_contingency

import math
from scipy.stats import shapiro 
from scipy.stats import lognorm
from scipy.stats import mannwhitneyu
import matplotlib.pyplot as plt

#!pip3 install openpyxl
import openpyxl as px
from statsmodels.stats.weightstats import ztest



### 0.1. Read Dataset (N = 114)

In [312]:
# Load the text-only dataset from the Excel workbook into the working frame.
df = pd.read_excel(io='all_gpt_human_only_text.xlsx')

### 0.1.1 Preprocess

Strip leading and trailing whitespace from every cell

In [313]:
def strip(dataframe):
    """Return the given values with surrounding whitespace removed.

    Despite the parameter name, this works on any iterable of cell values;
    when passed to ``DataFrame.apply`` it receives one column at a time.

    Args:
        dataframe: Iterable of cell values (for example one DataFrame column).

    Returns:
        list: The values in their original order. Strings are stripped;
        anything else (NaN, None, numbers) is passed through unchanged.
    """
    # Non-string cells have no .strip(); calling it on them would raise
    # AttributeError, so they are left untouched.
    return [s.strip() if isinstance(s, str) else s for s in dataframe]

# Run `strip` over each column (axis=0) and keep the cleaned frame.
df = df.apply(strip, axis=0)

Each story is POS-tagged separately, and its tags are counted for that story alone.

### 1.1. Define Functions

In [314]:
# Single characters that the tag helpers in this notebook treat as ignorable
# punctuation (note: '.', '?' and '!' are deliberately absent).
# The backslash is escaped ("\\"): a bare "\," is an invalid escape sequence
# and triggers a warning on newer Python versions.
punc_list = list('''(),-[]{};*:'"\\,<>/@_~ ''')

def map_item_to_category(item):
    """Collapse a fine-grained tag into its coarse category.

    Args:
        item: A tag string.

    Returns:
        str: 'NN', 'JJ', 'RB' or 'PRP' when ``item`` belongs to the matching
        group below, '' when ``item`` is listed in ``punc_list``, and ``item``
        itself otherwise.
    """
    # (coarse label, fine-grained tags folded into it), checked in this order.
    coarse_groups = (
        ('NN', ('NN', 'NNS', 'NNP', 'NNPS')),
        ('JJ', ('JJ', 'JJS', 'JJR')),
        ('RB', ('RB', 'RBR', 'WRB', 'RBS')),
        ('PRP', ('PRP', 'PRP$', 'WP', 'WP$')),
    )
    for label, members in coarse_groups:
        if item in members:
            return label

    # Not a grouped tag: blank out listed punctuation, keep anything else.
    return '' if item in punc_list else item

def sentence_splitter(mylist):
    """Split a flat sequence of tags into one list per sentence.

    Args:
        mylist: Iterable of tags for one text, in reading order.

    Returns:
        list[list]: The sentences in order. Items listed in ``punc_list`` are
        dropped. The terminators '.', '?' and '!' close the current sentence
        and are not kept themselves. A terminator with nothing collected
        before it yields an empty list; trailing items without a terminator
        form a final sentence.
    """
    # A membership test is required here: `item == ('.' or '?' or '!')` only
    # ever compares against '.', because an `or` chain evaluates to its first
    # truthy operand.
    terminators = ('.', '?', '!')
    # Build the ignore list once instead of re-copying it for every item.
    ignored = [*punc_list]

    sentences = []
    current = []
    for item in mylist:
        if item in ignored:
            continue
        if item in terminators:
            # End of sentence: store what was collected and start afresh.
            sentences.append(current)
            current = []
        else:
            current.append(item)

    # Keep a trailing sentence that was never closed by a terminator.
    if current:
        sentences.append(current)
    return sentences

def sentence_dna(taglist):
    """Concatenate each sentence's tags into a single string.

    Args:
        taglist: Iterable of sentences, each an iterable of tag strings.

    Returns:
        list[str]: One string per sentence, its tags joined with no separator.
    """
    return [''.join(sentence_tags) for sentence_tags in taglist]

In [315]:
def get_split_sentence_dnas(text):
    """Return the concatenated POS-tag string of every sentence in ``text``.

    The text is tokenized and POS-tagged with NLTK, each tag is generalized
    with ``map_item_to_category``, the tag stream is cut into sentences with
    ``sentence_splitter`` and each sentence's tags are joined by
    ``sentence_dna``.

    Args:
        text (str): The raw text to analyze.

    Returns:
        list[str]: One concatenated tag string per sentence.
    """
    tokens = word_tokenize(text)
    # Keep only the tag of each (token, tag) pair and generalize it,
    # e.g. NN, NNS and NNP all become NN.
    tags = [map_item_to_category(tag) for _, tag in nltk.pos_tag(tokens)]

    tags_by_sentence = sentence_splitter(tags)
    # The joiner defined in this notebook is `sentence_dna`; the previously
    # called `get_sentence_dna` is not defined here and fails with NameError
    # on a fresh kernel.
    return sentence_dna(tags_by_sentence)

In [316]:
# Add a `<column>_dna` column (the list of per-sentence tag strings returned by
# get_split_sentence_dnas) for every text column of the frame.
# Columns that already end in `_dna` are skipped so that re-running this cell
# does not feed its own list-valued output back into the tokenizer.
text_columns = [column for column in df.columns if not str(column).endswith('_dna')]
for column in text_columns:
    df[f'{column}_dna'] = df[column].apply(get_split_sentence_dnas)

### 1.2. Get Most Frequent Sentence Structure

In [317]:
import pandas as pd

def most_freq_sentence_dna(df, column, top_n=3):
    """Return the most frequent sentence structures found in one column.

    Args:
        df: DataFrame in which every cell of ``column`` holds an iterable of
            sentence-structure strings.
        column: Name of the column to summarize.
        top_n (int | None): Number of structures to return. Defaults to 3;
            ``None`` returns every structure.

    Returns:
        list[tuple[str, int]]: ``(structure, count)`` pairs ordered by
        descending count; structures with equal counts keep the order in which
        they were first seen. Empty strings are not counted.
    """
    counts = Counter()
    # Iterate the column's cells directly; each cell is a list of structures,
    # so no splitting is needed.
    for structures in df[column]:
        counts.update(structure for structure in structures if structure != '')

    # sorted() is stable, so ties stay in first-seen (insertion) order.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]


In [318]:
# Names of the derived "_dna" columns to summarize, in reporting order.
cols = [
    'STORY_dna',
    'GPT_R1_dna',
    'GPT_R2_dna',
    'GPT_R3_dna',
    'Retell_1_dna',
    'Retell_2_dna',
    'Retell_3_dna',
]

In [319]:
# Show the most frequent sentence structures of each DNA column, one line per column.
for col in cols:
    top_structures = most_freq_sentence_dna(df, col)
    print(col, top_structures)

STORY_dna [("''PRPVBD", 7), ('NN', 5), ('PRPVBDDTJJNN', 4)]
GPT_R1_dna [('DTJJNNNNVBDVBGNNCCVBDDTNNNNINCDINPRPJJNNNNCCNN', 1), ('PRPVBDJJTOVBPRPINDTJJNNCCVBDDTNN', 1), ('RBPRPVBDDTNNINPRPVBGPRPTOPRPNNNN', 1)]
GPT_R2_dna [('DTJJNNRBVBDDTNNNNINNNINCDJJNNNNCCNN', 1), ('PRPVBDVBNCCJJTOVBPRPINDTJJNNCCVBDDTNN', 1), ('PRPRBVBDPRPCCVBDPRPTOPRPNNNNWDTPRPVBDCCVBDNNINPRP', 1)]
GPT_R3_dna [('DTJJNNRBVBDDTNNNNINNNINCDJJNNNNCCNN', 1), ('PRPVBDPRPTOPRPNNNNWDTPRPVBDCCVBDNNINPRP', 1), ('DTNNVBDRPNNNNCCNNINDTNN', 1)]
Retell_1_dna [('PRPVBDDTJJNN', 3), ('PRPVBDNNNN', 2), ("''", 2)]
Retell_2_dna [('NNVBDJJ', 2), ('NN', 2), ('VB', 2)]
Retell_3_dna [('NN', 3), ('PRPVBDDTJJNN', 2), ('DTJJNNNNVBDVBGINNNCDNNRBPRPVBDCDNNNNINCDJJNN', 1)]
