### 0. Import necessary packages

In [3]:
import pandas as pd
import numpy as np


#!pip3 install nltk
import nltk
from nltk import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
from collections import Counter


### 0.1. Read Dataset ( N = 114)

In [4]:
# Load the raw texts from the Excel workbook into `df`.
df = pd.read_excel('all_gpt_human_only_text.xlsx')

### 0.1.1 Preprocess

Get rid of extra white spaces in the texts

In [5]:
def strip(dataframe):
    """Return a list holding every element of `dataframe` with surrounding whitespace removed.

    Despite the parameter name, this is called through ``df.apply(strip)``,
    so it receives one iterable of strings at a time. Every element must
    provide a ``.strip()`` method.
    """
    stripped = []
    for text in dataframe:
        stripped.append(text.strip())
    return stripped

# Run `strip` over `df` via DataFrame.apply and rebind `df` to the result.
df= df.apply(strip)

## 1. POS Tagging with Fewer Categories, Separately for Each Story

Each story is POS-tagged separately, and the tags are counted per story.

### 1.1. Preprocess and Tag

In [6]:
# Helper constants for text cleaning, followed by the tokenize-and-tag function.
# NOTE(review): `punc` is not referenced by any other code visible in this
# notebook, and no stop-word or punctuation filtering is applied below.
# Raw string: the previous literal contained the invalid escape sequence "\,",
# which recent Python versions warn about. The value itself (a backslash
# followed by a comma) is unchanged.
punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
# English stop words from NLTK's stopwords corpus, kept as a set.
# NOTE(review): not referenced by any other code visible in this notebook.
stop_words = set(stopwords.words('english'))

def create_tokenized(text):
    """Split `text` into word tokens and POS-tag them.

    Returns the output of ``nltk.pos_tag`` for the tokens produced by
    ``word_tokenize`` (a list of ``(token, tag)`` pairs).
    """
    # Tag the tokens
    return nltk.pos_tag(word_tokenize(text))

In [7]:
# Tokenize and tag every column of df; the result for column X is stored
# in a new column named 'tokenized_X'.
for column in df.columns:
    tagged_texts = df[column].apply(create_tokenized)
    df[f'tokenized_{column}'] = tagged_texts

### 1.2. Count the pos-tags

In [10]:
# Count the tags
def count_tags(mydata):
    """Count how often each tag occurs in `mydata`.

    `mydata` is an iterable of indexable items whose element at position 1
    is the tag (e.g. ``(token, tag)`` pairs). Returns a ``Counter`` mapping
    tag -> number of occurrences.
    """
    return Counter(pair[1] for pair in mydata)

In [11]:
# For each 'tokenized_*' column, store its per-row tag counters in a
# matching '<column>_tag_counts' column.
for column in [name for name in df.columns if name.startswith('tokenized_')]:
    df[f'{column}_tag_counts'] = df[column].apply(count_tags)

### 1.3. Merge relevant categories together

Count tags for all nouns (plural nouns, proper nouns etc.), verbs, adjectives, adverbs, pronouns and prepositions/conjunctions/determiners


In [13]:
# Collapse the fine-grained Penn Treebank tags into coarse grammatical
# categories and store one count column per category.
# FIX: wh-pronouns are tagged 'WP' / 'WP$'. The previous keys 'WB' / 'WB$'
# are not Penn Treebank tags, so the Counter silently returned 0 for them
# and wh-pronouns were never included in the pronoun count.
POS_GROUPS = {
    'noun': ('NN', 'NNS', 'NNP', 'NNPS'),
    'verb': ('VB', 'VBN', 'VBG', 'VBZ', 'VBP', 'VBD'),
    'adj': ('JJ', 'JJS', 'JJR'),
    'adv': ('RB', 'RBR', 'WRB', 'RBS'),
    'pron': ('PRP', 'PRP$', 'WP', 'WP$'),
    'con_det_prep': ('DT', 'IN', 'UH', 'TO', 'WDT', 'EX'),
    'prep': ('IN',),
}

for column in [col for col in df.columns if col.endswith('tag_counts')]:
    prefix = column[:-7]  # drop the trailing '_counts', keeping '..._tag'
    for category, tags in POS_GROUPS.items():
        # Counter lookups return 0 for tags that never occurred in a text.
        df[f'{prefix}_{category}'] = [
            sum(counts[tag] for tag in tags) for counts in df[column]
        ]



In [90]:
# Save it to the folder: write the whole frame, including the tokenized,
# tag-count and category-count columns added above, to df_counts.xlsx.
df.to_excel('df_counts.xlsx')

### 2. Count Negations Only

In [16]:
# Reload the raw texts, replacing `df` (and with it every column derived in section 1).
# NOTE(review): unlike section 0.1.1, no whitespace stripping is applied after this reload.
df = pd.read_excel('all_gpt_human_only_text.xlsx')

In [17]:
def tokenize_with_negation_only(text):
    """Tokenize and POS-tag `text`, keeping only negations among the adverbs.

    Tokens tagged as adverbs ('RB', 'RBR', 'RBS', 'WRB') are dropped unless
    the token itself is "not" or "n't" (case-insensitive). Tokens with any
    other tag are kept unchanged.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list
        The ``(token, tag)`` pairs from ``nltk.pos_tag`` that survive the
        filter, in their original order. An empty list is returned when the
        text yields no tokens (previously this case raised
        ``UnboundLocalError`` because the result was only assigned inside
        the loop).
    """
    adverb_tags = {'RB', 'RBS', 'WRB', 'RBR'}
    negations = {'not', "n't"}

    tagged = nltk.pos_tag(word_tokenize(text))

    # Single pass: keep a pair unless it is an adverb that is not a negation.
    return [
        pair for pair in tagged
        if pair[1] not in adverb_tags or pair[0].lower() in negations
    ]

In [18]:
# Tokenize and tag every column of df, dropping non-negation adverbs; the
# result for column X is stored in a new column named 'tokenized_X'.
for column in df.columns:
    filtered_tokens = df[column].apply(tokenize_with_negation_only)
    df[f'tokenized_{column}'] = filtered_tokens

### 2.2. Count the pos-tags

In [19]:
# Count the tags (same behaviour as the count_tags defined in section 1.2)
def count_tags(mydata):
    """Tally the tags found at position 1 of each item in `mydata`.

    Returns a ``Counter`` mapping each tag to the number of items carrying it.
    """
    tag_counts = Counter()
    for entry in mydata:
        tag_counts[entry[1]] += 1
    return tag_counts

In [20]:
# For each 'tokenized_*' column, store its per-row tag counters in a
# matching '<column>_tag_counts' column.
for column in [name for name in df.columns if name.startswith('tokenized_')]:
    df[f'{column}_tag_counts'] = df[column].apply(count_tags)

In [21]:
# Display the per-row tag counters computed for the STORY column.
df.tokenized_STORY_tag_counts

0      {'DT': 16, 'NN': 23, 'WP': 3, 'VBD': 11, 'IN':...
1      {'NNP': 4, 'VBD': 14, 'VBG': 7, 'JJ': 9, 'CD':...
2      {'DT': 20, 'NNP': 3, 'VBD': 17, 'TO': 7, 'VB':...
3      {'DT': 14, 'IN': 14, 'PRP$': 2, 'NN': 22, ',':...
4      {'VBP': 7, 'PRP': 26, 'VBD': 18, 'DT': 11, 'NN...
                             ...                        
111    {'NNP': 6, 'VBZ': 4, 'PRP$': 6, 'NN': 29, '``'...
112    {'EX': 1, 'VBD': 19, 'DT': 31, 'NN': 36, 'WP':...
113    {'EX': 1, 'VBD': 18, 'DT': 20, 'NN': 32, 'WP':...
114    {'CD': 1, 'NN': 30, 'DT': 18, 'VBD': 12, 'VBG'...
115    {'DT': 15, 'NN': 24, 'VBD': 12, 'IN': 16, 'JJ'...
Name: tokenized_STORY_tag_counts, Length: 116, dtype: object

### 2.3. Merge relevant categories together

Count tags for all nouns (plural nouns, proper nouns etc.), verbs, adjectives, adverbs, pronouns and prepositions/conjunctions/determiners


In [22]:
# Collapse the fine-grained Penn Treebank tags into coarse grammatical
# categories and store one count column per category.
# Non-negation adverbs were removed during tokenization in this section, so
# the '_adv' column counts only the tokens that survived that filter.
# FIX: wh-pronouns are tagged 'WP' / 'WP$'. The previous keys 'WB' / 'WB$'
# are not Penn Treebank tags, so the Counter silently returned 0 for them.
# The stray unary '+' in the adverb sum has also been removed.
POS_GROUPS = {
    'noun': ('NN', 'NNS', 'NNP', 'NNPS'),
    'verb': ('VB', 'VBN', 'VBG', 'VBZ', 'VBP', 'VBD'),
    'adj': ('JJ', 'JJS', 'JJR'),
    'adv': ('RB', 'RBR', 'WRB', 'RBS'),
    'pron': ('PRP', 'PRP$', 'WP', 'WP$'),
    'con_det_prep': ('DT', 'IN', 'UH', 'TO', 'WDT', 'EX'),
    'prep': ('IN',),
}

for column in [col for col in df.columns if col.endswith('tag_counts')]:
    prefix = column[:-7]  # drop the trailing '_counts', keeping '..._tag'
    for category, tags in POS_GROUPS.items():
        # Counter lookups return 0 for tags that never occurred in a text.
        df[f'{prefix}_{category}'] = [
            sum(counts[tag] for tag in tags) for counts in df[column]
        ]



In [23]:
# Write the whole frame, including the negation-only token, tag-count and
# category-count columns added above, to df_negations.xlsx.
df.to_excel('df_negations.xlsx')

In [38]:
# Display the STORY column of df.
df['STORY']

0      A school teacher who taught for high school Ki...
1      Abigail was feeling sad one day so she started...
2      All Jennifer wanted was to go to the punk rock...
3      All of his life, Jeff was something of an unde...
4      Have you ever had a child?  Do you plan to?  I...
                             ...                        
111    Michael asks his dad "Dad, how much money do y...
112    Once there was a man who worked for the fire d...
113    There was a boy who always wanted a pet goldfi...
114    One day a girl was walking her dog when a squi...
115    A troll lived alone in a very large cave deep ...
Name: STORY, Length: 116, dtype: object

## Now, you have both negation counts (df_negations.xlsx), and part-of-speech tagging (grammatical categories) counts (df_counts.xlsx). 

You can manually gather the storyID and emotion categories of the stories from the master file (OSF Project page --> Affect Preservation --> Master.Affect2.Generate.Stories.Study.ChatGPT.Final.csv) and reshape the data from short (wide) to long format for conducting the analyses in R. 

You can also use the final file, to which we added the storyID and emotion variables and which we converted to long format. It is named 'df_counts_emotions.xlsx' and can be found in OSF Project page --> Word Count and Part of Speechs --> 'partOfSpeech_StoryIDs_Emotions.xlsx'.

Also please see the below information for the explanation of the tags.

### Tags:

CC: Coordinating conjunction

CD: Cardinal number

DT: It is the determiner

EX: Existential "there"

FW: Foreign word

IN: Preposition or subordinating conjunction

JJ: Adjective

JJR and JJS: Comparative and superlative adjectives

LS: List marker

MD: Modal

NN: Singular noun

NNS, NNP, NNPS: Plural noun, singular proper noun, plural proper noun

PDT: Predeterminer

WRB: Wh-adverb

WP$: Possessive wh-pronoun

WP: Wh-pronoun

WDT: Wh-determiner

VBZ: Verb, 3rd person singular present

VBP, VBN, VBG, VBD, VB: Forms of verbs

UH: Interjection


TO: The word "to"

RP: Particle

RBS, RB, RBR: Adverb

PRP, PRP$: Personal pronoun and possessive pronoun