In [1]:
import pandas as pd


In [2]:
df = pd.read_json("quotes-2019-nytimes.json", lines = True) # change to correct file path


In [3]:
sample_df = df[['quoteID', 'quotation']].head(50) # only for quick testing

In [4]:
# creating dictonary that maps each pronoun to the correct category
pronouns = {
    'me' :'self',
    'myself' : 'self',
    'ours' : 'union',
    'ourselves' : 'union',
    'us' : 'union',
    'hers' : 'other',
    'herself' : 'other',
    'him' : 'other',
    'himself' : 'other',
    'hisself' : 'other',
    'one' : 'other',
    'oneself' : 'other',
    'she' : 'other',
    'thee' : 'other',
    'theirs' : 'other',
    'them' : 'other',
    'themselves' : 'other',
    'they' : 'other',
    'thou' : 'other',
    'thy' : 'other',
    'mine' : 'self',
    'my' : 'self',
    'our' : 'union',
    'ours' : 'union',
    'her' : 'other',
    'his' : 'other',
    'their' : 'other',
    'they' : 'other', 
    'your' : 'other',
    'we' : 'union',
    'i': 'self',
    'he': 'other',
    'she': 'other',
    'you': 'other'
}


In [94]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

def add_sentiment_cols(row):
    '''
    Adds the sentiment column to the row.
    This column is based on NLTK's polarity score
    -1 one means 100 % negative, 1 100 % positive
    :param row: dataframe row to add column to
    :return: row with specified column added
    '''
    sentiments = sid.polarity_scores(row['quotation'])
    row['sentiment'] = sentiments['pos'] - sentiments['neg']
    return row

def add_morphological_cols(row):
    '''
    Adds the following columns to each row:

    self: count of pronouns referring to the speaker (ex: me, I)
    union: count of pronouns referring to the speaker and its group (ex: we, our)
    other: count of pronouns referring to someone else than the speaker (ex: he, their)
    adj_count: count of adjectives in the quote
    adj_ratio: ratio of regular adjectives in comparison to superlatives and comparatives

    -1 means only comparatives and superlatives, 1 only regular adjectives
    :param row: dataframe row to add columns to
    :return: row with specified columns added
    '''
    word_tag_pairs = nltk.pos_tag(nltk.word_tokenize(row['quotation']))
    adj_count = 0
    regular_adj_count = 0
    # Count necessary tags in the quote of the row
    for word, tag in word_tag_pairs:
        if tag == 'PRP' or tag == 'PRPS':
            if word[:2].lower() != 'it': 
                row[pronouns[word.lower()]] += 1
        elif tag[:2] == 'JJ':
            adj_count += 1
            if tag == 'JJ':
                regular_adj_count += 1
    row['adj_count'] = adj_count
    if adj_count == 0:
        row['adj_ratio'] = 0
    else:
        row['adj_ratio'] = 2 * ((regular_adj_count / adj_count) - (1 / 2))

    return row

def create_dataframe_with_features(df):
    '''
    Adds the following features to the dataframe:
    
    self: count of pronouns referring to the speaker (ex: I, my)
    union: count of pronouns referring to the speaker and its group (ex: we, our)
    other: count of pronouns referring to someone else than the speaker (ex: he, her)
    adj_count: count of adjectives in the quote
    adj_ratio: ratio of regular adjectives in comparison to superlatives and comparatives
    -1 means only comparatives and superlatives, 1 only regular adjectives
    sentiment: based on NLTK's polarity score, -1 one means 100 % negative, 1 100 % positive
    
    :param row: dataframe row to add columns to
    :return: row with specified columns added
    :param df: dataframe to add features to
    :return new_df: new dataframe with added features
    '''
    int_cols = ['self', 'union', 'other', 'adj_count']
    float_cols = ['adj_ratio', 'sentiment']
    new_df = df.copy()

    for col in int_cols:
        new_df[col] = 0

    new_df = new_df.apply(add_morphological_cols, axis = 1)
    new_df = new_df.apply(add_sentiment_cols, axis = 1)

    # making the size of the dataframe smaller by setting integers with clear upperbount to unsigned int8
    for col in float_cols:
        new_df[col] = new_df[col].astype('float16')
    for col in int_cols:
        new_df[col] = new_df[col].astype('int8')

    return new_df

    

  

In [95]:
new_df = create_dataframe_with_features(sample_df)
new_df

Unnamed: 0,quoteID,quotation,self,union,other,adj_count,adj_ratio,sentiment
0,2019-04-17-024782,"It is not a low-income immigration,",0,0,0,1,1.0,0.0
1,2019-04-02-001128,a champion figure skater switching to roller s...,0,0,0,0,0.0,0.394043
2,2019-05-09-055187,It makes it much more difficult for him to mak...,0,0,1,1,1.0,-0.188965
3,2019-10-31-056366,"It puts me in a predicament,",1,0,0,0,0.0,0.0
4,2019-01-04-001792,A Pile of Leaves.,0,0,0,0,0.0,0.0
5,2019-08-15-002017,A Senator we can call our own.,0,1,0,1,1.0,0.0
6,2019-02-12-002452,"a thorn in the side of cabinet,",0,0,0,0,0.0,0.0
7,2019-01-28-048557,"It was just on a lark,",0,0,0,0,0.0,0.0
8,2019-07-22-032883,"It's a success, a relief and a technical feat,",0,0,0,1,1.0,0.629883
9,2019-12-31-002666,always appreciates and respects the work and a...,0,0,0,0,0.0,0.48291
