In [1]:
import pandas as pd


In [2]:
df = pd.read_json("quotes-2019-nytimes.json", lines = True) # change to correct file path


In [3]:
sample_df = df[['quoteID', 'quotation']].head(50) # only for quick testing

In [4]:
# creating dictonary that maps each pronoun to the correct category
pronouns = {
    'me' :'self',
    'myself' : 'self',
    'ours' : 'union',
    'ourselves' : 'union',
    'us' : 'union',
    'hers' : 'other',
    'herself' : 'other',
    'him' : 'other',
    'himself' : 'other',
    'hisself' : 'other',
    'one' : 'other',
    'oneself' : 'other',
    'she' : 'other',
    'thee' : 'other',
    'theirs' : 'other',
    'them' : 'other',
    'themselves' : 'other',
    'they' : 'other',
    'thou' : 'other',
    'thy' : 'other',
    'mine' : 'self',
    'my' : 'self',
    'our' : 'union',
    'ours' : 'union',
    'her' : 'other',
    'his' : 'other',
    'their' : 'other',
    'they' : 'other', 
    'your' : 'other',
    'we' : 'union',
    'i': 'self',
    'he': 'other',
    'she': 'other',
    'you': 'other'
}


In [94]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

def add_sentiment_cols(row):
    '''
    Adds the sentiment column to the row.
    This column is based on NLTK's polarity score
    -1 one means 100 % negative, 1 100 % positive
    :param row: dataframe row to add column to
    :return: row with specified column added
    '''
    sentiments = sid.polarity_scores(row['quotation'])
    row['sentiment'] = sentiments['pos'] - sentiments['neg']
    return row

def add_morphological_cols(row):
    '''
    Adds the following columns to each row:

    self: count of pronouns referring to the speaker (ex: me, I)
    union: count of pronouns referring to the speaker and its group (ex: we, our)
    other: count of pronouns referring to someone else than the speaker (ex: he, their)
    adj_count: count of adjectives in the quote
    adj_ratio: ratio of regular adjectives in comparison to superlatives and comparatives

    -1 means only comparatives and superlatives, 1 only regular adjectives
    :param row: dataframe row to add columns to
    :return: row with specified columns added
    '''
    word_tag_pairs = nltk.pos_tag(nltk.word_tokenize(row['quotation']))
    adj_count = 0
    regular_adj_count = 0
    # Count necessary tags in the quote of the row
    for word, tag in word_tag_pairs:
        if tag == 'PRP' or tag == 'PRPS':
            if word[:2].lower() != 'it': 
                row[pronouns[word.lower()]] += 1
        elif tag[:2] == 'JJ':
            adj_count += 1
            if tag == 'JJ':
                regular_adj_count += 1
    row['adj_count'] = adj_count
    if adj_count == 0:
        row['adj_ratio'] = 0
    else:
        row['adj_ratio'] = 2 * ((regular_adj_count / adj_count) - (1 / 2))

    return row

def create_dataframe_with_features(df):
    '''
    Adds the following features to the dataframe:
    
    self: count of pronouns referring to the speaker (ex: I, my)
    union: count of pronouns referring to the speaker and its group (ex: we, our)
    other: count of pronouns referring to someone else than the speaker (ex: he, her)
    adj_count: count of adjectives in the quote
    adj_ratio: ratio of regular adjectives in comparison to superlatives and comparatives
    -1 means only comparatives and superlatives, 1 only regular adjectives
    sentiment: based on NLTK's polarity score, -1 one means 100 % negative, 1 100 % positive
    
    :param row: dataframe row to add columns to
    :return: row with specified columns added
    :param df: dataframe to add features to
    :return new_df: new dataframe with added features
    '''
    int_cols = ['self', 'union', 'other', 'adj_count']
    float_cols = ['adj_ratio', 'sentiment']
    new_df = df.copy()

    for col in int_cols:
        new_df[col] = 0

    new_df = new_df.apply(add_morphological_cols, axis = 1)
    new_df = new_df.apply(add_sentiment_cols, axis = 1)

    # making the size of the dataframe smaller by setting integers with clear upperbount to unsigned int8
    for col in float_cols:
        new_df[col] = new_df[col].astype('float16')
    for col in int_cols:
        new_df[col] = new_df[col].astype('int8')

    return new_df

    

  

In [95]:
new_df = create_dataframe_with_features(sample_df)
new_df

Unnamed: 0,quoteID,quotation,self,union,other,adj_count,adj_ratio,sentiment
0,2019-04-17-024782,"It is not a low-income immigration,",0,0,0,1,1.0,0.0
1,2019-04-02-001128,a champion figure skater switching to roller s...,0,0,0,0,0.0,0.394043
2,2019-05-09-055187,It makes it much more difficult for him to mak...,0,0,1,1,1.0,-0.188965
3,2019-10-31-056366,"It puts me in a predicament,",1,0,0,0,0.0,0.0
4,2019-01-04-001792,A Pile of Leaves.,0,0,0,0,0.0,0.0
5,2019-08-15-002017,A Senator we can call our own.,0,1,0,1,1.0,0.0
6,2019-02-12-002452,"a thorn in the side of cabinet,",0,0,0,0,0.0,0.0
7,2019-01-28-048557,"It was just on a lark,",0,0,0,0,0.0,0.0
8,2019-07-22-032883,"It's a success, a relief and a technical feat,",0,0,0,1,1.0,0.629883
9,2019-12-31-002666,always appreciates and respects the work and a...,0,0,0,0,0.0,0.48291


# Getting speakers info (Raph)

In [1]:
import numpy as np
import pandas as pd
import bz2
import json

In [2]:
with bz2.open('./quotes-2019-nytimes.json.bz2', "rt") as bzinput:
    lines = []
    for i, line in enumerate(bzinput):
        if i == 50: break
        tweets = json.loads(line)
        lines.append(tweets)

In [3]:
dfquotes = pd.DataFrame(lines)

In [4]:
#Remove quotes without know speaker
dfquotes = dfquotes[dfquotes.speaker != 'None']

# Find speakers with one unique id
dfquotes = dfquotes[dfquotes.qids.apply(lambda x: len(x)) == 1]

# List to string the qids column
dfquotes['qids'] = dfquotes['qids'].apply(lambda x: x[0])

#Change and rename index for future merging
dfquotes = dfquotes.set_index('qids')
dfquotes.index.rename('id', inplace=True)

In [5]:
#Load speaker attributes parquet file
dfattrib = pd.read_parquet("speaker_attributes.parquet\speaker_attributes.parquet")

In [6]:
#Load QID correspondance file and remove description column
dflabels = pd.read_csv('wikidata_labels_descriptions_quotebank.csv.bz2', compression='bz2', index_col = 'QID')
dflabels = dflabels.drop(['Description'], axis=1)

In [7]:
#Merge quotes file and speaker attributes file based on the unique qid of the speaker
dfmerged_new = dfquotes.merge(dfattrib.set_index('id'), left_index=True, right_index=True)

In [8]:
for col in dfmerged_new.columns:
    try:
        dfmerged_new.loc[:, col] = dfmerged_new.loc[:, col].apply(lambda x: [dflabels.loc[nat].item() for nat in x] if x is not None else x)
    except:
        continue

In [9]:
dfmerged_new = dfmerged_new.drop(['numOccurrences', 'urls', 'probas', 'phase'], axis=1)

In [10]:
dfmerged_new

Unnamed: 0_level_0,quoteID,quotation,speaker,date,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,label,candidacy,type,religion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Q105756,2019-04-02-001128,a champion figure skater switching to roller s...,John Updike,2019-04-02 14:58:33,[John Hoyer Updike],[+1932-03-18T00:00:00Z],[United States of America],[male],1386702708,,,"[poet, writer, novelist, essayist, playwright,...",,,John Updike,,item,[Anglicanism]
Q1855840,2019-10-31-056366,"It puts me in a predicament,",Xavier Becerra,2019-10-31 16:45:15,,[+1958-01-26T00:00:00Z],[United States of America],[male],1393554580,,B000287,"[politician, lawyer]",[Democratic Party],,Xavier Becerra,,item,[Roman Catholic]
Q7817334,2019-08-15-002017,A Senator we can call our own.,Tom Rath,2019-08-15 22:36:33,,[+1975-00-00T00:00:00Z],[United States of America],[male],1387887635,,,[writer],,,Tom Rath,,item,
Q3074013,2019-07-22-032883,"It's a success, a relief and a technical feat,",Florence Parly,2019-07-22 02:37:50,[Florence Marie Jeanne Parly],[+1963-05-08T00:00:00Z],[France],[female],1388274070,,,[politician],[Socialist Party],,Florence Parly,,item,
Q7812406,2019-01-13-028337,It's crazy. I can't even really explain it rig...,Todd Gurley II,2019-01-13 15:55:44,[Todd Gurley II],[+1994-08-03T00:00:00Z],[United States of America],[male],1368251541,[African Americans],,[American football player],,,Todd Gurley,,item,
Q977546,2019-06-19-008685,As soon as the gumballs hit or don't hit Luke ...,Errol Morris,2019-06-19 15:25:07,,[+1948-02-05T00:00:00Z],[United States of America],[male],1391704920,,,"[film director, screenwriter, non-fiction writ...",,,Errol Morris,,item,[atheism]
Q50049,2019-10-02-063708,"I've lived long enough, being born in 1961, th...",Catherine Opie,2019-10-02 21:42:56,,[+1961-04-14T00:00:00Z],[United States of America],[female],1395549466,[White Americans],,"[artist, photographer, university teacher, aca...",,,Catherine Opie,,item,
Q22686,2019-02-15-009152,Asylum seekers in Mexico face a heightened ris...,President Donald Trump,2019-02-15 03:20:00,"[Donald John Trump, Donald J. Trump, Trump, Th...",[+1946-06-14T00:00:00Z],[United States of America],[male],1395141850,,,"[business magnate, investor, restaurateur, non...","[Republican Party, Independence Party of Ameri...",[Bachelor of Science],Donald Trump,"[2016 United States presidential election, 202...",item,[Presbyterianism]
Q7154794,2019-10-26-028940,I've never seen the base more energized than i...,Paula White,2019-10-26 20:51:55,"[Paula Michelle Furr, Paula Michelle White-Cai...",[+1966-04-20T00:00:00Z],[United States of America],[female],1378795903,,,"[writer, televangelist, pastor]",,,Paula White-Cain,,item,
Q57552119,2019-03-31-006363,but it didn't seem like this worldwide big to-do.,Steve Gold,2019-03-31 15:40:51,,[+1985-03-01T00:00:00Z],[United States of America],[male],1346415165,,,"[real estate broker, reality television partic...",,,Steve Gold,,item,
