# Story data analyses
---

## import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import numpy as np
import glob as gl
import plotly.plotly as pl
from datetime import datetime as dt
from datetime import timedelta as td
from scipy import stats as st

## import data
import csv files that were exported from 'dataframe_processing_and_export.ipynb' and then moved to EXPORT folder.

In [3]:
#fetch story data from csv files stored in /documents and with filename format "initials_type.csv" (e.g. og_pm, og_su). Store the fetched data as pandas dataframes inside dictionary d
storydata1 = gl.glob('EXPORT/*_pm.csv')
storydata2 = gl.glob('EXPORT/*_su.csv')

d1 = {}#pm DBs

d_raw = {}#su DBs

#load csvs as dataframes into separate dictionaries
for story in storydata1:
    d1[story[7:12]] = pd.read_csv(story, sep=',', encoding='latin-1')

for story in storydata2:
    d_raw[story[7:12]] = pd.read_csv(story, sep=',', encoding='latin-1')    

## functions

### pre-processing

- **format_timeVals**: prepares all imported time data for any further processing.

- **distribute_ids_to_adjacent_places**: takes raw story units as inputs and allocates new IDs to these units based on whether they are adjacent to story units referring to the same place.

- **consolidate_adjacent_places**: takes the data processed by distribute_ids_to_adjacent_places and aggregates rows based on the newly-allocated IDs into the dictionary of dataframes entitled *d_con*.

- **add_su_buffer**: adds temporal buffers of user's choice (in seconds) to the input dataframe.

### statistics

- **new_timeVals**: calculate new fields time_length, which provides each story unit length as a timedelta value, and num_minutes, which provides this value as a real number.

- **calcStats**: calculates some statistics based on story unit / place mention dynamics. That is, whether a place mention with a *time* val falls between *time_start* and *time_end* vals (>=,<). When a mention occurs at the temporal breakpoint between two story units, the story unit that follows the first is the one that the mention becomes associated with [...]

### aggregation

- **aggregatedByPlaces**: input data is copied into a new dictionary entitled *d_byPlace* and aggregated by place name, producing data whereby each row represents a totality of attributes pertaining to the story units that related to the said place.

- **timeSum**: is called from within aggregatedByPlaces to enable the sum of the time_length field during aggregation.

---
the following cell needs to be run before proceeding

In [4]:
def format_timeVals(name, df):
    if name[-2:] == 'pm':
        df['time'] = pd.to_datetime(df.time, format='%Y-%m-%d %H:%M:%S')
    else:
        df['time_start'] = pd.to_datetime(df.time_start, format='%Y-%m-%d %H:%M:%S')
        df['time_end'] = pd.to_datetime(df.time_end, format='%Y-%m-%d %H:%M:%S')

#attribute new IDs to story units so that each unique ID represents a change in spatial discourse instead of a story unit change (i.e. each ID represents a spatially distinct chunk of discourse)
def distribute_ids_to_adjacent_places(name, df):
    for i in df.index:
        if df.loc[i, 'id'] == 1:
            df.loc[i, 'agg_su_id'] = df.loc[i, 'id']
        elif df.loc[i, 'place'] == df.loc[i-1, 'place']:
            df.loc[i, 'agg_su_id'] = df.loc[i-1, 'agg_su_id']
        else:
            df.loc[i, 'agg_su_id'] = df.loc[i, 'id']

#aggregate these rows by these new IDs
def consolidate_adjacent_places(name, df, append):
    d_raw[name + append] = df.groupby(['agg_su_id','place','scale_order'], as_index=False).agg({
        'id': lambda x: x.astype('str').str.cat(sep=';'),
        'su_num': lambda x: x.astype('str').str.cat(sep=';'),
        'time_start':'first',
        'time_end':'last'})
    d_raw[name + append] = d_raw[name + append][['agg_su_id','id','su_num','place','scale_order','time_start','time_end']]
        
def add_su_buffer(name, df, buffer):
    #subtract timedelta object from each datetime object in column and assign new values
    df['time_start'] = df['time_start'].map(lambda x: x - td(seconds=buffer))
    df['time_end'] = df['time_end'].map(lambda x: x + td(seconds=buffer))
    #add 5 seconds to first value to make sure it remains at 00:00:00
    df.iloc[0]['time_start'] = df.iloc[0]['time_start'] + td(seconds=buffer)

def new_timeVals(name, df):
    df['time_length'] = df.time_end - df.time_start
    df['num_minutes'] = df['time_length'].map(lambda x: x.total_seconds()/60)
    #exported pd.datetime values are imported and need to be trimmed into datetime objects w format HH:MM:SS
    #df['time_length'] = pd.to_datetime(df.time_length.str.replace('.000000000','').str.replace('[0-9]+ days?,? ',''), format='%H:%M:%S')
    #this datetime object is then converted into a list of 3 time values representing [hours,minutes,seconds]
    df['time_length'] = df['time_length'].map(lambda x: str(x).split(' ')[2].split(':'))        
    
def calcStats(name1, df1, name2, df2):
    
    r_pm = range(len(df1.index))
    r_su = range(len(df2.index))
    
    df2['mention_freq'] = 0
    df2['mention_index'] = ''
    df2['mention_places'] = ''
    
    df2['mention_match_freq'] = 0
    df2['mention_match_index'] = ''
    df2['mention_match_places'] = ''
    
    df2['mention_coarser_match_freq'] = 0
    df2['mention_coarser_match_index'] = ''
    df2['mention_coarser_match_places'] = ''
    
    df2['mention_finer_match_freq'] = 0
    df2['mention_finer_match_index'] = ''
    df2['mention_finer_match_places'] = ''
    
    try:
         for i in r_su:
            a = dates.date2num(df2.iloc[i]['time_start'])
            b = dates.date2num(df2.iloc[i]['time_end'])
            place1 = df2.iloc[i]['place']
            x = 0
            x1 = ''
            x11 = ''
            y = 0
            y1 = ''
            y11 = ''
            y2 = 0
            y21 = ''
            y22 = ''
            y3 = 0
            y31 = ''
            y32 = ''

            for j in r_pm:
                c = dates.date2num(df1.iloc[j]['time'])
                if (c >= a) and (c < b):
                    _id = str(df1.iloc[j]['id'])
                    place2 = df1.iloc[j]['place']

                    x += 1
                    x1 += (";" + _id)
                    x11 += (";" + place2)
                    if place2 == place1:
                        y += 1
                        y1 += (";" + _id)
                        y11 += (";" + place2)
                    elif place1 in place2:
                        y2 += 1
                        y21 += (";" + _id)
                        y22 += (";" + place2)
                    elif place2 in place1:
                        y3 += 1
                        y31 += (";" + _id)
                        y32 += (";" + place2)
                    else:
                        pass
                else:
                    pass
            else:
                df2.iloc[i, df2.columns.get_loc('mention_freq')] = x
                df2.iloc[i, df2.columns.get_loc('mention_index')] = x1
                df2.iloc[i, df2.columns.get_loc('mention_places')] = x11

                df2.iloc[i, df2.columns.get_loc('mention_match_freq')] = y
                df2.iloc[i, df2.columns.get_loc('mention_match_index')] = y1
                df2.iloc[i, df2.columns.get_loc('mention_match_places')] = y11
                
                df2.iloc[i, df2.columns.get_loc('mention_finer_match_freq')] = y2
                df2.iloc[i, df2.columns.get_loc('mention_finer_match_index')] = y21
                df2.iloc[i, df2.columns.get_loc('mention_finer_match_places')] = y22
                
                df2.iloc[i, df2.columns.get_loc('mention_coarser_match_freq')] = y3
                df2.iloc[i, df2.columns.get_loc('mention_coarser_match_index')] = y31
                df2.iloc[i, df2.columns.get_loc('mention_coarser_match_places')] = y32


    except:
        print(name1, name2, place1, place2, i, j, _id)
        raise

#look at unit-mention dynamics by place (total of all spatial units in a same place within a story), instead of spatial units.
def aggregatedByPlaces(name, df, append):
    d_raw[name+append] = df.groupby(['place','scale_order'], as_index=False).agg({
        'agg_su_id': lambda x: x.astype('str').str.replace('.0','').str.cat(sep=';'), #duplicates need to be removed
        'id': lambda x: x.astype('str').str.cat(sep=';'),
        'su_num': lambda x: x.astype('str').str.cat(sep=';'),
        'time_start':'first',
        'time_end':'last',
        'time_length': lambda x: timeSum(x),
        'num_minutes': 'sum',
        'mention_index': lambda x: x.astype('str').str.cat(sep='').strip(';').replace('nan',''),
        'mention_match_index': lambda x: x.astype('str').str.cat(sep='').strip(';').replace('nan',''),
        'mention_coarser_match_index': lambda x: x.astype('str').str.cat(sep='').strip(';').replace('nan',''),
        'mention_finer_match_index': lambda x: x.astype('str').str.cat(sep='').strip(';').replace('nan',''),
        'mention_places': lambda x: x.astype('str').str.cat(sep='').strip(';').replace('nan',''),
        'mention_coarser_match_places': lambda x: x.astype('str').str.cat(sep='').strip(';').replace('nan',''),
        'mention_match_places': lambda x: x.astype('str').str.cat(sep='').strip(';').replace('nan',''),
        'mention_finer_match_places': lambda x: x.astype('str').str.cat(sep='').strip(';').replace('nan',''),
        'mention_freq':'sum',
        'mention_coarser_match_freq':'sum',
        'mention_match_freq':'sum',
        'mention_finer_match_freq':'sum'})
    
def timeSum(x):
    #input for this function is a list of strings representing time in the format HH:MM:SS
    tdeltas = []
    for i in x:
        #x is a list whose values reflect the grouping logic of the groupby function.
        #timedeltas are represented in absolute seconds
        tdeltas.append(td(hours=int(i[0]),minutes=int(i[1]),seconds=int(i[2])))
    #adding a timedelta object (td) to the sum will provoke the sums output to be in td format
    return sum(tdeltas, td())

## Executing the functions
---
running these functions uses loops to cycle through each story within a dictionary. Each dictionary contains a copy of all 10 stories to which these functions need to be applied in a different order depending on what we want to test.

For the present analysis, we will apply 3 database formats to all 10 stories, each will require its own pipeline of functions, the formats include:
1. a format with each aggregated story unit and attributes about that unit's relation with simultaneous mentions of the place.
2. a format with each aggregated story unit (with its temporal boundaries extended by 5 seconds) and attributes about that unit's relation with simultaneous mentions of the place.*
3. a format with each aggregated story unit (with its temporal boundaries extended by 10 seconds) and attributes about that unit's relation with simultaneous mentions of the place.*

*adding buffers at the beginning and end, but especially beginning, of spatal discourse units will make up for a story unit that may have started after a mention which provoked it.

each of the previous databases can be aggregated by place so as to produce more holistic and summative views on the relation between each individual place in a story and simultaneous mentions of that place.

4. a format with each unique place in a story and attributes about that place's relation with simultaneous mentions of the place.

let's format the place mentions first, once and for all:

In [5]:
for k, v in d1.items():
    format_timeVals(k, v)

create 3 new dictionaries from d_raw to contain these 3 formats:

In [6]:
for k, v in zip(list(d_raw.keys()),list(d_raw.values())):
    format_timeVals(k, v)
    distribute_ids_to_adjacent_places(k, v)
    consolidate_adjacent_places(k, v, '_2')
    consolidate_adjacent_places(k, v, '_2_b5')
    consolidate_adjacent_places(k, v, '_2_b10')
#if we are to reuse the same consolidate_adjacent_places function, we need to pass an argument that will generate a new df within the same dictionary (we cannot dynamically create new dictionaries using an argument passed to a new function, but we CAN create new dfs within an existing dict this way!)

#from this, create new dictionaries... for simplicity's sake

d_2 = {}#consolidated su DBs
d_2_b5 = {}#consolidated and expanded su DBs (5 seconds before beginning of each story unit and 5 seconds after)
d_2_b10 = {}#consolidated and expanded su DBs (10 seconds before beginning of each story unit and 10 seconds after)

#distribute these new dataframes to the new dicts
for k, v in d_raw.items():
    if k[-1:] == '2':
        d_2[k] = pd.DataFrame(v)
    elif k[-1:] == '5':
        d_2_b5[k] = pd.DataFrame(v)
    elif k[-1:] == '0':
        d_2_b10[k] = pd.DataFrame(v)

1. a format with each aggregated story unit and attributes about that unit's relation with simultaneous place mentions.

In [7]:
for (k1, v1), (k2, v2) in zip(sorted(d1.items()), sorted(d_2.items())):
    new_timeVals(k2, v2)
    calcStats(k1, v1, k2, v2)

2. a format with each aggregated story unit (with its temporal boundaries extended by 5 seconds) and attributes about that unit's relation with simultaneous mentions of the place.

In [8]:
for (k1, v1), (k2, v2) in zip(sorted(d1.items()), sorted(d_2_b5.items())):
    add_su_buffer(k2, v2, 5)
    new_timeVals(k2, v2)
    calcStats(k1, v1, k2, v2)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



3. a format with each aggregated story unit (with its temporal boundaries extended by 10 seconds) and attributes about that unit's relation with simultaneous mentions of the place.

In [9]:
for (k1, v1), (k2, v2) in zip(sorted(d1.items()), sorted(d_2_b10.items())):
    add_su_buffer(k2, v2, 10)
    new_timeVals(k2, v2)
    calcStats(k1, v1, k2, v2)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Now, we want to also produce new dataframes that will contain

1. each story broken down by unique place, rather than changes in spatial discourse. Therefore, we will be able to view a summary of each distinct place's relation to story units. We also want to see 
2. an aggregation of all these places across 10 stories, so we can have a more totalizing view on how scale plays into this relationship. 

We want these datasets for all three dictionaries (i.e. (1) the one with 5-second extended spatial discourse units, (2) the one with 10-second extended spatial discourse units, (3) the one with unchanged spatial discourse units).

In [10]:
for k, v in d_2.items():
    aggregatedByPlaces(k, v, '_p')
for k, v in d_2_b5.items():
    aggregatedByPlaces(k, v, '_p')
for k, v in d_2_b10.items():
    aggregatedByPlaces(k, v, '_p')

d_2_p = {}#consolidated su -> place-aggregated DBs
d_2_b5_p = {}#consolidated and expanded su DBs (5 seconds before beginning of each story unit and 5 seconds after) -> place-aggregated DBs
d_2_b10_p = {}#consolidated and expanded su DBs (10 seconds before beginning of each story unit and 10 seconds after) ->place-aggregated DBs

#the previous function added these new dataframes to our d_raw dictionary. Let's copy them to their own dictionaries for simplicity
for k, v in d_raw.items():
    if k[-3:] == '2_p':
        d_2_p[k] = pd.DataFrame(v)
    elif k[-3:] == '5_p':
        d_2_b5_p[k] = pd.DataFrame(v)
    elif k[-3:] == '0_p':
        d_2_b10_p[k] = pd.DataFrame(v)

now aggregate dataframes by new story unit ids (generated by 'consolidate_adjacent_places') using 'aggregatedByAggId' and then aggregated by place name using 'aggregatedByPlaces'.

In [21]:
d_2.keys()

dict_keys(['eh_su_2', 'ek_su_2', 'fv_su_2', 'ct_su_2', 'ep_su_2', 'jr_su_2', 'bn_su_2', 'ap_su_2', 'og_su_2', 'jm_su_2'])

In [11]:
total_2 = pd.concat(d_2)
total_b5 = pd.concat(d_2_b5)
total_b10 = pd.concat(d_2_b10)

p_all = aggregatedByPlaces('2', total_2, '_p_all')
b5_p_all = aggregatedByPlaces('2_b5', total_b5, '_p_all')
b10_p_all = aggregatedByPlaces('2_b10', total_b10, '_p_all')

In [136]:
d_2['eh_su_2'].dtypes

agg_su_id                              float64
id                                      object
su_num                                  object
place                                   object
scale_order                              int64
time_start                      datetime64[ns]
time_end                        datetime64[ns]
time_length                             object
num_minutes                            float64
mention_freq                             int64
mention_index                           object
mention_places                          object
mention_match_freq                       int64
mention_match_index                     object
mention_match_places                    object
mention_coarser_match_freq               int64
mention_coarser_match_index             object
mention_coarser_match_places            object
mention_finer_match_freq                 int64
mention_finer_match_index               object
mention_finer_match_places              object
dtype: object

generate tables showing regression analyses btwn amount of minutes of discourse time regarding a given place (x) and number of simultaneous mentions of that place (y).

*the output here needs to be played with manually. Change the values inside the regression_table function to control which data are being tested.

In [175]:
def regression_table(name, df):
    keys = ['slope','intercept','r_value','p_value','std_err','n']
    newtable[name] = dict.fromkeys(keys)    
    #slope, intercept, r_value, p_value, std_err = st.linregress(df.loc[df['scale_order'] == 3, 'mention_match_freq'],df.loc[df['scale_order'] == 3, 'num_minutes'])
    #newtable[name] = {'slope':slope,'intercept':intercept,'r_value':r_value,'p_value':p_value,'std_err':std_err,'n':df.loc[df['scale_order'] == 3].shape[0]}
    slope, intercept, r_value, p_value, std_err = st.linregress(df.loc[(df['scale_order'] == 3) | (df['scale_order'] == 2), 'mention_coarser_match_freq'],df.loc[(df['scale_order'] == 3) | (df['scale_order'] == 2), 'num_minutes'])
    newtable[name] = {'slope':slope,'intercept':intercept,'r_value':r_value,'p_value':p_value,'std_err':std_err,'n':df.loc[(df['scale_order'] == 3) | (df['scale_order'] == 2)].shape[0]}
    
def append_mean(df):
    mean_row = df.mean()
    mean_df = pd.DataFrame(mean_row)
    mean_df.columns = ['mean']
    mean_df = mean_df.transpose()
    df = df.append(mean_df)
    return df

newtable = {}
for k, v in d_2.items():
    regression_table(k, v)
stats_df = pd.DataFrame(newtable).transpose()
stats_df = append_mean(stats_df)
stats_df

Unnamed: 0,intercept,n,p_value,r_value,slope,std_err
ap_su_2,2.843333,6.0,0.097278,-0.7332,-1.131667,0.524788
bn_su_2,1.216667,6.0,0.004745,0.943216,2.341667,0.412342
ct_su_2,0.999921,29.0,0.000288,0.62511,1.452969,0.349149
eh_su_2,2.089431,25.0,0.144358,0.300538,0.932927,0.617346
ek_su_2,1.675926,13.0,0.944681,-0.021399,-0.080093,1.12824
ep_su_2,1.86935,32.0,0.305459,0.186996,0.538079,0.516089
fv_su_2,1.134702,38.0,0.930381,0.014662,0.014702,0.167113
jm_su_2,1.266866,37.0,0.804444,0.042133,0.093214,0.373624
jr_su_2,4.475,3.0,0.594278,0.595033,0.908333,1.226869
og_su_2,3.778283,14.0,0.435952,0.226606,0.773737,0.96003


scratch

In [None]:
pd.options.display.max_rows = 500
#display(d22['ep_su'])
#d2['ap_su'].loc[d2['ap_su']['scale_order'] == 6].shape[0]
#print(d22['bn_su'].shape[0], total['bn']

#aggregatedByPlaces('total',total_agg)
def searchit(d_raw, search_id):
    for k, v in d_raw.items():
        print(k)
        print(v.loc[v['id'] == search_id].place)

searchit(d_raw, 13)


In [178]:
regression_table('2_b10_all_p', d_raw['2_b10_p_all'])
stats_df = pd.DataFrame(newtable).transpose()
append_mean(stats_df.iloc[0:3])


Unnamed: 0,intercept,n,p_value,r_value,slope,std_err
2_all_p,2.332115,88.0,6.494124e-07,0.501391,2.411058,0.448651
2_b10_all_p,3.128464,88.0,1.323567e-05,0.446114,1.827318,0.395303
2_b5_all_p,2.896135,88.0,1.077113e-05,0.450214,1.884786,0.403094
mean,2.785571,88.0,8.218737e-06,0.465907,2.041054,0.415683


Unnamed: 0,intercept,n,p_value,r_value,slope,std_err
2_all,2.567988,26.0,7.051967e-10,0.894761,1.009669,0.102857
2_all_p,2.567988,26.0,7.051967e-10,0.894761,1.009669,0.102857
2_b10_all,3.285109,26.0,5.011017e-10,0.897855,1.038059,0.103908
2_b5_all,2.460418,26.0,3.256865e-10,0.901622,1.047032,0.102528
ap_su_2_p,-0.726059,7.0,1.100208e-06,0.996812,0.577452,0.02067
bn_su_2_p,-6.558942,5.0,0.01540827,0.944873,2.513371,0.502863
ct_su_2_p,-0.594874,5.0,8.75665e-05,0.998254,1.348526,0.04607
eh_su_2_p,-1.858104,3.0,0.1056778,0.986254,1.178642,0.19747
ek_su_2_p,-11.849295,4.0,0.02227607,0.977724,3.980363,0.604218
ep_su_2_p,0.229423,5.0,0.002746016,0.982609,0.76925,0.083929
