# Time series graph of mentions and story units by geographic scale

The current page shows a visualization of story units and place mentions by geographic scale, for each of 10 life stories.

To start, we import the needed Python libraries:

In [107]:
%matplotlib notebook

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as dates
from matplotlib.lines import Line2D
import glob as gl

We then import the spreadsheet (.csv) data into python 'pandas' dataframes.

Several modifications to the original data were carried out on the data being imported:
- Some 'journey' story units (FV 15-18, EP 32-35, OG 63-65) were combined into one since they had been erroneously created as several story units consisting of the same journey. Cumulative time stamps were erased to prevent journey segments from rendering on graphs. Refer to clip_time or to the original db for reference.
- OG story units 39 and 40 were modified to correct a time error.
- CT: 7 journeys were in original dataset, but only one was retained in atlascine. This is because these 6 other journeys recount movement of others (not the storyteller/protagonist). These journeys were kept in the updated datasets and therefore render on the present graphs.

Because this data is public, and certain fields revealed the identity of the storyteller, the following columns have been removed for the current datasets:
- '*_pm.csv' data sets have columns H-L removed (index 7-10)
- '*_su.csv' data sets have columns J, K, and N-W removed (index 9, 10, 13-22)

In [108]:
#fetch story data from csv files stored in documents_place_mentions/ and documents_story_units/, with filename format "initials_type.csv" (e.g. og_pm.csv, og_su.csv). The fetched data is stored as pandas dataframes inside dictionaries d1 (place mentions) and d2 (story units)
def story_key(path):
    """Return the file name of `path` without its directory or extension (e.g. 'og_pm').

    Both '/' and '\\' are treated as directory separators so the result does
    not depend on which separator glob returns.
    """
    filename = path.replace('\\', '/').rsplit('/', 1)[-1]
    return filename.rsplit('.', 1)[0]

storydata1 = gl.glob('documents_place_mentions/*.csv')
storydata2 = gl.glob('documents_story_units/*.csv')

#load csvs as dataframes into a dictionary, keyed by file name instead of by fixed character offsets into the path
d1 = {story_key(story): pd.read_csv(story, sep=',', encoding='latin-1') for story in storydata1}
d2 = {story_key(story): pd.read_csv(story, sep=',', encoding='latin-1') for story in storydata2}

#number of files read from each folder
i = len(storydata1)
j = len(storydata2)

The data then needs to be cleaned up and organized for visualization:

In [109]:
#replace missing data with "null"
def cleanNA(df):
    """Tidy missing values in `df` in place; returns None.

    Columns that are entirely empty are dropped first, then rows that are
    entirely empty, and every remaining missing cell is set to the string 'null'.
    """
    for axis in (1, 0):
        df.dropna(axis=axis, how='all', inplace=True)
    df.fillna('null', inplace=True)

#name columns appropriately
def renameCols(name, df):
    """Overwrite the column labels of `df` in place, based on the dataset type.

    name -- dataframe key; a key ending in 'pm' gets the place-mention labels,
            any other key gets the story-unit labels
    df   -- dataframe whose column count must match the chosen label list
    """
    pm_labels = ['id','session_num','time_clip','time', 'place', 'place_raw','scale']
    su_labels = ['id','session_num', 'su_num','time_clip_start', 'time_clip_end', 'time_start', 'time_end', 'place', 'place_raw', 'scale', 'journey']
    df.columns = pm_labels if name.endswith('pm') else su_labels

def cleanVals(name, df):
    """Normalise the 'scale' column of `df` in place and drop empty story units.

    name -- dataframe key; keys ending in 'su' additionally have rows removed
            when both 'time_start' and 'time_end' contain 'null'
    df   -- dataframe with string-valued 'scale' (and, for 'su' data,
            'time_start' / 'time_end') columns

    Returns None; `df` is modified in place.
    """
    df['scale'] = df.scale.str.lower()
    df.loc[df['scale'].str.contains('unknown', case=False), 'scale'] = 'null'
    #assign the result back: calling replace(..., inplace=True) on df['scale'] acts on an intermediate object and is not guaranteed to update df
    df['scale'] = df['scale'].replace('city / area\n', 'city / area')
    #remove rows that are basically empty (make sure there are no actual story units containing no start and end-time, of course, but this should not be the case)
    if name[-2:] == 'su':
        bothNull = df['time_start'].str.contains('null', case=False) & df['time_end'].str.contains('null', case=False)
        df.drop(df.index[bothNull], inplace=True)
    #add more cleaning functions if needed
    
def timeVals(name, df):
    """Convert the time columns of `df` from 'H:M:S' strings to datetimes, in place.

    Keys ending in 'pm' have a single 'time' column; every other key is
    treated as story-unit data with 'time_start' and 'time_end' columns.
    """
    timeColumns = ['time'] if name[-2:] == 'pm' else ['time_start', 'time_end']
    for column in timeColumns:
        df[column] = pd.to_datetime(df[column], format='%H:%M:%S')

def newCols(name, df):
    """Add an integer 'scale_order' column to `df` (in place) ranking each row's scale.

    For keys ending in 'su', rows whose 'journey' column mentions 'journey'
    first have their scale overwritten with 'journey'.
    """
    if name[-2:] == 'su':
        isJourney = df['journey'].str.contains('journey', case=False, na=False)
        df.loc[isJourney, 'scale'] = 'journey' #give scale "journey" to units that are journeys

    df['scale_order'] = df['scale']
    #applied in order: 'local' also matches 'very local', so 'very local' must come after it to overwrite the 3 with a 2
    scaleRanks = [
        ('journey', '1'),
        ('local', '3'),
        ('very local', '2'),
        ('city / area', '4'),
        ('region', '5'),
        ('country', '6'),
        ('continent', '7'),
        ('null', '8'),
    ]
    for keyword, rank in scaleRanks:
        df.loc[df['scale'].str.contains(keyword), 'scale_order'] = rank
    df['scale_order'] = df.scale_order.astype(int)

#run the same cleaning steps, in the same order, on every dataframe: place mentions (d1) first, then story units (d2)
for frames in (d1, d2):
    for k, v in frames.items():
        cleanNA(v)
        renameCols(k, v)
        cleanVals(k, v)
        timeVals(k, v)
        newCols(k, v)

Here are some variables and functions we will need during the graph-creation process:

In [147]:
#the colour choices for when the data is in a default or 'selected' state
pm_color = '#cccccc'
su_color = '#bdd7e7'
pm_color_alt = '#252525'
su_color_alt = '#08519c'

#this function is what actually puts the data onto the graph
def generateVspans(name1, df1, name2, df2, ax):
    """Draw every story unit and place mention onto `ax` as a vertical span.

    name1, name2 -- dataframe keys (not used inside this function)
    df1 -- place mentions, with 'time', 'scale_order' and 'place' columns
    df2 -- story units, with 'time_start', 'time_end', 'scale_order' and 'place' columns
    ax  -- axes to draw on

    Each span occupies the eighth of the axes height selected by its
    scale_order, is pickable, carries the place as its label, and is tagged
    with gid 'su' or 'pm'. Story units are drawn first (zorder 2), place
    mentions on top (zorder 3). Returns None.
    """
    storyUnits = zip(df2['time_start'], df2['time_end'], df2['scale_order'], df2['place'])
    for start, end, order, place in storyUnits:
        ax.axvspan(
            xmin = dates.date2num(start),
            xmax = dates.date2num(end),
            ymin = order/8-(1/8),
            ymax = order/8,
            facecolor = su_color, alpha = 1, zorder=2, picker=5, label=place, gid = 'su',
        )

    placeMentions = zip(df1['time'], df1['scale_order'], df1['place'])
    for moment, order, place in placeMentions:
        #a mention has no duration, so it is drawn as a thin sliver around its time stamp
        ax.axvspan(
            xmin = dates.date2num(moment)-0.0001,
            xmax = dates.date2num(moment)+0.0001,
            ymin = order/8-(1/8),
            ymax = order/8,
            facecolor = pm_color, alpha = 1, zorder=3, picker=5, label=place, gid = 'pm',
        )

In [148]:
#the main plotting function

#figures created by plotAll, keyed by the first two characters of the place-mention dataframe name (e.g. 'og')
figure = {}

def _spanXLimits(span):
    """Return the (left, right) x positions of a patch created by ax.axvspan.

    NOTE(review): the patch class returned by axvspan appears to depend on the
    Matplotlib version (polygon vs. rectangle) - both shapes are handled here;
    confirm against the installed version.
    """
    if hasattr(span, 'get_width'):
        #rectangle-style patch: anchor x plus width
        left = span.get_x()
        return left, left + span.get_width()
    #polygon-style patch: the extremes of its vertices
    xs = [vertex[0] for vertex in span.get_xy()]
    return min(xs), max(xs)

def plotAll(name1, df1, name2, df2):
    """Create the interactive figure for one story and store it in `figure`.

    name1 -- place-mention dataframe key; its first two characters become the
             plot title (upper-cased) and the key used in `figure`
    df1   -- cleaned place-mention dataframe
    name2 -- story-unit dataframe key (only passed on to generateVspans)
    df2   -- cleaned story-unit dataframe; its last 'time_end' sets the plot width

    Clicking a span shows its place label above the plot, brackets the span
    with dashed vertical lines, and toggles the highlight colour of every span
    with the same label. Returns None.
    """
    #for calculating fig x-axis length (to make it proportional to each story's timespan)
    #read hour/minute from the timestamp itself rather than slicing the printed Series at fixed character positions
    lastEnd = df2['time_end'].iloc[-1]
    xaxisHour = lastEnd.hour
    xaxisMin = lastEnd.minute

    fig = plt.figure(num=None, figsize=(25, 7), dpi=80, facecolor='beige', edgecolor='k')
    ax = plt.axes([0.05,0.2,((xaxisHour+(xaxisMin/60))/10)*1.5,0.7], xmargin=0.0, ymargin=0.0)

    #invisible plot: use time span of su data since they represent entire span of interview.
    ax.plot_date(x=df2.time_start, y=df2.scale_order, marker = '|', markersize=40, color='b', alpha=0, zorder=3)
    generateVspans(name1, df1, name2, df2, ax)

    #limits are captured before the y ticks are set below
    xmin, xmax = ax.get_xlim()
    ymin, _ = ax.get_ylim()

    legend_elements = [Line2D([0], [0], color=pm_color, lw=0.5, label='Place mention'), Line2D([0], [0], color=su_color, lw=5, label='Story unit', alpha=1)]
    ax.legend(handles=legend_elements, bbox_to_anchor=(1, 1.15))
    plt.title(name1[:2].upper(), y=1.02, x=0.1, fontsize = 35, color='gray')

    ax.set_xlabel('discourse time (hh:mm:ss)', fontsize=14, color='gray')
    ax.set_ylabel('geographic scale', fontsize=14, color='gray')

    hours = dates.HourLocator()
    minutes = dates.MinuteLocator(byminute=[15,30,45], interval=1)
    hoursFmt = dates.DateFormatter('%H:%M:%S')

    ax.xaxis.set_major_locator(hours)
    ax.xaxis.set_major_formatter(hoursFmt)
    ax.xaxis.set_minor_locator(minutes)

    ax.set_yticks([2,3,4,5,6,7,8,9], minor=False)
    ax.set_yticklabels(['journey','very local','local','city / area','region','country','continent','none'], minor=False)

    ax.xaxis.grid(color='#D3D3D3', linestyle='dashed', linewidth=0.5, zorder = 1)
    ax.hlines(y = [2,3,4,5,6,7,8,9], xmin = xmin, xmax = xmax, color='#D3D3D3', linestyle='dashed', linewidth=0.5, zorder = 1)

    plt.xticks(rotation=45)

    #artists added by the previous click, kept so that exactly those can be removed on the next click
    #(deleting "the first collection" instead would remove the horizontal guide lines on the first click)
    highlight = {'lines': None, 'text': None}

    #the click function
    def onpick(event):
        artist = event.artist
        lab = artist.get_label()

        #delete previously-added onpick vlines and text
        for key in ('lines', 'text'):
            if highlight[key] is not None:
                highlight[key].remove()
                highlight[key] = None

        highlight['text'] = ax.annotate(lab, (0.2, 1.05), xycoords = 'axes fraction',fontsize=14, gid = 'text', color = 'k', backgroundcolor = 'white')

        #generate and add vlines around clicked element
        vline_x1, vline_x2 = _spanXLimits(artist)
        highlight['lines'] = ax.vlines(x = [vline_x1, vline_x2], ymin = ymin, ymax = 9, color='k', linestyle='dashed', linewidth=0.5, zorder = 6, gid = 'newlines')

        #toggle colors of other artists that have the same 'place' label
        for a in ax.get_children():
            gid_other = a.get_gid()
            if gid_other in ('pm_alt', 'su_alt'):
                #revert altered colors to original colors
                base = gid_other[:2]
                a.set_gid(base)
                a.set_color(pm_color if base == 'pm' else su_color)
            elif gid_other in ('pm', 'su') and a.get_label() == lab:
                #change colors if matching place label of clicked element
                a.set_color(pm_color_alt if gid_other == 'pm' else su_color_alt)
                a.set_gid(gid_other+'_alt')
            #every other artist (axis, legend, annotation, guide lines...) is left untouched

    fig.canvas.mpl_connect('pick_event', onpick)
    #https://stackoverflow.com/questions/24960910/how-can-i-pass-parameters-to-on-key-in-fig-canvas-mpl-connectkey-press-event/24960937
    #connection_id3 = fig.canvas.mpl_connect("motion_notify_event", lambda event: hover(event, mainplot, annot))

    #store figure in variable
    figure[name1[:2]] = fig
    

In [149]:
#each plot requires both the pm dataframe and su dataframe

for (k1, v1), (k2, v2) in zip(sorted(d1.items()), sorted(d2.items())):
    plotAll(k1, v1, k2, v2)

<IPython.core.display.Javascript object>

The graphs take a fixed width of 25 inches (units tbd), and calculate the length of the graph (x axis) as a proportion of this fixed viewpane. Thus, for example, OG's story, which is 5.13 hours long, takes up 51.3% of the width of the viewpane, multiplied by 1.5 to take up roughly 75% of the viewpane. In this way, each graph's x-axis length (plot width) appears proportional to the story's length (note that there are still very minute differences in absolute distance between hour-tick marks on the x-axis between graphs, which has yet to be corrected).

- "Multi-place" story units are disaggregated into their respective places, each presented within its respective 'scale' category. Each place spans the whole timespan of the story unit from which it came.
- Because "Journey" story units tend to represent a consecutive series of places (movement between them in a single direction), they are not disaggregated in the same way as "multiplace" story units, and are instead presented as their own "journey" scale category.

scraps

In [113]:
#scratch cell: nothing below executes - the first lines are commented out and the rest is wrapped in a string literal, which the cell merely evaluates
#artists = figure['ap'].axes[0].get_children()
#gid_o = artists[100].get_gid()
#artists[100].set_gid(gid_o+"_alt")
#artists[100].get_gid()[3:]

#hi = figure['ap'].axes[0].get_children()
#del artists[0]
#artists
#figure['jm'].axes[0].get_children().remove('Text')
"""
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

i = 0
testerr = {}
for i in range(440):
    hi = figure['og'].axes[0].get_children()[i].get_gid()
    testerr[i] = hi
    i+=1
#figure['fv'].axes[0].lines[0].properties()['color']
#artists

del(d1['bn_pm'])
del(d1['ek_pm'])
del(d1['ct_pm'])
del(d1['eh_pm'])
del(d1['ap_pm'])
del(d1['ep_pm'])
del(d1['jm_pm'])
del(d1['jr_pm'])
del(d1['fv_pm'])
del(d2['bn_su'])
del(d2['ek_su'])
del(d2['ct_su'])
del(d2['eh_su'])
del(d2['ap_su'])
del(d2['ep_su'])
del(d2['jm_su'])
del(d2['jr_su'])
del(d2['fv_su'])

"""

"\ni = 0\ndick = {}\nfor i in range(440):\n    hi = figure['og'].axes[0].get_children()[i].get_gid()\n    dick[i] = hi\n    i+=1\n#figure['fv'].axes[0].lines[0].properties()['color']\n#artists\n"

{'og_pm':       id  session_num time_clip                time  \
0      1            1   0:01:15 1900-01-01 00:01:15   
1      2            1   0:01:30 1900-01-01 00:01:30   
2      3            1   0:01:35 1900-01-01 00:01:35   
3      4            1   0:01:40 1900-01-01 00:01:40   
4      5            1   0:01:42 1900-01-01 00:01:42   
5      6            1   0:06:00 1900-01-01 00:06:00   
6      7            1   0:13:45 1900-01-01 00:13:45   
7      8            1   0:22:30 1900-01-01 00:22:30   
8      9            1   0:23:10 1900-01-01 00:23:10   
9     10            1   0:25:03 1900-01-01 00:25:03   
10    11            1   0:28:04 1900-01-01 00:28:04   
11    12            1   0:28:20 1900-01-01 00:28:20   
12    13            1   0:20:05 1900-01-01 00:20:05   
13    14            1   0:30:10 1900-01-01 00:30:10   
14    15            1   0:30:15 1900-01-01 00:30:15   
15    16            1   0:30:20 1900-01-01 00:30:20   
16    17            1   0:38:30 1900-01-01 00:38:30   
