To start, we import the required Python libraries:

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as dates
from matplotlib.lines import Line2D
import glob as gl

We then import the spreadsheet (.csv) data into python 'pandas' dataframes.

Several modifications to the original data were carried out on the data being imported:
- Some 'journey' story units (FV 15-18, EP 32-35, OG 63-65) were combined into one, since they had been erroneously created as several story units describing the same journey. Cumulative time stamps were erased to prevent journey segments from rendering on graphs. Refer to clip_time or to the original db for reference.
- OG story units 39 and 40 were modified to correct a time error.
- CT: 7 journeys were in the original dataset, but only one was retained in atlascine, because the 6 other journeys recount the movement of others (not the storyteller/protagonist). These journeys were kept in the updated datasets and therefore render on the present graphs.

Because this data is public, and certain fields revealed the identity of the storyteller, the following columns have been removed from the current datasets:
- '*_pm.csv' data sets have columns H-L removed (index 7-10)
- '*_su.csv' data sets have columns J, K, and N-W removed (index 9, 10, 13-22)

In [6]:
#fetch story data from csv files stored in the documents_place_mentions and documents_story_units folders, with filename format "initials_type.csv" (e.g. og_pm, og_su). Store the fetched data as pandas dataframes inside dictionaries d1 (place mentions) and d2 (story units)
from pathlib import Path

storydata1 = gl.glob('documents_place_mentions/*.csv')
storydata2 = gl.glob('documents_story_units/*.csv')

#load csvs as dataframes into dictionaries keyed by the file name without its
#folder and extension (e.g. "og_pm"). The key is derived from the file name
#itself rather than from fixed character offsets into the path, so it stays
#correct if a folder is renamed or the initials are not exactly two letters.
d1 = {
    Path(story).stem: pd.read_csv(story, sep=',', encoding='latin-1')
    for story in storydata1
}
d2 = {
    Path(story).stem: pd.read_csv(story, sep=',', encoding='latin-1')
    for story in storydata2
}

#number of files found in each folder (these counters were exposed by the
#original cell, so they are kept in case later cells read them)
i = len(storydata1)
j = len(storydata2)

The data then needs to be cleaned up and organized for visualization:

In [3]:
#replace missing data with "null"
def cleanNA(df):
    """Strip empty columns and rows from df and fill the remaining gaps, in place.

    Columns that contain no data at all are removed first, then rows that
    contain no data at all. Every missing value that is left is replaced by
    the string 'null'. Nothing is returned; df itself is modified.
    """
    #axis 1 (columns) is pruned before axis 0 (rows)
    for axis in (1, 0):
        df.dropna(axis=axis, how='all', inplace=True)
    df.fillna(value='null', inplace=True)

#name columns appropriately
def renameCols(name, df):
    """Give df its standard column names, in place.

    name -- dictionary key of the dataframe; a key ending in 'pm' selects the
            seven place-mention column names, any other key selects the
            eleven story-unit column names.
    df   -- dataframe whose columns are overwritten; its column count must
            match the selected list.
    """
    place_mention_cols = ['id','session_num','time_clip','time', 'place', 'place_raw','scale']
    story_unit_cols = ['id','session_num', 'su_num','time_clip_start', 'time_clip_end', 'time_start', 'time_end', 'place', 'place_raw', 'scale', 'journey']

    is_place_mentions = name.endswith('pm')
    df.columns = place_mention_cols if is_place_mentions else story_unit_cols

def cleanVals(name, df):
    """Normalise the 'scale' column and drop untimed story units, in place.

    name -- dictionary key of the dataframe; keys ending in 'su' additionally
            get their empty story units removed.
    df   -- dataframe to clean; expects a text 'scale' column and, for 'su'
            dataframes, text 'time_start' and 'time_end' columns.

    Scale values are lower-cased, any value containing 'unknown' becomes
    'null', and the stray 'city / area\\n' spelling is folded into
    'city / area'.
    """
    df['scale'] = df['scale'].str.lower()
    df.loc[df['scale'].str.contains('unknown', case=False), 'scale'] = 'null'
    #assign the result back instead of calling replace(inplace=True) on the
    #df['scale'] selection: that chained in-place call is not guaranteed to
    #reach df (it warns on recent pandas and is a no-op under Copy-on-Write)
    df['scale'] = df['scale'].replace('city / area\n', 'city / area')
    #remove rows that are basically empty (make sure there are no actual story units containing no start and end-time, of course, but this should not be the case)
    if name[-2:] == 'su':
        no_start = df['time_start'].str.contains('null', case=False)
        no_end = df['time_end'].str.contains('null', case=False)
        #drop every row lacking both times in a single call
        df.drop(df.index[no_start & no_end], inplace=True)
    #add more cleaning functions if needed
    
def timeVals(name, df):
    """Parse the 'HH:MM:SS' time columns of df into datetimes, in place.

    name -- dictionary key of the dataframe; a key ending in 'pm' converts the
            single 'time' column, any other key converts 'time_start' and
            'time_end'.
    df   -- dataframe whose time columns are overwritten with the parsed values.
    """
    if name[-2:] == 'pm':
        time_columns = ['time']
    else:
        time_columns = ['time_start', 'time_end']

    for column in time_columns:
        df[column] = pd.to_datetime(df[column], format='%H:%M:%S')

def newCols(name, df):
    """Add an integer 'scale_order' column ranking each row's scale, in place.

    name -- dictionary key of the dataframe; for keys ending in 'su', rows
            whose 'journey' value contains 'journey' (any case) first have
            their scale overwritten with 'journey'.
    df   -- dataframe with a text 'scale' column (and 'journey' for 'su').

    Ranks: journey 1, very local 2, local 3, city / area 4, region 5,
    country 6, continent 7, null 8. The final integer conversion fails for
    any scale value that matches none of these labels.
    """
    if name[-2:] == 'su':
        is_journey = df['journey'].str.contains('journey', case=False, na=False)
        df.loc[is_journey, 'scale'] = 'journey' #give scale "journey" to units that are journeys

    df['scale_order'] = df['scale']

    #applied in sequence: 'very local' also contains 'local', so it has to
    #come after 'local' to overwrite the rank that 'local' just assigned
    ranking = (
        ('journey', '1'),
        ('local', '3'),
        ('very local', '2'),
        ('city / area', '4'),
        ('region', '5'),
        ('country', '6'),
        ('continent', '7'),
        ('null', '8'),
    )
    for label, rank in ranking:
        df.loc[df['scale'].str.contains(label), 'scale_order'] = rank

    df['scale_order'] = df['scale_order'].astype(int)

#run the cleaning steps on every place-mention dataframe (d1) first, then on
#every story-unit dataframe (d2); each step modifies the dataframe in place
for frames in (d1, d2):
    for k, v in frames.items():
        cleanNA(v)
        for step in (renameCols, cleanVals, timeVals, newCols):
            step(k, v)

Next, we calculate some statistics for each story unit:

- create a new column
- count the place mentions whose 'time' value falls between the story unit's time_start and time_end values (inclusive)
- insert the count into the new column

- create a new column
- count the place mentions whose 'time' value falls between the time_start and time_end values (inclusive) and that have the same 'place' value as the story unit
- insert the count into the new column

In [4]:
def calcStats(name1, df1, name2, df2):
    """Annotate each story unit in df2 with the place mentions from df1 that fall inside it.

    name1 -- key of the place-mention dataframe (not used; kept so existing
             calls keep working).
    df1   -- place mentions; needs 'id', 'time' and text 'place' columns.
    name2 -- key of the story-unit dataframe (not used; kept so existing
             calls keep working).
    df2   -- story units; needs 'time_start', 'time_end' and text 'place'
             columns. Modified in place; nothing is returned.

    A mention belongs to a story unit when time_start <= time <= time_end.
    Four groups of columns are written to df2, each as <group>_freq (count),
    <group>_index (mention ids) and <group>_places (mention places), where
    the two text columns hold ';'-prefixed entries such as ';12;15':

    mention               -- every mention inside the story unit
    mention_match         -- mentions whose place equals the unit's place
    mention_coarser_match -- mentions whose place text contains the unit's
                             place without being equal to it
    mention_finer_match   -- mentions whose place text is contained in the
                             unit's place without being equal to it
    """
    groups = ('mention', 'mention_match', 'mention_coarser_match', 'mention_finer_match')

    #pull the needed columns out once: rebuilding a row with .iloc for every
    #(story unit, mention) pair made the nested loop extremely slow.
    #Timestamps are compared directly, which orders them the same way as
    #first converting them to matplotlib date numbers.
    mentions = list(zip(
        df1['time'].tolist(),
        [str(value) for value in df1['id'].tolist()],
        df1['place'].tolist(),
    ))
    units = zip(
        df2['time_start'].tolist(),
        df2['time_end'].tolist(),
        df2['place'].tolist(),
    )

    #one output list per new column, in the order the columns are created
    columns = {}
    for group in groups:
        columns[group + '_freq'] = []
        columns[group + '_index'] = []
        columns[group + '_places'] = []

    for start, end, unit_place in units:
        ids = {group: [] for group in groups}
        places = {group: [] for group in groups}

        for when, mention_id, mention_place in mentions:
            if not (start <= when <= end):
                continue

            hits = ['mention']
            if mention_place == unit_place:
                hits.append('mention_match')
            elif unit_place in mention_place:
                hits.append('mention_coarser_match')
            elif mention_place in unit_place:
                hits.append('mention_finer_match')

            for group in hits:
                ids[group].append(mention_id)
                places[group].append(mention_place)

        for group in groups:
            columns[group + '_freq'].append(len(ids[group]))
            columns[group + '_index'].append(''.join(';' + entry for entry in ids[group]))
            columns[group + '_places'].append(''.join(';' + entry for entry in places[group]))

    #lists are assigned by position, so a non-contiguous df2 index is fine
    for column, values in columns.items():
        df2[column] = values

#pair the place-mention and story-unit dataframes by sorted key order, compute
#the statistics for each pair, then write both dataframes to csv files named
#after their keys in the current working directory
for k1, k2 in zip(sorted(d1), sorted(d2)):
    v1, v2 = d1[k1], d2[k2]
    calcStats(k1, v1, k2, v2)
    v1.to_csv('{}.csv'.format(k1), sep=',')
    v2.to_csv('{}.csv'.format(k2), sep=',')


KeyboardInterrupt: 