to start, we import the needed python libraries:

In [85]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import glob as gl
import numpy as np

We then import the spreadsheet (.csv) data into python 'pandas' dataframes.

Several modifications to the original data were carried out on the data being imported:
- Some 'journey' story units (FV 15-18, EP 32-35, OG 63-65) were combined into one since they had been erroneously created as several story units consisting of the same journey. cumulative time stamps were erased to prevent journey segments from rendering on graphs. Refer to clip_time or to original db for reference.
- OG story units 39 and 40 to correct time error.
- CT: 7 journeys were in original dataset, but only one was retained in atlascine. This is because these 6 other journeys recount movement of others (not the storyteller/protagonist). These journeys were kept in the updated datasets and therefore render on the present graphs.
- journey data in the journey field has been manipulated (in cases where journey was applied to only one location for unknown reasons). see original data on private servers for originals.
- multi-place story units were left as is (as several story units occurring simultaneously. This means that the total duration (sum) of all story units in a story will exceed the story's running time (e.g. CT).

Because this data is public, and certain fields revealed the identity of the storyteller, the following columns have been removed for the current datasets
- '*_pm.csv' data sets have columns H-L removed (index 7-10)
- '*_su.csv' data sets have columns J, K, and N-W removed (index 9, 10, 13-22)



In [2]:
#fetch story data from csv files stored in /documents and with filename format "initials_type.csv" (e.g. og_pm, og_su). Store the fetched data as pandas dataframes inside dictionary d
storydata1 = gl.glob('documents_place_mentions/*.csv')
storydata2 = gl.glob('documents_story_units/*.csv')
d1 = {}
d2 = {}

#load csvs as dataframes into a dictionary
for story in storydata1:
    d1[story[25:30]] = pd.read_csv(story, sep=',', encoding='latin-1')

for story in storydata2:
    d2[story[22:27]] = pd.read_csv(story, sep=',', encoding='latin-1')

The data then needs to be cleaned up and organized for visualization:

In [3]:
#replace missing data with "null"
def cleanNA(df):
    df.dropna(axis=1, how='all', inplace=True)
    df.dropna(axis=0, how='all', inplace=True)
    df.fillna('null', inplace=True)

#name columns appropriately
def renameCols(name, df):
    if name[-2:] == 'pm':
        df.columns = ['id','session_num','time_clip','time', 'place', 'place_raw','scale']
    else:
        df.columns = ['id','session_num', 'su_num','time_clip_start', 'time_clip_end', 'time_start', 'time_end', 'place', 'place_raw', 'scale', 'journey']

def cleanVals(name, df):
    df['scale'] = df.scale.str.lower()
    df.loc[df['scale'].str.contains('unknown', case=False), 'scale'] = 'null'
    df['scale'].replace('city / area\n', 'city / area', inplace=True)
    #remove rows that are basically empty (make sure there are no actual story units containing no start and end-time, of course, but this should not be the case)
    if name[-2:] == 'su':
        nullList = df.loc[df['time_start'].str.contains('null', case=False) & df['time_end'].str.contains('null', case=False)].index
        for null in nullList:
            df.drop(null, inplace=True)
    #add more cleaning functions if needed
    
def timeVals(name, df):
    if name[-2:] == 'pm':
        df['time'] = pd.to_datetime(df.time, format='%H:%M:%S')
    else:
        df['time_start'] = pd.to_datetime(df.time_start, format='%H:%M:%S')
        df['time_end'] = pd.to_datetime(df.time_end, format='%H:%M:%S')

def newCols(name, df):    
    if name[-2:] == 'su':
        df.loc[df['journey'].str.contains('journey', case=False, na=False), 'scale'] = 'journey' #give scale "journey" to units that are journeys
    df['scale_order'] = df['scale']
    df.loc[df['scale'].str.contains('journey'), 'scale_order'] = '1'
    df.loc[df['scale'].str.contains('local'), 'scale_order'] = '3'
    df.loc[df['scale'].str.contains('very local'), 'scale_order'] = '2'    
    df.loc[df['scale'].str.contains('city / area'), 'scale_order'] = '4'
    df.loc[df['scale'].str.contains('region'), 'scale_order'] = '5'
    df.loc[df['scale'].str.contains('country'), 'scale_order'] = '6'
    df.loc[df['scale'].str.contains('continent'), 'scale_order'] = '7'
    df.loc[df['scale'].str.contains('null'), 'scale_order'] = '8'
    df['scale_order'] = df.scale_order.astype(int)
    
for k, v in d1.items():
    cleanNA(v)
    renameCols(k, v)
    cleanVals(k, v)
    timeVals(k, v)
    newCols(k, v)

for k, v in d2.items():
    cleanNA(v)
    renameCols(k, v)
    cleanVals(k, v)
    timeVals(k, v)
    newCols(k, v)

In [50]:

hello = d1['ap_pm']['place']
hello

0                      Haiti
1                      Haiti
2                     France
3                      Haiti
4                      Haiti
5                      Haiti
6                      Haiti
7                     France
8                     France
9              Latin America
10                 Venezuela
11                     Haiti
12                 Venezuela
13             Latin America
14                     Haiti
15                     Haiti
16                     Haiti
17                     Haiti
18                     Haiti
19                     Haiti
20                       USA
21                     Haiti
22                     Haiti
23                     Haiti
24            Jérémie, Haiti
25                     Haiti
26                     Haiti
27     Port-au-Prince, Haiti
28                     Haiti
29                     Haiti
               ...          
302                      USA
303                    Haiti
304                    Haiti
305           

In [84]:
placelist = np.array

def createList(name, df):
    df_place = df['place'].values
    placelist.append(df_place)
    #pd.concat([placelist,df_place])

for k, v in d1.items():
    createList(k, v)



TypeError: append() missing 1 required positional argument: 'to_append'

export to csv (and then move to EXPORT folder!)

In [4]:
for (k1, v1), (k2, v2) in zip(sorted(d1.items()), sorted(d2.items())):
    v1.to_csv(k1 + '.csv', sep=',', index=False, encoding='latin-1')
    v2.to_csv(k2 + '.csv', sep=',', index=False, encoding='latin-1')