# Capella manuscript analysis

In [1]:
%load_ext autoreload

Load modules with helper functions.

In [2]:
%rehashx
%matplotlib inline
%autoreload 2

import bokeh
from bokeh.charts import Bar, output_notebook, show
from bokeh.layouts import row

import matplotlib.pyplot as plt

from medDiaJson import *
from roman_date import *

ImportError: No module named 'medDiaJson'

In [None]:
output_notebook()

## Build dataframe

First, build full dataframe for all manuscripts of the JSON file. Loading of JSON file is done in module. 

In [None]:
df = medDiaCon(237)

Convert the date entries from Roman numerals to Arabic numerals and multiply them by 100, so that e.g. `IX` becomes 900. Only the first Roman numeral of an entry is evaluated. 
All additional information in the entry is discarded by this approach.

In [None]:
df['date'] = df['date'].apply(lambda row: from_roman(row)*100)

Give all available diagram types for Capella.

In [None]:
authorKey(df,'Capella','diaTyp')

Give all available dates.

In [None]:
authorKey(df,'Capella','date')


# Checks for missing data

Check for missing diagram types: Use override=True to disable data type checking. 

First, check for an empty diagram type string. 

In [None]:
reducedData(df,[['author','Capella'],['diaTyp','']])

Then, check for 0.0 float entries. 

In [None]:
reducedData(df,[['author','Capella'],['diaTyp', 0.0]])

Thus, diagrams without attributes are either marked by 0.0 or an empty string. 

However, filtering for attribute names like M18.1 ... will neglect M0 entries anyway.

# Links to the Topoi database

To show the image for a given diagram ID, the function id2image looks up the correct URL in the JSON file and displays the digilib image inside an iframe. All digilib tools should work as expected.

In [None]:
id2image(df,'MAPD0420')

Alternatively, using altId2image opens a new tab to edition.topoi

In [None]:
altId2image(df,'MAPD0420')

In [None]:
manID2image(df,'Z(3)8')

The following link opens the digilib tool, which includes the descriptions of the diagram types. 
On smaller screens, one can use the webbrowser package to open it in a new tab. 

In [None]:
import webbrowser
webbrowser.open('http://www.ancient-astronomy.org/webapplications/domenico/SliderDigilib.html')

For larger screens one can display the content inline.

In [None]:
from IPython.display import HTML
# Embed the digilib slider inline. The attribute values are quoted so that the
# URL, the width and the height are parsed as three separate attributes.
HTML('<iframe src="http://www.ancient-astronomy.org/webapplications/domenico/SliderDigilib.html" width="100%" height="450"></iframe>')

This can be useful for checking the validity of attributes etc.

To obtain the description of a diagram type, use the following. Requires more work for pretty printing...

In [None]:
diaTypeDescr(df,1)

In [None]:
imageList = []
listRows = []
listIDRows = []

url_start = 'http://repository.edition-topoi.org/MAPD/ReposMAPD/'
# Raw string, so that `\w` and `\.` reach the regex engine unchanged;
# compiled once instead of once per diagram.
jpgPattern = re.compile(r'([-\w]+\.(?:jpg))')

# NOTE: `listofTextIDs` must already be defined when this cell is run.
reddf = reducedData(df,[['author','Capella'],['textID',listofTextIDs[1]]])

# For every diagram, fetch its repository page and remember the URL of the
# first .jpg file name found in the page markup.
for diaID in reddf.diaID:
    url = url_start + diaID
    # The timeout keeps an unresponsive server from blocking the notebook forever.
    r = requests.get(url, timeout=30)
    data = r.text
    soup = BeautifulSoup(data,'lxml')
    res = jpgPattern.findall(str(soup))
    if res:
        imageList.append(url + '/' + res[0])

In [None]:
listofTextIDs = uniqueValues(reducedData(df,[['author','Capella']]),'textID')

In [None]:
textId2imagegrid(df,'Capella',listofTextIDs[1])

# Plotting diagram attributes and types

Next, replace missing diagram attributes, which are encoded by '?'. Diagrams without a type, i.e. 0.0 and '', are removed from the list of diagram types.

In [None]:
# Diagram types of Capella, leaving out the two "no type" markers '' and 0.0.
typList = [x for x in authorKey(df,'Capella','diaTyp') if x not in ('',np.float64(0))]
# One reduced dataframe per remaining diagram type.
dftempList = [reducedData(df,[['author','Capella'],['diaTyp',typ]]) for typ in typList]
# Stack the frames, renumber the rows and replace every '?' entry by None.
dfCapella = pd.concat(dftempList)
dfCapella = dfCapella.reset_index(drop=True)
dfCapella = dfCapella.replace(['?'],[None])

Now we can count the diagram types per year. The bokeh plot should be sorted by date.

In [None]:
counttempList = []

# Diagram types of Capella, leaving out the two "no type" markers '' and 0.0.
typList = [x for x in authorKey(df,'Capella','diaTyp') if x not in ('',np.float64(0))]

# Collect one (diaTyp, date, count) tuple per combination of year and type.
for year in [1200, 900, 1500, 1100, 1000]:
    # The date mask does not depend on the diagram type, so build it once per year.
    inYear = dfCapella['date']==year
    for typ in typList:
        cnt = dfCapella[(dfCapella['diaTyp']==typ) & inYear].diaTyp.count()
        counttempList.append((typ,year,cnt))

# Naming the columns in the constructor (instead of assigning them afterwards)
# also works when no tuples were collected.
dfCapellaCount = pd.DataFrame(counttempList, columns=['diaTyp','date','count'])
dfCapellaCount.sort_values(by='date',inplace=True)

In [None]:
p0 = Bar(dfCapellaCount,label='diaTyp',values='count',group='date')

show(p0)

It seems there are no entries for date 1000. Check with: 

In [None]:
dfCapella1000 = reducedData(df,[['author','Capella'],['date',np.int64(1000)]])
dfCapella1000

## Interactive selection of plot features

Using the jupyter widget interact allows defining selectors for plotting.

### Number of attributes per diagram type grouped by date

In [None]:
from ipywidgets import interact

from bokeh.io import push_notebook

def attrPlot(typ=18):
    """Show a bar chart of the 'attribute' values of one Capella diagram type, grouped by date.

    typ is converted to np.float64 before it is passed on to diaAttrPlot.
    """
    typAsFloat = np.float64(typ)
    show(Bar(diaAttrPlot(df,'Capella',typAsFloat),values='attribute',group='date'))

In [None]:
typList = [x for x in authorKey(df,'Capella','diaTyp') if x not in ('', 0)]
interact(attrPlot,typ=typList)

### Number of diagrams per date for each diagram type

In [None]:
from ipywidgets import interact

from bokeh.io import push_notebook

def diagIdUpdate(ide):
    """Show the counts of diagram type ide as a bar chart grouped by date.

    Only the rows of dfCapellaCount whose diaTyp equals ide are plotted.
    """
    isSelected = dfCapellaCount.diaTyp==ide
    show(Bar(dfCapellaCount[isSelected],label='diaTyp',values='count',group='date'))

In [None]:
ideList= list()
interact(diagIdUpdate,ide = (18,31))

## Occurrence of diagram types in each manuscript

First, create a list of DataFrames with the required information.
Diagram types range from 18 to 31 for the author Capella. 
Thus, a DataFrame with this range is created. 

Then, a dictionary is created which encodes the occurrence of a diagram type in the manuscript, and it is applied to the column 'Count'. 
Finally, columns with the information on origin and date are added. 

In [None]:
biblioList = []

# Diagram types covered for Capella (18 to 31); every per-manuscript dataframe
# gets exactly one row for each of them.
capellaTypes = list(range(18,32))

for biblio in authorKey(df,'Capella','biblio'):
    # Create reduced dataframe for this manuscript
    resTemp = reducedData(df,[['author','Capella'],['biblio',biblio]])
    # Drop all columns apart from date, biblio and diaTyp
    temp = resTemp.drop([x for x in resTemp.columns if x not in ('date','biblio','diaTyp')],axis=1)
    # Diagram types present in this manuscript; empty strings are skipped
    present = {int(x) for x in temp['diaTyp'].values if x not in ['']}
    # Mapping diaTyp -> 1 if the type is present, 0 otherwise. Building it
    # directly over the fixed range keeps a present type from being reset to 0
    # by a value that lies outside the range or is not a whole number.
    d0 = {typ: int(typ in present) for typ in capellaTypes}
    # New dataframe with all possible diagram types for Capella; it is created
    # in ascending diaTyp order, so no sorting is needed afterwards.
    dfTEMP = pd.DataFrame({'diaTyp': capellaTypes})
    dfTEMP['Count'] = dfTEMP['diaTyp'].map(d0)
    # Copy information for biblio and date. The date is read by position so
    # that it does not depend on a row label 0 existing in the reduced frame.
    dfTEMP['biblio'] = biblio
    dfTEMP['date'] = temp['date'].iloc[0]
    # Append to list of dataframes
    biblioList.append(dfTEMP)

To give interact a list of origin names, we need a function which operates on these names. 
For this purpose we can use a list comprehension with string comparison; see the list comprehension at the top of the function.

In [None]:
from ipywidgets import interact

from bokeh.io import push_notebook, gridplot
from bokeh.plotting import figure
from bokeh.charts import Bar
from bokeh.models import FixedTicker, Legend
from bokeh.palettes import viridis

def biblioDiaTyp(biblio):
    """Show the 'Count' column of one manuscript as a bar chart grouped by diagram type.

    biblio is the 'biblio' value of the wanted manuscript; it has to be equal
    to the 'biblio' entry of one dataframe in biblioList.

    Raises IndexError if no dataframe in biblioList belongs to biblio.
    """
    # Select DataFrame from list by comparing for equality. A substring test
    # would also accept a manuscript whose name is merely contained in biblio.
    matches = [frame for frame in biblioList if frame['biblio'][0] == biblio]
    if not matches:
        raise IndexError('No entry in biblioList for biblio {!r}'.format(biblio))
    data = matches[0]
    # Set title of plot for better info
    titleS = 'Origin: ' + data['biblio'][0] + '; Date: ' + str(data['date'][0]) + ' CE'
    # use palette=viridis(14) to get different color for every diagram type bar
    b1 = Bar(data,title=titleS,label='biblio',
             values='Count',group='diaTyp',bar_width=1,ylabel='Diagrams',palette=viridis(14))
    b1.xaxis.major_label_orientation = "horizontal"
    b1.xaxis.axis_label=''
    b1.legend.location = "right_center"
    b1.legend.background_fill_alpha=0.5
    show(b1)

In [None]:
biblioKeys = sorted([x for x in authorKey(df,'Capella','biblio')])

In [None]:
interact(biblioDiaTyp,biblio=biblioKeys)

### Compare all manuscripts with same date

Building all plots takes a few seconds. Plots are sorted by the number of occurring diagram types; manuscripts with the most diagram types come first. 

In [None]:
def plotDateGrid(date):
    """Show a three-column grid with one bar chart per manuscript of the given date.

    The charts are ordered by the sum of the 'Count' column of each
    manuscript, largest sum first. The assertion fails if date is not among
    the dates returned by authorKey for Capella.
    """
    # Only continue for a date that actually occurs.
    assert date in authorKey(df,'Capella','date'), 'No entries for this date.'
    # Pick the dataframes of this date ...
    sameDate = [frame for frame in biblioList if frame['date'][0] == date]
    # ... and put those with the highest 'Count' sum first.
    ordered = sorted(sameDate,key=lambda frame: frame['Count'].sum(),reverse=True)
    charts = []
    for frame in ordered:
        heading = frame['biblio'][0] + '; ' + str(frame['date'][0]) + ' CE'
        chart = Bar(frame,title=heading,label='biblio',values='Count',group='diaTyp',
                    bar_width=1,ylabel='Diagrams',palette=viridis(14),width=250,height=250,
                    legend=False)
        chart.xaxis.major_label_orientation = "horizontal"
        chart.xaxis.axis_label=''
        charts.append(chart)
    show(gridplot(charts,ncols=3))

In [None]:
plotDateGrid(900)

Next step is to compare diagram attributes by biblio index and diagram type.