In [1]:
import gzip            
import io              
import numpy as np
import pandas as pd    
import pymysql.cursors 
import rdflib
from rdflib import Namespace
import urllib.request 
import math

import matplotlib.pyplot as plt
plt.style.use('ggplot')

%matplotlib inline

# Connect to the database

connection = pymysql.connect(host='hosting.nyu.edu',
                             user='cmrougha_adsq',
                             password='###REPLACE###',
                             db='cmrougha_adsq2017',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)

# Date

In [84]:
sql = """
SELECT astrID_wi, DateLit, Siecle, Date
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NOT NULL )
AND ( Date != "" )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

4727


Unnamed: 0,astrID_wi,DateLit,Siecle,Date
0,9331503,,16,Fragmentum
1,9593102,,10,925
2,8490393,,19,1898
3,5911876,,19,1868
4,7772967,,19,1855


## Date: Simplest Case (925)

In [82]:
sql = """
SELECT astrID_wi, DateLit, Siecle, Date, CAST(Date AS DECIMAL(4,0)) AS hasExactDate
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NOT NULL )
AND ( Date != "" )
AND ( Date NOT LIKE '%-%' )
AND ( Date NOT LIKE '% %' )
AND ( Date NOT LIKE '%.%' )
AND ( Date NOT LIKE '%,%' )
AND ( Date NOT LIKE '%?%' )
AND ( Date NOT LIKE '%/%' )
AND ( Date NOT LIKE '%F%' )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

2413


Unnamed: 0,astrID_wi,DateLit,Siecle,Date,hasExactDate
0,9593102,,10,925,925
1,8490393,,19,1898,1898
2,5911876,,19,1868,1868
3,7772967,,19,1855,1855
4,9792452,,19,1855,1855


## Date: Simple Range (1849-1853)

In [81]:
sql = """
SELECT astrID_wi, DateLit, Siecle, Date, CAST(SUBSTRING_INDEX(Date,'-',1) AS DECIMAL(4,0)) AS hasExactDateStart, CAST(SUBSTRING_INDEX(Date,'-',-1) AS DECIMAL(4,0)) AS hasExactDateEnd
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NOT NULL )
AND ( Date != "" )
AND ( Date LIKE '%-%' )
AND ( Date NOT LIKE '% %' )
AND ( Date NOT LIKE '%.%' )
AND ( Date NOT LIKE '%,%' )
AND ( Date NOT LIKE '%?%' )
AND ( Date NOT LIKE '%/%' )
AND ( Date NOT LIKE '%F%' )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

1145


Unnamed: 0,astrID_wi,DateLit,Siecle,Date,hasExactDateStart,hasExactDateEnd
0,4324702,,19,1849-1853,1849,1853
1,4407381,,19,1849-1853,1849,1853
2,2397495,,19,1849-1853,1849,1853
3,3752645,,19,1849-1853,1849,1853
4,4753180,,19,1849-1853,1849,1853


In [80]:
sql = """
SELECT astrID_wi, DateLit, Siecle, Date, CAST(SUBSTRING_INDEX(Date,'/',1) AS DECIMAL(4,0)) AS hasExactDateStart, CAST(SUBSTRING_INDEX(Date,'/',-1) AS DECIMAL(4,0)) AS hasExactDateEnd
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NOT NULL )
AND ( Date != "" )
AND ( Date NOT LIKE '%-%' )
AND ( Date NOT LIKE '% %' )
AND ( Date NOT LIKE '%.%' )
AND ( Date NOT LIKE '%,%' )
AND ( Date NOT LIKE '%?%' )
AND ( Date LIKE '%/%' )
AND ( Date NOT LIKE '%F%' )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

8


Unnamed: 0,astrID_wi,DateLit,Siecle,Date,hasExactDateStart,hasExactDateEnd
0,4615440,,15,1450/1451,1450,1451
1,3472043,,15,1450/1451,1450,1451
2,3263283,,15,1450/1451,1450,1451
3,7016513,,15,1450/1451,1450,1451
4,1341861,,15,1450/1451,1450,1451


In [79]:
sql = """
SELECT astrID_wi, DateLit, Siecle, Date, CAST(SUBSTRING_INDEX(Date,', ',1) AS DECIMAL(4,0)) AS hasExactDateStart, CAST(SUBSTRING_INDEX(Date,', ',-1) AS DECIMAL(4,0)) AS hasExactDateEnd
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NOT NULL )
AND ( Date != "" )
AND ( Date LIKE '%, %' )
AND ( Date NOT LIKE '%-%' )
AND ( Date NOT LIKE '%.%' )
AND ( Date NOT LIKE '%?%' )
AND ( Date NOT LIKE '%/%' )
AND ( Date NOT LIKE '%F%' )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

11


Unnamed: 0,astrID_wi,DateLit,Siecle,Date,hasExactDateStart,hasExactDateEnd
0,6740562,,18,"1739, 1750",1739,1750
1,2805067,,18,"1739, 1750",1739,1750
2,5190777,,18,"1739, 1750",1739,1750
3,5955700,,18,"1739, 1750",1739,1750
4,3769430,,18,"1739, 1750",1739,1750


## Date: Simple Approximation (1845 ca OR 1785 ca. OR 1500 circa)

In [78]:
sql = """
SELECT astrID_wi, DateLit, Siecle, Date, CAST(SUBSTRING_INDEX(Date,' ',1) AS DECIMAL(4,0)) AS hasApproxDate
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NOT NULL )
AND ( Date != "" )
AND ( Date LIKE '% ca%' OR Date LIKE '%circa%' )
AND ( Date NOT LIKE '%-%' )
AND ( Date NOT LIKE '%,%' )
AND ( Date NOT LIKE '%?%' )
AND ( Date NOT LIKE '%/%' )
AND ( Date NOT LIKE '%F%' )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

790


Unnamed: 0,astrID_wi,DateLit,Siecle,Date,hasApproxDate
0,7128037,,19,1845 ca,1845
1,2141526,,18,1785 ca.,1785
2,8312227,,18,1785 ca.,1785
3,2945572,,18,1785 ca.,1785
4,2611836,,18,1785 ca.,1785


In [77]:
sql = """
SELECT astrID_wi, DateLit, Siecle, Date, CAST(SUBSTRING_INDEX(SUBSTRING_INDEX(Date,' ',1),'-',1) AS DECIMAL(4,0)) AS hasApproxStartDate, CAST(SUBSTRING_INDEX(SUBSTRING_INDEX(Date,' ',1),'-',-1) AS DECIMAL(4,0)) AS hasApproxEndDate
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NOT NULL )
AND ( Date != "" )
AND ( Date LIKE '% ca%' OR Date LIKE '%circa%' )
AND ( Date LIKE '%-%' )
AND ( Date NOT LIKE '%,%' )
AND ( Date NOT LIKE '%?%' )
AND ( Date NOT LIKE '%/%' )
AND ( Date NOT LIKE '%F%' )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

117


Unnamed: 0,astrID_wi,DateLit,Siecle,Date,hasApproxStartDate,hasApproxEndDate
0,9037184,,18 ex.,1788-1796 ca.,1788,1796
1,1535068,,18 ex.,1788-1796 ca.,1788,1796
2,7358047,,18 ex.,1788-1796 ca.,1788,1796
3,3468881,,18 ex.,1788-1796 ca.,1788,1796
4,8002765,,18 ex.,1788-1796 ca.,1788,1796


In [76]:
sql = """
SELECT astrID_wi, DateLit, Siecle, Date, CAST(SUBSTRING_INDEX(SUBSTRING_INDEX(Date,' ',1),'-',1) AS DECIMAL(4,0)) AS hasUncertainStartDate, CAST(SUBSTRING_INDEX(SUBSTRING_INDEX(Date,' ',1),'-',-1) AS DECIMAL(4,0)) AS hasUncertainEndDate
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NOT NULL )
AND ( Date != "" )
AND ( Date LIKE '%-%' )
AND ( Date LIKE '%?%' )
AND ( Date NOT LIKE '%c%' )
AND ( Date NOT LIKE '%,%' )
AND ( Date NOT LIKE '%/%' )
AND ( Date NOT LIKE '%F%' )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

1


Unnamed: 0,astrID_wi,DateLit,Siecle,Date,hasUncertainStartDate,hasUncertainEndDate
0,9097714,,18 in.,1707-1709 ?,1707,1709


In [75]:
sql = """
SELECT astrID_wi, DateLit, Siecle, Date, CAST(SUBSTRING_INDEX(Date,' ',1) AS DECIMAL(4,0)) AS hasUncertainDate
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NOT NULL )
AND ( Date != "" )
AND ( Date NOT LIKE '% c%' )
AND ( Date NOT LIKE '%-%' )
AND ( Date NOT LIKE '%,%' )
AND ( Date LIKE '%?%' )
AND ( Date NOT LIKE '%/%' )
AND ( Date NOT LIKE '%F%' )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

84


Unnamed: 0,astrID_wi,DateLit,Siecle,Date,hasUncertainDate
0,7870989,,16,1578 ?,1578
1,2705167,,16,1560 ?,1560
2,1537801,,15,1489?,1489
3,6312752,,15,1489?,1489
4,2400250,,15,1489?,1489


In [74]:
sql = """
SELECT astrID_wi, DateLit, Siecle, Date, CAST(SUBSTRING_INDEX(Date,' ',1) AS DECIMAL(4,0)) AS hasExactDateEnd
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NOT NULL )
AND ( Date != "" )
AND ( Date LIKE "%ante%" )
AND ( Date NOT LIKE '%-%' )
AND ( Date NOT LIKE '%.%' )
AND ( Date NOT LIKE '%,%' )
AND ( Date NOT LIKE '%?%' )
AND ( Date NOT LIKE '%/%' )
AND ( Date NOT LIKE '%F%' )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

130


Unnamed: 0,astrID_wi,DateLit,Siecle,Date,hasExactDateEnd
0,2154097,,17,1691 ante,1691
1,4361775,,17,1691 ante,1691
2,1078478,,17,1691 ante,1691
3,8677738,,17,1691 ante,1691
4,5920779,,17,1691 ante,1691


In [73]:
sql = """
SELECT astrID_wi, DateLit, Siecle, Date, CAST(SUBSTRING_INDEX(Date,' ',1) AS DECIMAL(4,0)) AS hasExactDateStart
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NOT NULL )
AND ( Date != "" )
AND ( Date LIKE "%post%" )
AND ( Date NOT LIKE '%-%' )
AND ( Date NOT LIKE '%.%' )
AND ( Date NOT LIKE '%,%' )
AND ( Date NOT LIKE '%?%' )
AND ( Date NOT LIKE '%/%' )
AND ( Date NOT LIKE '%F%' )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

24


Unnamed: 0,astrID_wi,DateLit,Siecle,Date,hasExactDateStart
0,4332905,,16,1502 post,1502
1,6382173,,15 med.,1453 post,1453
2,1089753,,15,1453 post,1453
3,3001967,,15 med.,1453 post,1453
4,7809334,,15 med.,1453 post,1453


In [72]:
sql = """
SELECT astrID_wi, DateLit, Siecle, Date, CAST(SUBSTRING_INDEX(SUBSTRING_INDEX(Date,' ',1),'/',-1) AS DECIMAL(4,0)) AS hasExactDateEnd
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NOT NULL )
AND ( Date != "" )
AND ( Date LIKE "%ante%" )
AND ( Date NOT LIKE '%-%' )
AND ( Date NOT LIKE '%.%' )
AND ( Date NOT LIKE '%,%' )
AND ( Date NOT LIKE '%?%' )
AND ( Date LIKE '%/%' )
AND ( Date NOT LIKE '%F%' )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

1


Unnamed: 0,astrID_wi,DateLit,Siecle,Date,hasExactDateEnd
0,2342459,,15,1488/1489 ante,1489


In [71]:
# There is no instance of eg 1488/1489 post in the table
sql = """
SELECT astrID_wi, DateLit, Siecle, Date, CAST(SUBSTRING_INDEX(SUBSTRING_INDEX(Date,' ',1),'/',1) AS DECIMAL(4,0)) AS hasExactDateStart
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NOT NULL )
AND ( Date != "" )
AND ( Date LIKE "%post%" )
AND ( Date NOT LIKE '%-%' )
AND ( Date NOT LIKE '%.%' )
AND ( Date NOT LIKE '%,%' )
AND ( Date NOT LIKE '%?%' )
AND ( Date LIKE '%/%' )
AND ( Date NOT LIKE '%F%' )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

PandasError: DataFrame constructor not properly called!

In [70]:
sql = """
SELECT astrID_wi, DateLit, Siecle, Date, Date AS hasExactDate
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NOT NULL )
AND ( Date != "" )
AND ( Date NOT LIKE '%-%' )
AND ( Date NOT LIKE '% %' )
AND ( Date LIKE '%.%' )
AND ( Date NOT LIKE '%,%' )
AND ( Date NOT LIKE '%?%' )
AND ( Date NOT LIKE '%/%' )
AND ( Date NOT LIKE '%F%' )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

2


Unnamed: 0,astrID_wi,DateLit,Siecle,Date,hasExactDate
0,8385610,,14,1388.10.26,1388.10.26
1,8385610,,14,1383.4.1,1383.4.1


# Siecle

In [88]:
sql = """
SELECT astrID_wi, DateLit, Siecle, Date
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NULL OR Date = "" )
AND ( Siecle != "" )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

31354


Unnamed: 0,astrID_wi,DateLit,Siecle,Date
0,7238870,,18,
1,1960860,,18,
2,5344980,,19,
3,6661882,,19,
4,7662980,,19,


In [94]:
sql = """
SELECT astrID_wi, DateLit, Siecle, Date,
 CAST(CONCAT(Siecle,"00") AS DECIMAL(4,0)) - 100 AS hasApproxDateStart,
 CAST(CONCAT(Siecle,"00") AS DECIMAL(4,0)) - 1 AS hasApproxDateEnd
FROM `17.4.12_pinAstr_all`
WHERE ( Date IS NULL OR Date = "" )
AND ( Siecle != "" )
AND ( Siecle NOT LIKE "% %" )
AND ( Siecle NOT LIKE "%-%" )
AND ( Siecle NOT LIKE "%/%" )
AND ( Siecle NOT LIKE "%,%" )
AND ( Siecle NOT LIKE "%.%" )
AND ( Siecle NOT LIKE "%?%" )
AND ( Siecle NOT LIKE "%(%" )
AND ( Siecle NOT LIKE "%)%" )
ORDER BY Date DESC
"""

with connection.cursor() as cursor:

    cursor.execute(sql)
    names = [ x[0] for x in cursor.description]
    result = cursor.fetchall()

df = pd.DataFrame(result, columns = names)
print(len(df))
df.head()

19612


  self._do_get_result()


Unnamed: 0,astrID_wi,DateLit,Siecle,Date,hasApproxDateStart,hasApproxDateEnd
0,7238870,,18,,1700,1799
1,1960860,,18,,1700,1799
2,5344980,,19,,1800,1899
3,6661882,,19,,1800,1899
4,7662980,,19,,1800,1899
