In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np
import statsmodels.formula.api as smf
%matplotlib inline

In [None]:
# Load the BSB-HS CSV into `df`; the path is relative to the kernel's working directory.
df = pd.read_csv('data/BSB-HS.csv')

In [None]:
# Preview the first rows of the freshly loaded table.
df.head()

In [None]:
# Rules of the form "<qualifier> N. Jh." -> N * 100 - offset, where N * 100 is
# the last year of century N. Order matters: the bare "N. Jh." rule matches
# every qualified form as well, so it must stay last.
# German terms: Jh. = Jahrhundert (century), Hälfte = half, Mitte = middle,
# Ende = end, Viertel = quarter.
_CENTURY_RULES = (
    (re.compile(r'1\.\sHälfte\s([0-9]+)\.\sJh\.'), 75),
    (re.compile(r'2\.\sHälfte\s([0-9]+)\.\sJh\.'), 25),
    (re.compile(r'Mitte\s([0-9]+)\.\sJh\.'), 50),
    (re.compile(r'Ende\s([0-9]+)\.\sJh\.'), 10),
    (re.compile(r'1\.\sViertel\s([0-9]+)\.\sJh\.'), 80),
    (re.compile(r'2\.\sViertel\s([0-9]+)\.\sJh\.'), 60),
    (re.compile(r'3\.\sViertel\s([0-9]+)\.\sJh\.'), 40),
    (re.compile(r'4\.\sViertel\s([0-9]+)\.\sJh\.'), 20),
    # NOTE(review): a bare century maps to its first year (e.g. "15. Jh." -> 1400),
    # not its midpoint. Kept as in the original; confirm this is intended.
    (re.compile(r'([0-9]+)\.\sJh\.'), 100),
)
_CENTURY_SPAN = re.compile(r'([0-9]+)\./([0-9]+)\.\sJh\.')
_FOUR_DIGIT_YEAR = re.compile(r'([0-9]{4})')


def get_year(x):
    """Estimate a single year from a free-text German dating string.

    Rules are tried in this order; the first one that matches wins:

    1. ``"A./B. Jh."`` (a span of centuries): the midpoint of the whole span,
       i.e. of ``(A - 1) * 100 .. B * 100``. ``"14./15. Jh."`` gives 1400.
    2. ``"<qualifier> N. Jh."`` and bare ``"N. Jh."``: ``N * 100`` minus the
       offset listed in ``_CENTURY_RULES``.
    3. The first run of four digits anywhere in the text, taken as the year.

    Parameters
    ----------
    x : str or null-like
        The dating text. Non-string, non-null values are converted with
        ``str`` before matching.

    Returns
    -------
    int or None
        The estimated year, or ``None`` when ``x`` is null or no rule matches.
    """
    if pd.isnull(x):
        return None

    # Tolerate numeric cells (e.g. a bare year parsed as a number by read_csv).
    text = x if isinstance(x, str) else str(x)

    span = _CENTURY_SPAN.search(text)
    if span is not None:
        first, last = (int(group) for group in span.groups())
        # Century `first` starts at (first - 1) * 100 and century `last` ends at
        # last * 100. Averaging the two *end* years instead would put
        # "14./15. Jh." at 1450, the middle of the 15th century.
        return ((first - 1) * 100 + last * 100) // 2

    for pattern, offset in _CENTURY_RULES:
        match = pattern.search(text)
        if match is not None:
            return int(match.group(1)) * 100 - offset

    year = _FOUR_DIGIT_YEAR.search(text)
    return int(year.group(1)) if year is not None else None
    
def get_extent(x):
    """Pull the count that precedes ``Bl.`` out of an extent string.

    The first run of digits followed by one whitespace character and the
    literal ``Bl.`` is returned as an ``int`` (``Bl.`` presumably abbreviates
    "Blatt"/"Blätter", i.e. leaves; confirm against the data source).

    Returns ``None`` when ``x`` is null or contains no such token.
    """
    if pd.isnull(x):
        return None

    found = re.search(r'([0-9]+)\sBl\.', x)
    return int(found.group(1)) if found else None
    
    
def fix_lang(x):
    """Return ``x`` unchanged if it is a non-empty ``str``; otherwise ``None``.

    Anything that is not exactly a ``str`` (NaN, numbers, ``None``) and the
    empty string are all mapped to ``None``.
    """
    is_nonempty_str = type(x) is str and bool(x)
    return x if is_nonempty_str else None

In [None]:
# Add the derived analysis columns in one step:
#   Year     - year estimated from the free-text 'Creation' field
#   Pages    - count parsed from the 'Extent' field
#   Area     - Height * Width
#   Language - original value, with anything that is not a non-empty string set to None
df = df.assign(
    Year=df['Creation'].apply(get_year),
    Pages=df['Extent'].apply(get_extent),
    Area=df['Height'] * df['Width'],
    Language=df['Language'].apply(fix_lang),
)

In [None]:
# Preview again to check the columns derived in the previous cell (Year, Pages, Area, Language).
df.head()

In [None]:
# Mean page count per language, using only rows where both values are present.
df.dropna(subset=['Language', 'Pages']).groupby('Language')['Pages'].mean()

In [None]:
# Mean Area (Height * Width) per language, using only rows where both values are present.
df.dropna(subset=['Language', 'Area']).groupby('Language')['Area'].mean()

In [None]:
# Distribution of page counts per language; labels rotated because there are many languages.
df.dropna(subset=['Language', 'Pages']).boxplot(
    column='Pages', by='Language', rot=90, figsize=(15, 10)
)

In [None]:
# Distribution of Height per language.
df.dropna(subset=['Language', 'Height']).boxplot(
    column='Height', by='Language', rot=90, figsize=(15, 10)
)

In [None]:
# Distribution of Width per language.
df.dropna(subset=['Language', 'Width']).boxplot(
    column='Width', by='Language', rot=90, figsize=(15, 10)
)

In [None]:
# Distribution of Area (Height * Width) per language.
df.dropna(subset=['Language', 'Area']).boxplot(
    column='Area', by='Language', rot=90, figsize=(15, 10)
)

In [None]:
from scipy.stats import ttest_ind

# Rows for each of the two languages being compared, keeping only those
# with a known page count.
cat1 = df.loc[df['Language'] == 'German', ['Language', 'Pages']].dropna()
cat2 = df.loc[df['Language'] == 'Latin', ['Language', 'Pages']].dropna()

# Independent two-sample t-test on page counts, with scipy's default settings.
ttest_ind(cat1['Pages'], cat2['Pages'])

In [None]:
# Histogram of the estimated creation years.
df.hist('Year')

In [None]:
# Scatter plot with Height on the x-axis and Width on the y-axis.
df.plot.scatter('Height', 'Width')

In [None]:
# Ordinary least squares fit of Width on Height using the statsmodels formula API;
# the fitted result's text summary is printed.
mod = smf.ols(formula='Width ~ Height', data=df)
res = mod.fit()
print(res.summary())

In [None]:
# Scatter plot with Year on the x-axis and Pages on the y-axis.
df.plot.scatter('Year', 'Pages')

In [None]:
# Ordinary least squares fit of Pages on Year.
# Note: `mod` and `res` are rebound here, replacing the Width ~ Height fit from the earlier cell.
mod = smf.ols(formula='Pages ~ Year', data=df)
res = mod.fit()
print(res.summary())

In [None]:
# Scatter plot with Year on the x-axis and Area (Height * Width) on the y-axis.
df.plot.scatter('Year', 'Area')

In [None]:
# Pairwise correlation matrix of the numeric columns.
# The frame also holds text columns (e.g. Creation, Extent, Language). From
# pandas 2.0 on, a bare `df.corr()` no longer drops those and raises instead,
# so the numeric/bool columns are selected explicitly. This works on older
# and newer pandas alike.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Bin edges every 50 years from 700 up to and including 1600, as a plain list of ints.
bins = np.arange(700, 1601, 50).tolist()

In [None]:
# Assign each row's Year to the interval it falls into, using the edges in `bins`,
# and store the result as a new categorical column 'Year_Bins'.
# Years outside the edges become NaN, as documented for pd.cut.
df['Year_Bins'] = pd.cut(df['Year'], bins)

In [None]:
# Mean page count per 50-year bin, drawn as horizontal bars.
# 'Year_Bins' is categorical (it comes from pd.cut). `observed=False` is passed
# explicitly so that empty bins stay on the axis and so that newer pandas
# versions, which warn about and then change the default, give the same result.
df.groupby(['Year_Bins'], observed=False)['Pages'].mean().plot.barh()