In [149]:
import pymysql
import sqlalchemy
import pandas as pd

In [150]:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# Credentials are kept out of the notebook: auth.txt must hold exactly three
# lines, in this order - password, user name, and the part of the URL that
# follows the '@' (host and database).
with open('auth.txt', 'r') as f:
    keys = f.read().splitlines()
PWD, USR, DB = keys

# Connection URL for MySQL through the PyMySQL driver
SQLALCHEMY_DATABASE_URI = f"mysql+pymysql://{USR}:{PWD}@{DB}"

engine = create_engine(SQLALCHEMY_DATABASE_URI)

# Session factory bound to the engine; call Session() to open a session
Session = sessionmaker(bind=engine)

# Declarative base for the ORM models, created with the engine as its bind
Base = declarative_base(engine)

In [151]:
from sqlalchemy import Column, Integer, String, Date, Numeric, desc

class VWPH(Base):
    """ORM mapping for ``vw_ph``: pressure-history (ph) periods recorded per site.

    ``ph_id`` is declared as the primary key, so each mapped row is identified
    by its pressure-history period id.
    NOTE(review): the ``vw_`` prefix suggests this is a database view rather
    than a base table - confirm.
    """
    __tablename__ = 'vw_ph'
    
    # Identification of the study / site the period belongs to
    study_id = Column(Integer)
    site_id = Column(Integer)
    site_name = Column(String)

    # When the period applies
    ph_verb_start_year = Column(Integer)
    ph_verb_end_year = Column(Integer)
    ph_start_date = Column(Date)
    ph_end_date = Column(Date)
    ph_effective_date = Column(Date)

    # Primary key required by the ORM to identify rows
    ph_id = Column(Integer, primary_key=True)

    pulse_disturbance = Column(String)
    pulse_intensity = Column(String)

    # Pressure state descriptors of the period
    land_use = Column(String)
    land_use_intensity = Column(String)
    source_habitat_description = Column(String)
    managed_for_biodiversity = Column(String)
    habitat_patch_area_unit = Column(String)
    habitat_patch_area_value = Column(Numeric)
    restoration_type = Column(String)
    ff1 = Column(String)
    ff2 = Column(String)
    ff3 = Column(String)
    crop = Column(String)
    organic = Column(String)
    aes = Column(String)
    fragmentation_layout = Column(String)


In [152]:
# Display the Table object that the declarative mapping generated for VWPH
VWPH.__table__

Table('vw_ph', MetaData(bind=Engine(mysql+pymysql://sarav:***@LSCI-G78FG52:3306/predicts_2)), Column('study_id', Integer(), table=<vw_ph>), Column('site_id', Integer(), table=<vw_ph>), Column('site_name', String(), table=<vw_ph>), Column('ph_verb_start_year', Integer(), table=<vw_ph>), Column('ph_verb_end_year', Integer(), table=<vw_ph>), Column('ph_start_date', Date(), table=<vw_ph>), Column('ph_end_date', Date(), table=<vw_ph>), Column('ph_effective_date', Date(), table=<vw_ph>), Column('ph_id', Integer(), table=<vw_ph>, primary_key=True, nullable=False), Column('pulse_disturbance', String(), table=<vw_ph>), Column('pulse_intensity', String(), table=<vw_ph>), Column('land_use', String(), table=<vw_ph>), Column('land_use_intensity', String(), table=<vw_ph>), Column('source_habitat_description', String(), table=<vw_ph>), Column('managed_for_biodiversity', String(), table=<vw_ph>), Column('habitat_patch_area_unit', String(), table=<vw_ph>), Column('habitat_patch_area_value', Numeric(), 

In [153]:
session = Session()

try:
    # Pull back results from vw_ph for sites with site_id < 15, ordered by
    # site_id so that all rows of one site are adjacent.
    #
    # .all() runs the SELECT now and returns a plain list. Without it,
    # all_results would be a lazy Query that only hits the database when it is
    # first iterated - i.e. after the session below has already been closed,
    # on a connection that is then never released.
    all_results = (
        session.query(VWPH)
        .filter(VWPH.site_id < 15)
        .order_by(VWPH.site_id)
        .all()
    )
finally:
    # Release the connection even if the query fails
    session.close()

---- Data has been loaded at this point - no need to re-run the cells above ----

In [154]:

def _ph_state_record(earliest, latest, variable, previous_value):
    """Build the result row for one run of periods that share a variable state.

    Parameters
    ----------
    earliest, latest
        Chronologically first and last pressure-history periods of the run
        (the same object when the run covers a single period).
    variable : str
        Name of the attribute being tracked.
    previous_value
        Value of the variable in the period immediately before the run, or
        'Unknown' when there is no earlier period.
    """
    return {
        'start_ph_id': earliest.ph_id,
        'end_ph_id': latest.ph_id,
        'site_id': earliest.site_id,
        'variable_name': variable,
        'variable_value': getattr(earliest, variable),
        'start_date': earliest.ph_start_date,
        'end_date': latest.ph_end_date,
        # Kept so we know when-ish the change into this state occurred
        'effective_date': earliest.ph_effective_date,
        'previous_value': previous_value,
    }


def roll_back(groups_2, variable):
    """Collapse consecutive pressure-history periods with an unchanged variable.

    Each site's periods are walked from the most recent backwards. Neighbouring
    periods are merged into one run while the earlier period has the same value
    for ``variable``; a missing (None) value in the later period is also merged
    into the earlier one and takes over that earlier value.

    Parameters
    ----------
    groups_2 : dict
        Maps a site key to an iterable of period objects ordered most recent
        first. Each object must expose ph_id, site_id, ph_start_date,
        ph_end_date, ph_effective_date and the attribute named by ``variable``.
    variable : str
        Name of the attribute to track.

    Returns
    -------
    list of dict
        One dict per run, most recent run first within each site, with keys
        start_ph_id, end_ph_id, site_id, variable_name, variable_value,
        start_date, end_date, effective_date and previous_value. The oldest
        run of every site has previous_value 'Unknown'. Sites without any
        period contribute nothing.
    """
    results = []

    for ph_period in groups_2.values():
        period_iter = iter(ph_period)

        try:
            latest = next(period_iter)
        except StopIteration:
            # Site without any period: nothing to report (this used to escape
            # from the function as a bare StopIteration)
            continue

        # The run currently being extended spans earliest..latest
        earliest = latest

        for older in period_iter:
            current_value = getattr(earliest, variable)
            older_value = getattr(older, variable)

            if current_value is None or older_value == current_value:
                # Same state (or nothing recorded for the later period):
                # stretch the run further back in time
                earliest = older
            else:
                # State changed: stash the finished run and start a new one
                results.append(_ph_state_record(earliest, latest, variable, older_value))
                earliest = latest = older

        # Oldest run of the site: there is no earlier period to compare with
        results.append(_ph_state_record(earliest, latest, variable, 'Unknown'))

    return results



In [155]:
from itertools import groupby
groups = {}
uniquekeys = []

# Build a dict of site_id -> list of that site's periods. groupby only merges
# adjacent items, so this relies on all_results being ordered by site_id.
for site_id, site_rows in groupby(all_results, lambda row: row.site_id):
    # Most recent period first
    periods = list(site_rows)
    periods.sort(key=lambda row: row.ph_start_date, reverse=True)
    groups[site_id] = periods
    uniquekeys.append(site_id)

In [156]:
# Pressure history states we need to track:
ph = [
    'land_use',
    'land_use_intensity',
    'source_habitat_description',
    'managed_for_biodiversity',
    'habitat_patch_area_unit',
    'habitat_patch_area_value',
    'restoration_type',
    'ff1',
    'ff2',
    'ff3',
    'crop',
    'organic',
    'aes',
    'fragmentation_layout',
]

# Place to stash the results before writing to db
results = []

# One roll_back pass per tracked variable, appended in the order listed above
for variable in ph:
    results += roll_back(groups, variable)
    

In [159]:
import pandas as pd

# One row per result dict; the dict keys become the column names
df = pd.DataFrame(results)

In [160]:
# Write the collected results to a CSV file (path is relative to the working directory)
df.to_csv('test_all_ph.csv')