In [1]:
import os
import pandas as pd

pd.set_option('display.max_columns', 100)

In [2]:
# Resolve the data folders relative to the directory the notebook is launched from.
# os.path.join picks the separator of the host OS instead of a hard-coded "\",
# so the same cell works on Windows, macOS and Linux.
current_dir = os.getcwd()
NOAA_dir = os.path.join(current_dir, 'NOAA Weather Data')
EPA_dir = os.path.join(current_dir, 'EPA Ozone Data')


In [24]:
def create_ozone_id(statelist, countylist, sitelist):
    """
    Build per-row ozone identifiers from three parallel sequences.

    The i-th entries of the state, county and site sequences are combined into a
    (state, county, site) tuple; pairing stops at the end of the shortest input.

    returns: list of 3-tuples, one per row, ready to be stored in a column
    """
    return list(zip(statelist, countylist, sitelist))

def append_ozone_id(ozone_df):
    """
    Return a copy of ``ozone_df`` with an additional ``ozoneID`` column.

    Each ``ozoneID`` is the (state_code, county_code, site_id) tuple of its row, built by
    ``create_ozone_id``. The dataframe passed in is not modified; only the copy gets the
    new column.

    returns: new dataframe carrying the ozoneID column
    """
    id_parts = [ozone_df[col] for col in ("state_code", "county_code", "site_id")]
    with_id = ozone_df.copy()
    with_id["ozoneID"] = create_ozone_id(*id_parts)
    return with_id

In [3]:
# Load the authors' final dataset. The path is built with os.path.join because the old
# "\Author Data\AER..." literal relied on "\A" being an unrecognised escape sequence
# (which triggers a warning on current Python versions) and only worked with Windows
# path separators.
author_final = pd.read_stata(os.path.join(current_dir, "Author Data", "AER20090377_FinalData.dta"))

In [25]:
# Rebind author_final to a copy that also carries the "ozoneID" column. Later cells
# group by "ozoneID", so this cell must run after the load cell and before them.
author_final = append_ozone_id(author_final)

In [39]:
# Write one CSV per year listing the distinct "urban" values recorded for each ozoneID.
# Years are handled separately because designations may change from year to year.
urban_out_dir = os.path.join(current_dir, "Author Data", "county_urban_designation")
os.makedirs(urban_out_dir, exist_ok=True)  # to_csv does not create missing folders

for year in range(1989, 2004):  # 1989 through 2003 inclusive
    # keep only the rows belonging to this year
    temp = author_final[author_final["year"] == year]
    # distinct urban values per reporting location (one array per ozoneID)
    state_county_urban = temp.groupby("ozoneID")["urban"].unique()
    output = pd.DataFrame(state_county_urban)
    output.to_csv(os.path.join(urban_out_dir, "county_urban_designation_" + str(year) + ".csv"))

In [33]:
# Spot check for a single year (1989): distinct "urban" values per ozoneID,
# using the same filter + groupby as the per-year export loop.
temp = author_final[author_final["year"] == 1989]
temp2 = temp.groupby("ozoneID")["urban"].unique()


In [37]:
# Wrap the grouped result in a DataFrame, as the export loop does before writing the CSV.
test = pd.DataFrame(temp2)

In [38]:
# Inspect the second group's entry; each cell holds an array of that group's distinct values.
test["urban"].iloc[1]

array([2.], dtype=float32)

In [6]:
# NOTE(review): despite its name, this frame holds the cumulative NOx installation series
# (columns Date, CumNOx), not a list of counties; the name is kept because the next cell
# displays it under this name.
# os.path.join replaces the "\Author Data\AER..." literal, whose "\A" sequences are
# unrecognised escapes and which only worked with Windows path separators.
countylist = pd.read_stata(os.path.join(current_dir, "Author Data", "AER20090377_CumulativeNOxInstallations.dta"))

In [7]:
# Display the loaded frame; pandas elides the middle rows of a long frame in its repr.
countylist

Unnamed: 0,Date,CumNOx
0,1989-01-01,0.0
1,1989-01-02,0.0
2,1989-01-03,0.0
3,1989-01-04,0.0
4,1989-01-05,0.0
...,...,...
5473,2003-12-27,242.0
5474,2003-12-28,242.0
5475,2003-12-29,242.0
5476,2003-12-30,242.0


In [8]:
# Preview the first rows with every column visible (display.max_columns was raised to 100).
author_final.head()

Unnamed: 0,state_code,county_code,site_id,valid,epa_8hr,ozone_max,day,month,year,Date,fips,state,county,partial,partialinfo,regtype,fedvssip,psi,sulfur,sulfurppm,rfgtype,noxeffect,RVPStart,RVPEnd,RFGStart,RFGEnd,RegFlag,RVPI,treat_rvpII,treat_rfg,treat_rvpI,treat_CARB,TreatRFG,panelid,RFGStart2,RFGEnd2,TreatRVPII,RVPStart2,RVPEnd2,TreatCARB,TreatRFGCA,RVPCty,RFGCty,CARBCty,TreatRVPca,_merge2,_merge3,SiteObs,TempMax,TempMin,EstTempFlag,NOtherStation,SiteObsprcp,Rain,Snow,EstTempFlagprcp,NumOffMax,NumOffMin,NumOff1Max,NumOff1Min,NOtherStationprcp,_merge,urban,_mergeurb
0,1,1,3,7.0,0.05425,0.068,1.0,9.0,1989.0,1989-09-01,1001.0,,,,,,,,,,,,NaT,NaT,NaT,NaT,0.0,10.5,0.0,0.0,1.0,0.0,0.0,1.0,,,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,345.0,96.0,74.0,0.0,,345.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,,3.0,3.0,3
1,1,1,3,3.0,0.0205,0.036,2.0,9.0,1989.0,1989-09-02,1001.0,,,,,,,,,,,,NaT,NaT,NaT,NaT,0.0,10.5,0.0,0.0,1.0,0.0,0.0,1.0,,,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,345.0,94.0,74.0,0.0,,345.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,3.0,3.0,3
2,1,1,3,12.0,0.070125,0.079,3.0,9.0,1989.0,1989-09-03,1001.0,,,,,,,,,,,,NaT,NaT,NaT,NaT,0.0,10.5,0.0,0.0,1.0,0.0,0.0,1.0,,,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,345.0,97.0,73.0,0.0,,345.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,3.0,3.0,3
3,1,1,3,12.0,0.053875,0.059,4.0,9.0,1989.0,1989-09-04,1001.0,,,,,,,,,,,,NaT,NaT,NaT,NaT,0.0,10.5,0.0,0.0,1.0,0.0,0.0,1.0,,,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,345.0,97.0,74.0,0.0,,345.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,3.0,3.0,3
4,1,1,3,12.0,0.059125,0.065,5.0,9.0,1989.0,1989-09-05,1001.0,,,,,,,,,,,,NaT,NaT,NaT,NaT,0.0,10.5,0.0,0.0,1.0,0.0,0.0,1.0,,,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,345.0,88.0,71.0,0.0,,345.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,3.0,3.0,3


In [10]:
# Count rows per RVPStart date. value_counts() skips missing values by default,
# so rows where RVPStart is NaT are not included in these totals.
author_final["RVPStart"].value_counts()

1994-05-01    25895
1993-06-01    25815
1994-06-01    25617
1992-05-01    24596
1993-05-01    24344
1992-06-01    24203
1995-06-01    21227
2004-06-01    17536
1997-06-01    17486
2005-06-01    17458
2002-06-01    17242
2001-06-01    17117
2000-06-01    17018
2003-06-01    16941
1998-06-01    16518
2006-06-01    16470
1995-05-01    16142
1999-06-01    16042
1996-06-01    15182
2004-05-01     4562
2005-05-01     4517
2006-05-01     4304
2003-05-01     4131
2002-05-01     3570
2001-05-01     3261
2000-05-01     3208
1999-05-01      957
1996-07-01      684
2003-06-05      306
Name: RVPStart, dtype: int64

In [11]:
# Count rows per RVPEnd date. value_counts() skips missing values by default,
# so rows where RVPEnd is NaT are not included in these totals.
author_final["RVPEnd"].value_counts()

1994-10-31    31546
1992-10-31    29985
1993-10-31    29609
1995-10-31    21793
1993-09-15    19748
1994-09-15    19064
2004-09-15    18730
2005-09-15    18604
1992-09-15    18308
2003-09-15    18016
2002-09-15    17893
2000-09-15    17671
2006-09-15    17641
2001-09-15    17618
1999-09-15    16693
1998-09-15    16518
1996-09-15    15866
1997-09-15    15076
1995-09-15    14924
2006-10-01     3133
2005-10-01     3065
2004-10-01     3062
2003-10-01     3056
2002-10-01     2613
2001-10-01     2464
1997-08-03     2410
2000-10-01     2249
1994-09-30      902
1993-09-30      802
1995-09-30      652
1992-09-30      506
2000-12-31      306
2004-12-30      306
2005-12-27      306
2003-12-31      306
1999-12-31      306
2002-12-31      306
2001-12-30      296
Name: RVPEnd, dtype: int64