Import relevant packages

In [1]:
import glob
import pandas as pd
import tabula
from urllib.request import urlretrieve

Define functions for wrangling raw data into processable format

In [2]:
def add_year_column(tableidx):  
  table = pd.DataFrame(TABLES[landuse_tables[tableidx]])
  table['Year'] = table.iloc[:,0].str.strip('Estimate')
  estimates = table[table.index % 2 == 0]
  return estimates

def pdf_df(dataframe_list):  
  dataframe_df = dataframe_list[2]
  dataframe_df.iloc[1,0] = 'Year'
  dataframe_acres = dataframe_df.iloc[1:3,:].T
  dataframe_acres2 = dataframe_acres.iloc[1:]
  dataframe_acres2.columns = dataframe_acres.iloc[0,:]
  dataframe_acres2['Year'] = dataframe_acres2['Year'].astype('int')
  dataframe_acres2['Corn'] = [float(str(i).replace(",", "")) for i in dataframe_acres2['Corn']]
  dataframe_acres2['Corn (Thousands of acres)'] = dataframe_acres2['Corn'] /1000
  dataframe_acres2.reset_index(drop=True, inplace=True)
  return dataframe_acres2


def relevant_series_selection(series1, series2):
  seriesindex1 = series1
  seriesindex2 = series2
  relevant_table = estimates.iloc[:,[seriesindex1,seriesindex2]]
  series1_name = estimates.iloc[:,seriesindex1].name
  series2_name = estimates.iloc[:,seriesindex2].name
  relevant_table.reset_index(drop=True,inplace=True)
  relevant_table[series1_name] = relevant_table[series1_name].astype('float')
  relevant_table[series2_name] = relevant_table[series2_name].astype('int')
  relevant_table = relevant_table[[series2_name,series1_name]]

  return relevant_table

def observed_change_timeperiod(dataframe):
  differences = []
  years = []
  yearly_diff = []
  for row in dataframe.itertuples():
    if row[0] == min(range(len(dataframe))):
      prior_row = row
      continue
    else:
      differences.append(row[2] - prior_row[2])
      years.append(int(row[1]) - int(prior_row[1]))
      prior_row = row

  for diff,year in zip(differences,years):
    for yr in range(year-1):
      yearly_diff.append(diff/year)
  return yearly_diff

def create_missing_df(dataframe, series1, series2):
  miss_years = []
  for start in series1:
    for year in range(len(series1)):
      if (start + year) == start:
        continue
      elif (start + year) in list(series1) or (start + year) > max(series1):
        break
      else:
        miss_years.append(start + year)

  missing_df = pd.DataFrame()
  missing_df[series1.name] = miss_years
  missing_df[series2.name] = 0
  missing_df['Differences'] = observed_change_timeperiod(dataframe)
  return missing_df

def create_missing_practices(dataframe, series1, series2):
  miss_years = []
  for start in series1:
    for year in range(len(series1 < 2008)):
      if (start + year) == start:
        continue
      elif (start + year) in list(series1) or (start + year) > max(series1):
        break
      else:
        miss_years.append(start + year)

  missing_df = pd.DataFrame()
  missing_df[series1.name] = miss_years
  missing_df[series2.name] = 0
  missing_df['Differences'] = observed_change_timeperiod(dataframe)
  return missing_df

def parse_table(dataframe,miss_df): 
  dataframe['Differences'] = 0
  relevant_table_parsed = dataframe.append(miss_df)
  relevant_table_parsed.sort_values(by='Year',inplace=True)
  relevant_table_parsed.reset_index(drop=True,inplace=True)
  return relevant_table_parsed

def calc_total_land_change(dataframe):
  TotalLand = []
  for row in dataframe.itertuples():
    if row[3] == 0:
      TotalLand.append(row[2])
      prior_row = row[2]
    else:
      TotalLand.append((prior_row + row[3]))
      prior_row = prior_row + row[3]
  
  return TotalLand

def fert_df_creation(df, fert_type):
    IN_fert = df.iloc[[1,12],:]
    IN_fert = IN_fert.T
    IN_fert.iloc[0,0] = 'Year'
    IN_fert.iloc[0,1] = fert_type
    IN_fert.columns = IN_fert.iloc[0,:]
    IN_fert = IN_fert.iloc[1:]
    IN_fert.reset_index(drop=True,inplace=True)
    return IN_fert

def table_split(tilage_type):
    tillage = tilage_type
    data_filter = emissions['Tillage'] == tillage
    filtered_data = emissions[data_filter]
    filtered_df = filtered_data['Total']
    filtered_df.reset_index(drop=True, inplace=True)
    return filtered_df

Retrieve landuse data from USDA website

In [3]:
url = 'https://www.nrcs.usda.gov/Internet/NRCS_RCA/csv/nri_download_in.csv'

In [4]:
urlretrieve(url,'IN_landuse.csv')

('IN_landuse.csv', <http.client.HTTPMessage at 0x7fc128412040>)

Iterate through each line of the USDA file and assign it to the tableslist

In [5]:
fileList = []
table = []
with open('IN_landuse.csv', 'rb') as file:
  for line in file:
    if len(line) < 6:
      if table != []:
        fileList.append(table)
        table = []
      continue
    else:
      table.append(line)

In [6]:
tableslist = fileList[1:23]

Create a list of tables as dicts from the tables in tableslist and strip any formating charcters from the string of the table name, from table_list_dict and table_names lists create accessable dictionary TABLES where the keys are the names of the tables and the values are dictionaries containing all of the relevant landuse data. 

In [7]:
table_names = []
for table in tableslist:
  if tableslist.index(table) == 0:
    table_names.append(str(table[1]).strip("b'").strip('\\r\\n'))
  elif (tableslist.index(table))%2 == 0:
    table_names.append(str(table[0]).strip("b'").strip('\\r\\n'))

In [8]:
table_raw = [table for table in tableslist if tableslist.index(table)%2 != 0]

In [9]:
list_table_dict = []
for table in table_raw:
  table_dict = {}
  for row in table:
    listrow = str(row).split(',')
    striprow = []
    for element in listrow:
      striprow.append(element.strip("b'").strip("\\r\\n"))
    column_name = striprow[0]
    if striprow[-1] == '':
      values = striprow[1:-1]
    else:
      values = striprow[1:]
    table_dict[column_name] = values
  list_table_dict.append(table_dict)


In [10]:
TABLES = {}
for name, table in zip(table_names, list_table_dict):
  TABLES[name] = table

In [11]:
landuse_tables = [key for key in TABLES.keys()]

In [12]:
landuse_tables

['Total Surface Area by Land Cover/Use',
 'Non-Federal Rural Land by Land Cover/Use',
 'Prime Farmland by Land Cover/Use',
 'Indiana Cropland Use',
 'Indiana  Erosio',
 'Indiana  Non-Federal Grazing Land',
 'Indiana  Non-Federal Forest Land',
 'Indiana  Developed Land',
 'Sources of Developed Land',
 'Indiana  Wetlands',
 'Palustrine and Estuarine Wetlands on Water Areas and Non-Federal Land by Land Cover/Use']

Call the table at index position 7, which contains IN landuse by development level, in the add_year_column function to strip only the integer of the year from the index.

In [13]:
estimates = add_year_column(7)
estimates.columns

Index(['Developed Land Type', 'Large Urban and Built-Up Areas ',
       'Small Built-Up Areas ', 'Rural Transportation ',
       'Total Developed Land ', 'Year'],
      dtype='object')

Create relevant table from the appropriate series of landuse data, index positions 4 and 5.

In [14]:
relevant_table = relevant_series_selection(4,5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_table[series1_name] = relevant_table[series1_name].astype('float')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_table[series2_name] = relevant_table[series2_name].astype('int')


Create a new dataframe (Dev_Land_Final), by calling the create_missing_df and parse_table functions on the previously defined relevant table

In [15]:
missing_df = create_missing_df(relevant_table,relevant_table.iloc[:,0],relevant_table.iloc[:,1])

In [16]:
relevant_table_parsed = parse_table(relevant_table,missing_df)

In [17]:
relevant_table_parsed['Total Dev Land (Thousands of acres)'] = calc_total_land_change(relevant_table_parsed)
Final_table = relevant_table_parsed[['Year','Total Dev Land (Thousands of acres)']]
Dev_Land_Final = Final_table

Repeat the above steps on the table in the TABLES dictionary at index position 1

In [18]:
estimates = add_year_column(1)
estimates.columns

Index(['Land Cover/Use', 'Cropland ', 'CRP Land ', 'Pastureland ',
       'Rangeland ', 'Forest Land ', 'Other Rural Land ',
       'Total Non-Federal Rural Land ', 'Year'],
      dtype='object')

Create relevant_table dataframe from the series at positions 1 and 8. And call the create_missing_df and parse_table functions on it to produce Crop_Land_Final table.

In [19]:
relevant_table = relevant_series_selection(1,8)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_table[series1_name] = relevant_table[series1_name].astype('float')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_table[series2_name] = relevant_table[series2_name].astype('int')


In [20]:
missing_df = create_missing_df(relevant_table,relevant_table.iloc[:,0],relevant_table.iloc[:,1])

In [21]:
relevant_table_parsed = parse_table(relevant_table,missing_df)

In [22]:
relevant_table_parsed['Total Crop Land (Thousands of acres)'] = calc_total_land_change(relevant_table_parsed)
Final_table = relevant_table_parsed[['Year','Total Crop Land (Thousands of acres)']]
Crop_Land_Final = Final_table

Create the final_landuse table by merging the Dev_Land_Final dataframe to the Crop_Land_Final dataframe on year.

In [23]:
final_landuse = Crop_Land_Final.merge(Dev_Land_Final)

Creat the IN_fert_final dataframe from the historical fertilzer data for IN by selecting the data from the nitrogen, phosphate, potash fertilizer use for corn sheets in the fertilizeruse workbook merging on on year.

In [24]:
fert_path = '/Users/coleromanyk/Documents/GitHub/Capstone-2/Data/fertilizeruse.xls'

In [25]:
corn_tables = ['Table10','Table12','Table14']

In [26]:
raw_nitrogen = pd.read_excel(fert_path,sheet_name=corn_tables[0])

In [27]:
IN_N = fert_df_creation(raw_nitrogen, 'Nitrogen')

In [28]:
raw_phosphate = pd.read_excel(fert_path,sheet_name=corn_tables[1])

In [29]:
IN_PH = fert_df_creation(raw_phosphate, 'Phosphate')

In [30]:
raw_potash = pd.read_excel(fert_path,sheet_name=corn_tables[2])

In [31]:
IN_PO = fert_df_creation(raw_potash, 'Potash')

In [32]:
fert1 = IN_N.merge(IN_PH)
IN_fert_final = fert1.merge(IN_PO)

In [33]:
IN_fert_final.fillna(round(IN_fert_final.rolling(10,min_periods=1).mean(),),inplace=True)

Combine the fertilizer use data for IN with the landuse data merging on year.

In [34]:
landuse_fertilizer = final_landuse.merge(IN_fert_final)

Download and read the appropriate production practice data from the in.gov site.

In [35]:
pdf = 'https://www.in.gov/isda/files/No-Till-Trends-1990-2019-Statewide.pdf'
urlretrieve(pdf,'no_till.pdf')
no_till = tabula.read_pdf('no_till.pdf', lattice=True, multiple_tables=True)

In [36]:
pdf = 'https://www.in.gov/isda/files/Cover-Crop-Trends-2011-2019-Statewide.pdf'
urlretrieve(pdf,'cover_crop.pdf')
cover_crop = tabula.read_pdf('cover_crop.pdf', lattice=True, multiple_tables=True)

In [37]:
pdf = 'https://www.in.gov/isda/files/Conservation-Tillage-Trends-1990-2019-Statewide-2.pdf'
urlretrieve(pdf,'conservation_till.pdf')
conservation_till = tabula.read_pdf('conservation_till.pdf', lattice=True, multiple_tables=True)

Create dataframes for cover cropping and tillage practices using the pdf_df function defined above. Merge df's on year.

In [38]:
cover_crop_df = pdf_df(cover_crop)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe_acres2['Year'] = dataframe_acres2['Year'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe_acres2['Corn'] = [float(str(i).replace(",", "")) for i in dataframe_acres2['Corn']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe_acres2['Corn (Thousands of acres)']

2012 is missing from the data. Ceate a record for 2012 by using the observed_change_timeperiod function defined above.

In [39]:
cover_crop_2012 = cover_crop_df.loc[cover_crop_df['Year'] == 2011, 'Corn'].array + observed_change_timeperiod(cover_crop_df)
cover_crop_df = cover_crop_df.append({'Year':2012, 'Corn':cover_crop_2012[0], 'Corn (Thousands of acres)':(cover_crop_2012[0]/1000)}, ignore_index=True)
cover_crop_df.sort_values('Year', inplace=True, ignore_index=True)
cover_crop_df.columns = ['Year', 'Corn', 'IN Corn Cover Crop (Thousands of acres)']

In [40]:
cover_crop_final = cover_crop_df.loc[:, ['Year', 'IN Corn Cover Crop (Thousands of acres)']]

In [41]:
no_till_df = pdf_df(no_till)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe_acres2['Year'] = dataframe_acres2['Year'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe_acres2['Corn'] = [float(str(i).replace(",", "")) for i in dataframe_acres2['Corn']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe_acres2['Corn (Thousands of acres)']

In [42]:
missing_no_df = create_missing_practices(no_till_df, no_till_df['Year'], no_till_df['Corn'])

In [43]:
parsed_till = parse_table(no_till_df,missing_no_df)
parsed_till.pop('Corn (Thousands of acres)')
parsed_till['Total Corn Acres'] = calc_total_land_change(parsed_till)
parsed_till['Total Corn Acres'] = round(parsed_till['Total Corn Acres'],)

In [44]:
year_till = parsed_till.loc[:, ['Year','Total Corn Acres']]
year_till['IN No Till Corn (Thousands of acres)'] = year_till['Total Corn Acres']/1000
final_no_till = year_till.loc[:, ['Year','IN No Till Corn (Thousands of acres)']]

In [45]:
conservation_till_df = pdf_df(conservation_till)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe_acres2['Year'] = dataframe_acres2['Year'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe_acres2['Corn'] = [float(str(i).replace(",", "")) for i in dataframe_acres2['Corn']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe_acres2['Corn (Thousands of acres)']

In [46]:
missing_con_df = create_missing_practices(conservation_till_df, conservation_till_df['Year'], conservation_till_df['Corn'])

In [47]:
parsed_con_till = parse_table(conservation_till_df, missing_con_df)
parsed_con_till.pop('Corn (Thousands of acres)')
parsed_con_till['Total Corn Acres'] = calc_total_land_change(parsed_con_till)
parsed_con_till['Total Corn Acres'] = round(parsed_con_till['Total Corn Acres'],)

In [48]:
year_con_till = parsed_con_till.loc[:, ['Year','Total Corn Acres']]
year_con_till['IN Con Till Corn (Thousands of acres)'] = year_con_till['Total Corn Acres']/1000
final_con_till = year_con_till.loc[:, ['Year','IN Con Till Corn (Thousands of acres)']]

In [49]:
final_till = final_no_till.merge(final_con_till)
crop_practices_df = final_till.merge(cover_crop_final,how='left')
crop_practices_df.fillna(0, inplace=True)
final_crop_practices = crop_practices_df[(crop_practices_df['Year'] < 2018)]

Read and create the dataframes for US corn acerage, US total acerage, IN corn acerage, and IN corn yeild.

In [50]:
filename = 'US Corn'
data = f'/Users/coleromanyk/Documents/GitHub/Capstone-2/Data/{filename}.csv'
US_corn_raw = pd.read_csv(data)
US_corn_raw = US_corn_raw.reset_index(drop=True)
US_corn_raw['Year'] = US_corn_raw.iloc[:,1].astype('int')
US_corn_raw['Total US Corn Acres (Thousands of acres)'] = US_corn_raw.iloc[:,19].str.replace(",", "").astype('float')//1000

In [51]:
US_corn = US_corn_raw.loc[:,['Year','Total US Corn Acres (Thousands of acres)']]

In [52]:
filename = 'IN yield'
data = f'/Users/coleromanyk/Documents/GitHub/Capstone-2/Data/{filename}.csv'
IN_yield_raw = pd.read_csv(data)
IN_yield_raw = IN_yield_raw.reset_index(drop=True)
IN_yield_raw['Year'] = IN_yield_raw.iloc[:,1].astype('int')
IN_yield_raw['IN Corn Yield per Acre'] = IN_yield_raw.iloc[:,19]

In [53]:
IN_yield = IN_yield_raw.loc[:,['Year', 'IN Corn Yield per Acre']]

In [54]:
filename = 'IN Corn'
data = f'/Users/coleromanyk/Documents/GitHub/Capstone-2/Data/{filename}.csv'
IN_corn_raw = pd.read_csv(data)
IN_corn_raw = IN_corn_raw.reset_index(drop=True)
IN_corn_raw['Year'] = IN_corn_raw.iloc[:,1].astype('int')
IN_corn_raw['Total IN Corn Acres (Thousands of acres)'] = IN_corn_raw.iloc[:,19].str.replace(",", "").astype('float')//1000

In [55]:
IN_corn = IN_corn_raw.loc[:,['Year','Total IN Corn Acres (Thousands of acres)']]

In [56]:
filename = 'US Total'
data = f'/Users/coleromanyk/Documents/GitHub/Capstone-2/Data/{filename}.csv'
US_total_raw = pd.read_csv(data,)
US_total_raw = US_total_raw.T
US_total_raw.iloc[0,0] = 'Year'
US_total_raw.iloc[0,1] = 'US Total (Thousands of acres)'
US_total_raw.reset_index(drop=True,inplace=True)
US_total_raw.columns = US_total_raw.iloc[0,:]
US_year_total = US_total_raw.loc[1:,['Year','US Total (Thousands of acres)']].reset_index(drop=True)
US_year_total['Year'] = US_year_total['Year'].astype('int')
US_year_total['US Total (Thousands of acres)'] = US_year_total['US Total (Thousands of acres)'].str.replace(",", "").astype('int')

In [57]:
missing_US_total = create_missing_df(US_year_total, US_year_total['Year'], US_year_total['US Total (Thousands of acres)'])
parse_US_total = parse_table(US_year_total,missing_US_total)
parse_US_total['US Total (Thousands of acres)'] = calc_total_land_change(parse_US_total)
parse_US_total['US Total (Thousands of acres)'] = round(parse_US_total['US Total (Thousands of acres)'],)
US_total = parse_US_total.loc[:,['Year','US Total (Thousands of acres)']]
final_US_total = US_total[(US_total['Year'] >= 1990)]

In [58]:
IN_climate = pd.read_csv('/Users/coleromanyk/Documents/GitHub/Capstone-2/Data/IN Climate.csv', engine='python')
climate_data = IN_climate.iloc[:, 2:5]
climate_data.columns = ['Year', 'Precipitation', 'Average Temperature']

Merge the the acerage dfs with the practices dfs to create acres_and_practices df on year.

In [59]:
acres_df = final_US_total.merge(US_corn)
final_acres = acres_df.merge(IN_corn)
yield_acres = final_acres.merge(IN_yield)
climate_acres = yield_acres.merge(climate_data)
acres_and_practices = climate_acres.merge(final_crop_practices)

Merge all of the data into a single dataframe landuse_fertilizer_practices on year.

In [60]:
landuse_fertilizer_practices = landuse_fertilizer.merge(acres_and_practices)

Calculate full-till acres by subtracting conservation till acres and no-till acres from total IN corn acres

In [61]:
landuse_fertilizer_practices['Conventional Till'] = landuse_fertilizer_practices['Total IN Corn Acres (Thousands of acres)'] - landuse_fertilizer_practices['IN Con Till Corn (Thousands of acres)'] - landuse_fertilizer_practices['IN No Till Corn (Thousands of acres)']

Load emission data pdf and read with tabula, then calculate MMT of emissions per acre.

In [62]:
pdf = '/Users/coleromanyk/Documents/GitHub/Capstone-2/Data/US Cropland Greenhouse Gas Calculator.pdf'
emissions = tabula.read_pdf(pdf, multiple=True, pages='all')

In [63]:
conventional_till = table_split('conventional')

In [64]:
conservation_till = table_split('reduced')

In [65]:
no_till_emissions = table_split('no-till')

In [66]:
tillage_practice_emissions = pd.DataFrame()
tillage_practice_emissions['conventional'] = conventional_till.astype('float')
tillage_practice_emissions['conservation'] = conservation_till.astype('float')
tillage_practice_emissions['no-till'] = no_till_emissions.astype('float')

In [67]:
landuse_fertilizer_practices['Per Acre Emissions'] = (tillage_practice_emissions['conventional']*landuse_fertilizer_practices['Conventional Till'] \
                                     + tillage_practice_emissions['conservation']*landuse_fertilizer_practices['IN Con Till Corn (Thousands of acres)'] \
                                     + tillage_practice_emissions['no-till']*landuse_fertilizer_practices['IN No Till Corn (Thousands of acres)'])*.001

In [68]:
landuse_fertilizer_practices['efficiency'] = landuse_fertilizer_practices['IN Corn Yield per Acre']/landuse_fertilizer_practices['Per Acre Emissions']

In [69]:
landuse_fertilizer_practices.head()

Unnamed: 0,Year,Total Crop Land (Thousands of acres),Total Dev Land (Thousands of acres),Nitrogen,Phosphate,Potash,US Total (Thousands of acres),Total US Corn Acres (Thousands of acres),Total IN Corn Acres (Thousands of acres),IN Corn Yield per Acre,Precipitation,Average Temperature,IN No Till Corn (Thousands of acres),IN Con Till Corn (Thousands of acres),IN Corn Cover Crop (Thousands of acres),Conventional Till,Per Acre Emissions,efficiency
0,1990,13686.96,1967.36,139,75,111,461627.0,74166.0,5450.0,129,50.44,54.4,479.255,824.2,0.0,4146.545,3.306718,39.011486
1,1991,13618.98,1988.58,135,78,112,460835.0,75957.0,5550.0,92,37.56,54.8,723.426,1061.613,0.0,3764.961,3.508214,26.224166
2,1992,13551.0,2009.8,143,66,107,460044.0,79311.0,5970.0,147,44.22,52.6,967.598,1299.025,0.0,3703.377,3.376282,43.539021
3,1993,13527.82,2047.96,134,68,114,459046.0,73239.0,5400.0,132,50.78,52.0,1211.769,1536.438,0.0,2651.793,2.86059,46.144325
4,1994,13504.64,2086.12,147,74,112,458047.0,78921.0,5960.0,144,31.63,53.3,1131.817,1534.523,0.0,3293.66,3.331703,43.221141


Load and clean growing degrees days data.

In [70]:
gdd_data = pd.read_csv('/Users/coleromanyk/Documents/GitHub/Capstone-2/Data/corn_gdd.csv')

In [71]:
gdd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15651 entries, 0 to 15650
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Year             15651 non-null  object 
 1   Month            15650 non-null  float64
 2   Day              15650 non-null  float64
 3   Accumulated GDD  15650 non-null  float64
dtypes: float64(3), object(1)
memory usage: 489.2+ KB


In [72]:
gdd_data.Year.unique()

array(['Average', 'Median', '1981', '1982', '1983', '1984', '1985',
       '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
       '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
       '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021', "''"], dtype=object)

In [73]:
non_year = ['Average', 'Median', "''"]

In [74]:
gdd_data = gdd_data[~gdd_data.Year.isin(non_year)]

In [75]:
gdd_data.Year = pd.to_numeric(gdd_data.Year)

In [76]:
year_range = list(range(1990, 2018))

In [77]:
gdd_filter = gdd_data[(gdd_data['Year'].isin(year_range)) & \
                      (gdd_data['Month'] == 10) & (gdd_data['Day'] == 10)]

In [78]:
gdd = gdd_filter.drop(['Month', 'Day'], axis=1)

Merge gdd data with landuse_fertilizer_practices data on year.

In [79]:
gdd_added = landuse_fertilizer_practices.merge(gdd, how='left')

Convert acrage data to ratios

In [80]:
gdd_added['full_till_ratio'] = gdd_added['Conventional Till']\
    / gdd_added['Total IN Corn Acres (Thousands of acres)']
    
gdd_added['conservation_till_ratio'] = 1 - gdd_added['full_till_ratio']
    
gdd_added['cover_crop_ratio'] = gdd_added['IN Corn Cover Crop (Thousands of acres)']\
    / gdd_added['Total IN Corn Acres (Thousands of acres)']

Remove spaces and uppercases from column names, filter out unnecessary features.

In [81]:
ratio_data = gdd_added[['Year', 'Nitrogen', 'Phosphate', 'Potash', 'Precipitation', \
                     'Average Temperature', 'Accumulated GDD',\
                     'full_till_ratio', 'cover_crop_ratio', 'efficiency']]
ratio_data['total_acres'] = gdd_added['Total IN Corn Acres (Thousands of acres)']
ratio_data['emissions'] = gdd_added['Per Acre Emissions']
ratio_data['yield'] = gdd_added['IN Corn Yield per Acre']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratio_data['total_acres'] = gdd_added['Total IN Corn Acres (Thousands of acres)']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratio_data['emissions'] = gdd_added['Per Acre Emissions']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratio_data['yield'] = gdd_added['IN Corn Yield per Acre']


In [82]:
ratio_data.columns = ratio_data.columns.str.replace(' ', '_')
ratio_data.columns = ratio_data.columns.str.lower()

In [83]:
ratio_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28 entries, 0 to 27
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   year                 28 non-null     object 
 1   nitrogen             28 non-null     object 
 2   phosphate            28 non-null     object 
 3   potash               28 non-null     object 
 4   precipitation        28 non-null     float64
 5   average_temperature  28 non-null     float64
 6   accumulated_gdd      28 non-null     float64
 7   full_till_ratio      28 non-null     float64
 8   cover_crop_ratio     28 non-null     float64
 9   efficiency           28 non-null     float64
 10  total_acres          28 non-null     float64
 11  emissions            28 non-null     float64
 12  yield                28 non-null     int64  
dtypes: float64(8), int64(1), object(4)
memory usage: 3.1+ KB


Save the ratio_data df as wrangled_data.csv

In [84]:
ratio_data.to_csv('/Users/coleromanyk/Documents/GitHub/Capstone-2/Data/wrangled_data.csv', index=False)