# SWB rainfall processing and consolidataion
## This script takes data from multiple sources including 
- Matt/Chris-ASPA stations
- Tula NOAA weather station|
- The airport weather station
- the ASCC weather station


## Note that all units in this script are in inches of rain

In [1]:
# make the screen bigger!
#pd.options.display.max_rows = 100

from IPython.display import display, HTML

display(HTML(data=""" <style>    div#notebook-container    { width: 100%; }    div#menubar-container     { width: 85%; }
div#maintoolbar-container { width: 99%; }</style>"""))

import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy
from scipy import stats
from dateutil.relativedelta import relativedelta
from datetime import date, datetime, timedelta
import datetime
import random
import shutil   # use to move PDF out of the directory
import re
from itertools import chain

%matplotlib notebook
pd.options.display.max_rows = 500

# paths to files

path_ASPA_data = os.path.join('..', '..', '..', 'ASPA-UH_Wx_REPO', 'workspace', 'QA_All_merged.csv')               # note that this file was manually modified to tak put 3-2015 to 6-2015 stupid data
path_Tula_data = os.path.join('.', 'Rain_data\\Tula_Wx_data\\Tula_raw')
path_Airport_data = os.path.join('.', 'Rain_data\\Airport_Wx_data\\Airport_rainfall_1957-2017.csv')
path_ASCC_data = os.path.join('.', 'Rain_data\\ASCC_Wx_data\\ASCCWeatherData_2015.csv')
path_Ito_data = os.path.join('.', 'Rain_data\\Ito_rain_data\\Corrected_ito_rain_data.csv')
path_NCDC_data = os.path.join('.', 'Rain_data\\NCDC_rain_data\\NCDC_precip_D_consolidated.xlsx')
path_USGS_data = os.path.join('.', 'Rain_data\\USGS_rain_data\\USGS_precip_D_consolidated.xlsx')
path_WRCC_data = os.path.join('.', 'Rain_data\\WRCC_data')

# Process Matt/Chris-ASPA stations into: 
### ASPA_rain_daily data frame

In [9]:
# importing ASPA data
%matplotlib notebook
All_merged = pd.read_csv(os.path.join(path_ASPA_data), low_memory=False)
All_merged['DateTime'] = pd.to_datetime(All_merged['DateTime'], errors='coerce')
Rain_merged = All_merged[['DateTime', 'Aasu_RNF_in','Vaipito_RNF_in', 'Fagaitua_RNF_in', 'Afono_RNF_in']].copy()   #  'Alava_RNF_in', 'Poloa_RNF_in',  have no values for certain months so omit them here
lop_cols = ['Aasu_RNF_in','Vaipito_RNF_in', 'Fagaitua_RNF_in', 'Afono_RNF_in']       

All_merged.columns

Index(['Unnamed: 0', 'DateTime', 'Aasu_SRD_wpm2', 'Aasu_HMD_pct', 'Aasu_TMP_F',
       'Aasu_RNF_in', 'Aasu_WND_deg', 'Aasu_WNS_mph', 'Alava_SRD_wpm2',
       'Alava_HMD_pct', 'Alava_TMP_F', 'Alava_RNF_in', 'Alava_WND_deg',
       'Alava_WNS_mph', 'Poloa_SRD_wpm2', 'Poloa_HMD_pct', 'Poloa_TMP_F',
       'Poloa_RNF_in', 'Poloa_WND_deg', 'Poloa_WNS_mph', 'Vaipito_SRD_wpm2',
       'Vaipito_HMD_pct', 'Vaipito_TMP_F', 'Vaipito_RNF_in', 'Vaipito_WND_deg',
       'Vaipito_WNS_mph', 'Fagaitua_SRD_wpm2', 'Fagaitua_HMD_pct',
       'Fagaitua_TMP_F', 'Fagaitua_RNF_in', 'Fagaitua_WND_deg',
       'Fagaitua_WNS_mph', 'Afono_SRD_wpm2', 'Afono_HMD_pct', 'Afono_TMP_F',
       'Afono_RNF_in', 'Afono_WND_deg', 'Afono_WNS_mph', 'ASPAsolar_SRD_wpm2',
       'ASPAsolar_HMD_pct', 'ASPAsolar_TMP_F', 'ASPAsolar_RNF_in',
       'ASPAsolar_WND_deg', 'ASPAsolar_WNS_mph'],
      dtype='object')

In [42]:
# importing ASPA data
%matplotlib notebook
All_merged = pd.read_csv(os.path.join(path_ASPA_data), low_memory=False)
All_merged['DateTime'] = pd.to_datetime(All_merged['DateTime'], errors='coerce')
Rain_merged = All_merged[['DateTime', 'Aasu_RNF_in','Vaipito_RNF_in',  'Afono_RNF_in']].copy()   #  'Alava_RNF_in', 'Poloa_RNF_in', 'Fagaitua_RNF_in',  have no values for certain months so omit them here
lop_cols = ['Aasu_RNF_in','Vaipito_RNF_in', 'Afono_RNF_in']                                        #  'Alava_RNF', 'Poloa_RNF',  'Fagaitua_RNF_in', 

ASPA_rain_daily = pd.DataFrame(columns=['DateTime', 'Aasu_RNF_in','Vaipito_RNF_in',  'Afono_RNF_in'])   # 'Alava_RNF', 'Poloa_RNF', 'Fagaitua_RNF_in',
for i in lop_cols:
    Rain_merged[i] = Rain_merged[i].apply(lambda x: float(x))  
    rain = Rain_merged[['DateTime', i]]
    day_rain = rain.set_index('DateTime').resample('D')[i].sum(min_count=1)     # sum the rainfall by days   (need the min count value so that the resample operation does not sum NaN values into 0s )
    day_rain_frame = pd.DataFrame(day_rain)                                   # worst coding ever, take out the index (for above function tha tonly works on the index) then put index back in...terrible 
    day_rain_indexed = day_rain_frame.reset_index(drop=False)                 
    ASPA_rain_daily['DateTime'] = day_rain_indexed['DateTime']
    ASPA_rain_daily[i] = day_rain_indexed[i]
ASPA_rain_daily.rename(columns={'DateTime': 'Date'}, inplace=True) 

# plotting just ASPA stations to make sure it all worked
#stations = ['Aasu_RNF', 'Vaipito_RNF', 'Afono_RNF']  # 'Fagaitua_RNF'      'Alava_RNF', 'Poloa_RNF',
#fig = plt.figure(figsize=(15, 15))
#p = 0
#q = len(stations)   # number of stations to regress

#for i in stations:
#    for m in stations:
#        x =  ASPA_rain_daily[i]
#        y =  ASPA_rain_daily[m]
#        
#        mask = ~np.isnan(x) & ~np.isnan(y)
#        try:
#            slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x[mask], y[mask])   # calculate regression stats for the ecoli and rainfall data
#            r2 = r_value**2
#            rX = range(int(min(x[mask])),int(max(x[mask]))+2)
#           rY = slope*rX + intercept
#       except: print(i +'- or -'+ m+ ' messed it up for everyone')
#            
#        # plotting stuff
#        p = p+1
#        ax1 = fig.add_subplot(q,q,p)
#        ax1.set_xlabel(i), ax1.set_ylabel(m)
#        ax1.scatter(x, y, label='', alpha = .6, marker='.')
#        ax1.plot(rX,rY, color='g', linestyle='--', alpha = .6 , label = "r$^2$ = " + "$%.2f$"%r2 ) 
#        plt.tight_layout()
#        plt.legend(loc='upper right') 
#        plt.show()

# Process individual years worth of Tula data into: 
### Tula_rain_daily data frame
Note that more years of data could be integrated into this by dropping other year files into the tula data path folder

Note that all data from any time before 1987 and the 1992 year did not have precip data so these files should be removed from the file processing folder 

In [43]:
files = os.listdir(path_Tula_data)
cols = ['SITE CODE', 'YEAR', 'MONTH', 'DAY', 'HOUR', 'WND', 'WNS_mpsec', 'WND_STEADY', 'BAR_p', 'TMP_2m', 'TMP_10m', 'TMP_top', 'HMD', 'PRECIP_mm']
na_values = ['-99', '-9', '-999', '-99.9', '-999.9']   #values to turn into NaN 
Tula_data =  pd.DataFrame(columns=cols)
Tula_rain_daily = pd.DataFrame(columns=['Date', 'RNF_Tula'])

for i in files: 
    q = pd.read_csv(os.path.join(path_Tula_data, i), delim_whitespace=True, names=cols, na_values=na_values)
    Tula_data = pd.concat([Tula_data, q], axis=0)
    
Tula_data = Tula_data.reset_index(drop=True)                                # reset the row index to start from 0
Tula_data['RNF'] =  Tula_data['PRECIP_mm'].apply(lambda x: x*0.0393701)     # convert rain in mm to rain in inches
Tula_data['Date'] = Tula_data[['YEAR', 'MONTH', 'DAY', 'HOUR']].apply(lambda s : datetime.datetime(*s),axis = 1)

# This is the group by day function 
Tula_day_rain_series = Tula_data.groupby(['SITE CODE', pd.Grouper(key='Date', freq='D')])['RNF'].sum() 

Tula_rain_daily = Tula_day_rain_series.reset_index()
del Tula_rain_daily['SITE CODE']
Tula_rain_daily.rename(columns={'RNF': 'Tula_RNF'}, inplace=True) 

# Process Airport data into: 
### Airport_rain_daily data frame
Note that ths file was already downloaded in daily format

In [44]:
Airport_rain = pd.read_csv(os.path.join(path_Airport_data))
Airport_rain['Date'] = pd.to_datetime(Airport_rain['Date'], errors='coerce')
Airport_rain.rename(columns={'Precip': 'Airport_RNF'}, inplace=True) 
Airport_rain['Airport_RNF'] = pd.to_numeric(Airport_rain['Airport_RNF'], errors='coerce')

Airport_rain_daily = Airport_rain

# Process ASCC weather station data into: 
### ASCC_rain_daily data frame


In [45]:
ASCC_All = pd.read_csv(os.path.join(path_ASCC_data), parse_dates=[['Date', 'Time']],  keep_date_col=True)
ASCC_rain = ASCC_All[['Date_Time','Rain']].copy()                          # pull out just the rain column
ASCC_rain['Rain'] = pd.to_numeric(ASCC_rain['Rain'], errors='coerce')

ASCC_rain['ASCC_RNF'] =  ASCC_rain['Rain'].apply(lambda x: x*0.0393701)     # convert rain in mm to rain in inches

ASCC_day_rain = ASCC_rain.set_index('Date_Time').resample('D').sum()     # sum the rainfall by days
ASCC_day_rain_frame = pd.DataFrame(ASCC_day_rain)                        # make the series into a DF
ASCC_rain_daily = ASCC_day_rain_frame.reset_index(drop=False)            # pull the date index to a date column
ASCC_rain_daily.rename(columns={'Date_Time': 'Date'}, inplace=True)      # rename column
del ASCC_rain_daily['Rain']                                              # del mm rainfall column

  interactivity=interactivity, compiler=compiler, result=result)


# Process Random historical NCDC data into: 
### NCDC_rain_daily data frame

In [46]:
ncdcdat = pd.read_excel(path_NCDC_data, None)

NCDC_rain_daily = pd.DataFrame(columns=['Date'])

for i in ncdcdat.keys():
    ncdcdat[i]['PRCP'] =  ncdcdat[i]['PRCP'].apply(lambda x: x*0.00393701)     # convert rain in mm to rain in inches
    ncdcdat[i] = ncdcdat[i].rename(columns={"date": "Date", "PRCP" :i+"_NCDC_RNF" })
    ncdcdat[i] = ncdcdat[i].replace(-9999, np.nan) 
    del ncdcdat[i]['Measurement Flag']
    ncdcdat[i]['Date'] = pd.to_datetime(ncdcdat[i]['Date'], errors='coerce')
    ncdcdat[i] = ncdcdat[i][ncdcdat[i][i+"_NCDC_RNF"] > -1]
    
    NCDC_rain_daily = NCDC_rain_daily.merge(ncdcdat[i], how='outer', on='Date')
NCDC_rain_daily = NCDC_rain_daily.sort_values(by='Date')
NCDC_rain_daily = NCDC_rain_daily.reset_index(drop=True)

# Process historical USGS data into: 
### USGS_rain_daily data frame

In [47]:
USGSdat = pd.read_excel(path_USGS_data, None)
USGS_rain_daily = pd.DataFrame(columns=['Date'])

for i in USGSdat.keys():
    USGSdat[i] = USGSdat[i].rename(columns={"date": "Date", "Precip(in)" :i+"_USGS_RNF" })
    USGSdat[i] = USGSdat[i].replace(-99, np.nan) 
    USGSdat[i]['Date'] = pd.to_datetime(USGSdat[i]['Date'], errors='coerce')
    
    USGS_rain_daily = USGS_rain_daily.merge(USGSdat[i], how='outer', on='Date')
USGS_rain_daily = USGS_rain_daily.sort_values(by='Date')
USGS_rain_daily = USGS_rain_daily.reset_index(drop=True)   

# Process random WRCC data into: 
### WRCC_rain_daily data frame

In [48]:
files = os.listdir(path_WRCC_data)
cols = ['Year', 'Month', 'Day', 'Obs', 'Max', 'Min', 'Avg', 'junk1', 'Junk2', 'Total', 'SNFL', 'SD']
na_values = ['###M'  , '###M' , '#####M', '#####M' , '####M']

WRRC_rain_daily = pd.DataFrame(columns=['Date'])

for i in files:    
    keyn = i.split('.')[0]
    a = pd.read_csv(os.path.join(path_WRCC_data, i), delim_whitespace=True, index_col=False,names=cols,   na_values=na_values)
    a ['Date'] = pd.to_datetime(a[['Year', 'Month', 'Day']])
    frame = a[['Date','Total']]
    frame = frame.rename(columns={"Total" :keyn+"_WRCC_RNF" })
    
    WRRC_rain_daily = WRRC_rain_daily.merge(frame, how='outer', on='Date')

In [49]:
All_rain_daily.columns

Index(['Date', 'Tula_RNF', 'Aasu_RNF_in', 'Vaipito_RNF_in', 'Fagaitua_RNF_in',
       'Afono_RNF_in', 'Airport_RNF', 'ASCC_RNF', 'Aoloaufou_USGS_RNF',
       'Aua_USGS_RNF', 'Fagaitua_USGS_RNF', 'Malaeimi_USGS_RNF',
       'Pioa_USGS_RNF', 'Satala_USGS_RNF', 'Amouli_NCDC_RNF', 'Atuu_NCDC_RNF',
       'Fagaalu_NCDC_RNF', 'Malaeloa_NCDC_RNF', 'Vatia_NCDC_RNF',
       'taputimu_WRCC_RNF', 'day', 'month_year'],
      dtype='object')

# Merge up all the sites into one dataframe


In [50]:
# merge up all the files 
All_rain_daily = Tula_rain_daily.merge(ASPA_rain_daily, how='outer', on='Date')
All_rain_daily = All_rain_daily.merge(Airport_rain_daily, how='outer', on='Date')
All_rain_daily = All_rain_daily.merge(ASCC_rain_daily, how='outer', on='Date')
All_rain_daily = All_rain_daily.merge(USGS_rain_daily, how='outer', on='Date')
All_rain_daily = All_rain_daily.merge(NCDC_rain_daily, how='outer', on='Date')
All_rain_daily = All_rain_daily.merge(WRRC_rain_daily, how='outer', on='Date')

All_rain_daily = All_rain_daily.sort_values(by='Date')
All_rain_daily = All_rain_daily.reset_index(drop=True)   


# merge up the Aasu stations
All_rain_daily['Aasu_RNF_in'] = All_rain_daily['Aasu_RNF_in'].fillna(All_rain_daily['Aasufou_USGS_RNF'])
All_rain_daily['Aasu_RNF_in'] = All_rain_daily['Aasu_RNF_in'].fillna(All_rain_daily['Aasufou_NCDC_RNF'])
All_rain_daily['Aasu_RNF_in'] = All_rain_daily['Aasu_RNF_in'].fillna(All_rain_daily['aasufou_WRCC_RNF'])

del All_rain_daily['Aasufou_USGS_RNF']
del All_rain_daily['Aasufou_NCDC_RNF']
del All_rain_daily['aasufou_WRCC_RNF']

# merge up the Afono stations
All_rain_daily['Afono_RNF_in'] = All_rain_daily['Afono_RNF_in'].fillna(All_rain_daily['afono_WRCC_RNF'])
All_rain_daily['Afono_RNF_in'] = All_rain_daily['Afono_RNF_in'].fillna(All_rain_daily['Afono_NCDC_RNF'])

del All_rain_daily['afono_WRCC_RNF']
del All_rain_daily['Afono_NCDC_RNF']


# merge up the Vaipito stations
All_rain_daily['Vaipito_RNF_in'] = All_rain_daily['Vaipito_RNF_in'].fillna(All_rain_daily['VaipitoDivs_USGS_RNF'])
All_rain_daily['Vaipito_RNF_in'] = All_rain_daily['Vaipito_RNF_in'].fillna(All_rain_daily['Vaipitores_USGS_RNF'])

del All_rain_daily['VaipitoDivs_USGS_RNF']
del All_rain_daily['Vaipitores_USGS_RNF']

# useliess staions 
del All_rain_daily['PagoAirport_USGS_RNF']

All_rain_daily.to_csv('All_rain_daily.csv')


# Now process consolidated rainfall data into a fragments file

Read in the data and put together a string column that represents unique months, then make a list of those months 

In [51]:
data = All_rain_daily
# make a dope month-year column
data['month'] = data['Date'].apply(lambda x: x.month)    
data['year'] = data['Date'].apply(lambda x: x.year)
data['day'] = data['Date'].apply(lambda x: x.day)
data['month_str'] = data['month'].apply(lambda x: str(x))
data['yr_str'] = data['year'].apply(lambda x: str(x))
data['month_year'] = data['month_str']+"-"+data['yr_str']
del data['month'], data['year'], data['month_str'], data['yr_str']

unique_monthYrs = list(data['month_year'].unique())     # list of all the individual months in dataframe 

make a frame with each month yr and the sum of all rainfall that fell during that month for each station 

In [52]:
Rain_totals = pd.DataFrame(data={'MoYo': unique_monthYrs})                              # the dataframe of moyears and rainfall sums
for station in data.columns[1:-2]:
    sum_list = []
    for i in unique_monthYrs:
        a = data[data['month_year'] == i]
        mo_sum = a[station].sum(min_count=28)                                                        # take precip data to numeric and sum the result
        sum_list.append(mo_sum)     
    Rain_totals[station] = sum_list

## Compile the Fragments file with all station data

In [53]:
final_fragments_cols = ['Month', 'Gage_ID', 'Fragment_Set', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'day_7', 'day_8', 'day_9', 'day_10', 'day_11', 'day_12', 'day_13', 'day_14', 'day_15', 'day_16', 'day_17', 'day_18', 'day_19', 'day_20', 'day_21', 'day_22', 'day_23', 'day_24', 'day_25', 'day_26', 'day_27', 'day_28', 'day_29', 'day_30', 'day_31']
final_fragments = pd.DataFrame(columns=final_fragments_cols) 

for numba, station in enumerate(data.columns[1:-2]):

    each_station = data[['Date', station, 'day','month_year']]
    good_frames = pd.DataFrame(columns= list(each_station.columns)) 
    each_station_Rain_totals = Rain_totals[['MoYo', station]]

    try: 
        for moyos in unique_monthYrs:                                                 # this loop will remove any months where there are ANY NaN values from the given station 
            each_month_frame =  each_station[each_station['month_year'] == moyos]   
            if pd.notnull(each_month_frame[station].sum(skipna=False)):
                good_frames = pd.concat([good_frames, each_month_frame], axis=0)

        good_mo_years = list(good_frames['month_year'].unique())
        bigger_frame = pd.DataFrame(columns=final_fragments_cols) 
        for i in good_mo_years: 
            mini_month_frame = good_frames[good_frames['month_year'] == i]

            frags = []               
            for index, row in mini_month_frame.iterrows():                                    # Loping over each row in full station set of actual rain data   
                a = mini_month_frame[station][index] / each_station_Rain_totals[each_station_Rain_totals['MoYo'] == i][station].values[0]    # Caluclate the fragment value for the given actual rain value                                                    
                frags.append(a)
            frag_series = pd.Series(frags)  
            mini_month_frame['fragment'] = frag_series.values

            stick2_bigger_frame = pd.DataFrame(columns=final_fragments_cols, index= [0])  
            stick2_bigger_frame['Month'] =  mini_month_frame['month_year'].values[2].split('-')[0]   # add the month
            stick2_bigger_frame['Gage_ID'] =  station
            stick2_bigger_frame['Fragment_Set'] = mini_month_frame['month_year'].values[2].split('-')[1]  # add the year / fragment set

            for idx, y in enumerate(list(mini_month_frame['fragment'])):                # loop over each day
                stick2_bigger_frame.iloc[0, idx+3] = y                    # stick the days  fragment in the right column

            bigger_frame = bigger_frame.append(stick2_bigger_frame)         # stick the row onto the fragments frame           
        final_fragments = final_fragments.append(bigger_frame)   
    except:
        print('Issue with {} in {}'.format(station, moyos))
        
final_fragments = final_fragments.reset_index(drop=True)                 # reindex to start at 0
#final_fragments = final_fragments.assign(Gage_ID=(final_fragments['Gage_ID']).astype('category').cat.codes)  # Assign a gauge ID based on unique gauge names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [54]:
final_fragments

Unnamed: 0,Month,Gage_ID,Fragment_Set,day_1,day_2,day_3,day_4,day_5,day_6,day_7,...,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31
0,1,Tula_RNF,1987,0,0,0,0,0,0,0.109948,...,0.0052356,0.0052356,0,0,0.0418848,0.0314136,0.0052356,0.0104712,0.0157068,0.34555
1,2,Tula_RNF,1987,0.030303,0.013468,0.30303,0.141414,0.003367,0.013468,0,...,0,0,0.003367,0,0.461279,0.003367,0,,,
2,3,Tula_RNF,1987,0.294521,0.00684932,0,0,0.0342466,0.00684932,0.0753425,...,0,0,0,0,0,0,0,0.308219,0,0
3,4,Tula_RNF,1987,0.170455,0.102273,0,0.0909091,0.215909,0,0,...,0.0113636,0.136364,0,0.136364,0.0113636,0,0.0340909,0,0.0340909,
4,5,Tula_RNF,1987,0,0,0,0,0,0,0.166667,...,0,0,0,0,0,0,0.5,0,0,0
5,6,Tula_RNF,1987,0,0,0,0,0,0,0.0384615,...,0.163462,0.00961538,0,0,0,0.00961538,0,0,0,
6,7,Tula_RNF,1987,0,0,0,0,0,0,0.0344828,...,0,0,0,0,0,0,0,0.0344828,0,0
7,8,Tula_RNF,1987,0,0,0,0.176471,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,9,Tula_RNF,1987,,,,,,,,...,,,,,,,,,,
9,10,Tula_RNF,1987,0,0,0,0.0277778,0,0.277778,0.0833333,...,0,0,0,0,0,0.0277778,0,0.0277778,0,0


In [55]:
final_final_fragments = pd.DataFrame(columns = ['Month', 'Gage_ID', 'Fragment_Set', 'day_1', 'day_2', 'day_3', 'day_4',
       'day_5', 'day_6', 'day_7', 'day_8', 'day_9', 'day_10', 'day_11',
       'day_12', 'day_13', 'day_14', 'day_15', 'day_16', 'day_17', 'day_18',
       'day_19', 'day_20', 'day_21', 'day_22', 'day_23', 'day_24', 'day_25',
       'day_26', 'day_27', 'day_28', 'day_29', 'day_30', 'day_31']) 


site = []; gid = []; count = []
for numbah, i in enumerate(final_fragments['Gage_ID'].unique()):
    tmp_frame =  final_fragments[final_fragments['Gage_ID'] == i]
    tmp_frame = tmp_frame.assign(Frag_Set_ID=(tmp_frame['Fragment_Set']).astype('category').cat.codes)  # Assign a unique year number based on the number of years in the dataset
    tmp_frame['Frag_Set_ID'] =  tmp_frame['Frag_Set_ID']+1           # start index at one
    tmp_frame = tmp_frame.reset_index(drop=True)                     # dumb step needed to call up the name of the gauge
    spot = tmp_frame['Gage_ID'][1]  ; site.append(spot)              # record the gauge ID in a list
    tmp_frame['Gage_ID'] =  numbah+1                                 # assign the unique number instead of the name
    spotnum = tmp_frame['Gage_ID'][1]  ; gid.append(spotnum)         # record the gauge ID in a list
    tmp_frame['Fragment_Set'] = tmp_frame['Frag_Set_ID']             # replace
    count.append(len(tmp_frame['Fragment_Set'].unique()))            # add the number of years of data available
    
    del tmp_frame['Frag_Set_ID']                                     # remove useless colums
    final_final_fragments =  pd.concat([final_final_fragments,tmp_frame], axis=0)   # stick the frame from this gauge ont the big frame with all gauges
    

pd.DataFrame(list(zip(site, gid, count)), columns=['Gage','Gage_ID', 'Count'])      # make a key dataframe relating the name to the ID number and count number of avaialbe years

Unnamed: 0,Gage,Gage_ID,Count
0,Tula_RNF,1,30
1,Aasu_RNF_in,2,36
2,Vaipito_RNF_in,3,42
3,Afono_RNF_in,4,20
4,Airport_RNF,5,61
5,ASCC_RNF,6,4
6,Aoloaufou_USGS_RNF,7,9
7,Aua_USGS_RNF,8,4
8,Fagaitua_USGS_RNF,9,11
9,Malaeimi_USGS_RNF,10,19


#### now I need to modify the fragments file to fit SWB's formatting requirements-
basically this is doing a random selection of the fragments files into a couple of different sets, Note that this is un-ideal

In [58]:
month_nums = [1,2,3,4,5,6,7,8,9,10,11,12]
month_nums = ['1','2','3','4','5','6','7','8','9','10','11','12']
ID_nums = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]
Set_nums = [1,2,3,4,5,6,7,8,9,10]
Stu_fragments_File = pd.DataFrame(columns=['Month', 'Gage_ID', 'Fragment_Set', 'day_1', 'day_2', 'day_3', 'day_4',
       'day_5', 'day_6', 'day_7', 'day_8', 'day_9', 'day_10', 'day_11',
       'day_12', 'day_13', 'day_14', 'day_15', 'day_16', 'day_17', 'day_18',
       'day_19', 'day_20', 'day_21', 'day_22', 'day_23', 'day_24', 'day_25',
       'day_26', 'day_27', 'day_28', 'day_29', 'day_30', 'day_31'])

for s in Set_nums:
    for m in month_nums:
        for f in ID_nums:
            try:
                subset_1 = final_final_fragments[final_final_fragments['Gage_ID'] == f]
                subset_2 = subset_1[subset_1['Month'] == m]
                subset_3 = subset_2.sample(1)
                subset_3['Fragment_Set'] = s
            except: 
                print("month{}, IDnum_{} messed it up".format(m,f))
            
            Stu_fragments_File =  pd.concat([Stu_fragments_File,subset_3], axis=0) 
Stu_fragments_File = Stu_fragments_File.reset_index(drop=True)  
Stu_fragments_File = Stu_fragments_File.astype(np.double).round(4)
Stu_fragments_File['Month'] = Stu_fragments_File['Month'].astype(np.int)
Stu_fragments_File['Gage_ID'] = Stu_fragments_File['Gage_ID'].astype(np.int)
Stu_fragments_File['Fragment_Set'] = Stu_fragments_File['Fragment_Set'].astype(np.int)
Stu_fragments_File = Stu_fragments_File.fillna(value=0)

In [59]:
Frag_num = 3   # give the fragment set a unique identifier

Stu_fragments_File.to_csv('Rainfall_fragments_{}.prn'.format(Frag_num), sep=' ', index=False, header=False)       # record fragment set into a csv 

# Create Sequence file 

In [60]:
# use this to make the sequence file unique if want
Sq_num = 3

In [61]:
sim_nums = [1,2,3,4,5]
month_nums = [1,2,3,4,5,6,7,8,9,10,11,12]
yr = [1,2,3,4,5,6,7,8,9,10]


Stu_fragments_File['Month'] = Stu_fragments_File['Month'].astype(int)

Sequence_File = pd.DataFrame(columns=["simulation", "month", "frag_zone", "year", "random_number", "selected_set"])

for s in sim_nums:
    for m in month_nums:
        for f in Stu_fragments_File['Gage_ID'].unique():
            for y in yr:
                RN = random.random()
                subset_1 = Stu_fragments_File[Stu_fragments_File['Gage_ID'] == f]
                subset_2 = subset_1[subset_1['Month'] == m]
                if subset_2.empty:                                          #  Needed to handle instances where there is no fragment set for a given 
                    da_set ="THIS WILL MESS IT ALL UP!!!"
                else:
                    row_3 = subset_2.sample(1)
                    da_set = row_3['Fragment_Set'].values[0]
                    
                tmpframe = pd.DataFrame(data = {"simulation":[s], "month":[m], "frag_zone":f, "year":y,'random_number':RN, 'selected_set':da_set})
                Sequence_File =  pd.concat([Sequence_File,tmpframe], axis=0) 


In [63]:
Sequence_File.to_csv('Sequence_file_{}.prn'.format(Sq_num), sep=' ', index=False)