## Brainstorming
#### Analysis Questions 
- How many temperature readings in each month seem inaccurate?
    - Outliers are defined as readings more than 2 standard deviations above or below the mean for that month

#### Data Quality Assessment
- **First**:
    - Rename all the columns to shorter, simplified names
    - Collapse the repeated spaces between letters, numbers, or symbols in the column names into single spaces so the spacing is uniform before renaming (note: `FAWN_raw.columns.str.strip()` only trims leading/trailing whitespace, so `.str.replace()` is needed for spaces inside a name)  
- Check data types using `.info()`
    - displays: Column names, types, non-null counts, memory
- Check statistical description using `.describe()`  
- Use `.value_counts()` for a frequency count of each category (**TODO**)
- Finding outliers for max and min temperatures   
    - Compute `std()` of the max and min temperatures for each month, pooled across all three years, using `groupby()`  

#### Analysis 

In [27]:
import numpy as np
import pandas as pd

# --- Load the FAWN report and build a cleaned copy with short column names ---
FAWN_CSV_PATH = "data/FAWN_report.csv"

# Reference load with no date parsing, kept so the file can be inspected
# exactly as pandas reads it by default.
FAWN_raw_initial = pd.read_csv(FAWN_CSV_PATH)

# Working load: 'Period' is parsed with an explicit day-month-year format
# (e.g. 13-Mar-22) instead of relying on pandas' format inference.
FAWN_raw = pd.read_csv(FAWN_CSV_PATH, parse_dates=['Period'], date_format='%d-%b-%y')

# Source header (single-spaced) -> short name used throughout the analysis.
COLUMN_RENAMES = {
    "2m T avg (F)": "Temp_avg (F)",
    "2m T min (F)": "T_min (F)",
    "2m T max (F)": "T_max (F)",
    "2m DewPt avg (F)": "DewPt_avg (F)",
    "RelHum avg 2m (pct)": "RelHum_avg (pct)",
    "2m Rain tot (in)": "Rain_tot (in)",
    "2m Rain max over 15min (in)": "Rain_max over 15min (in)",
    "SolRad avg2m (w/m^2)": "SolRad_avg @ 2m (w/m^2)",
    "10m Wind avg (mph)": "Wind_avg @ 10m (mph)",
    "10m Wind min (mph)": "Wind_min @ 10m (mph)",
    "10m Wind max (mph)": "Wind_max @ 10m (mph)",
    "WDir avg10m (deg)": "Wind_Dir_avg @ 10m (deg)",
    "BP avg (mb)": "Barametric_Pre_avg (mb)",
    "N (# obs)": "N (#obs)",
    "2m WetBulb (F)": "WetBulb @ 2m (F)",
}

# Work on an independent copy: a plain `FAWN_clean = FAWN_raw` would only
# alias the frame, so every clean-up step would also rewrite FAWN_raw.
FAWN_clean = FAWN_raw.copy()

# Collapse every run of two or more spaces in the headers to a single space
# so the headers match the keys of COLUMN_RENAMES.
FAWN_clean.columns = FAWN_clean.columns.str.replace(r" {2,}", " ", regex=True)

# rename() returns a new frame; headers without an entry in COLUMN_RENAMES
# are left unchanged.
FAWN_clean = FAWN_clean.rename(columns=COLUMN_RENAMES)



# --- Display options and quick data-quality diagnostics ---
# Lift the column limit and stop wide frames from wrapping onto several
# lines when they are printed.
pd.set_option("display.width", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)

# Column dtypes, then the number of missing values per column.
print(f"\n{FAWN_clean.dtypes}\n")
print(f"{FAWN_clean.isna().sum()}\n")

# info() writes its report to stdout itself and returns None, so it is
# called directly; interpolating it into an f-string would print a stray
# "None" after the report.
print()
FAWN_clean.info()
print()

print(f"Shape/dimensions of DataFrame: {FAWN_clean.shape}\n")
print(f"Number of Elements: {FAWN_clean.size}\n")

# --- Minimum-temperature outliers by month ---
# A T_min reading is flagged when it lies more than two standard deviations
# above or below the mean of all T_min readings sharing its month name.
# NOTE(review): the statistics are grouped by month name only, so every row
# with that month name is pooled (all years, and presumably all stations if
# the file holds more than one) -- confirm this is the intended baseline.
FAWN_clean['Month'] = FAWN_clean['Period'].dt.month_name()
FAWN_clean['Day'] = FAWN_clean['Period'].dt.day_name()

# Build the grouping once and reuse it for both statistics.
T_min_by_month = FAWN_clean.groupby('Month')['T_min (F)']

# NOTE: despite its name, monthyly_std_T_min holds TWO standard deviations
# (the half-width of the accepted band). The existing names are kept so any
# later cell that refers to them keeps working.
monthyly_std_T_min = T_min_by_month.std() * 2
monthyly_mean_T_min = T_min_by_month.mean()

print(f"Two times Std: \n{monthyly_std_T_min}\n")
print(f"Mean of each month: \n{monthyly_mean_T_min}\n")

# Upper and lower limits of the accepted band, indexed by month name.
two_std_overMean = monthyly_mean_T_min + monthyly_std_T_min
two_std_underMean = monthyly_mean_T_min - monthyly_std_T_min

print(f"Two std above Mean: \n{two_std_overMean}\n")
print(f"Two std below Mean: \n{two_std_underMean}\n")

# Attach each row's monthly limits by looking its month name up in the
# per-month Series.
FAWN_clean['Two std above Temp_min Mean'] = FAWN_clean['Month'].map(two_std_overMean)
FAWN_clean['Two std below Temp_min Mean'] = FAWN_clean['Month'].map(two_std_underMean)

# From here on the frame is indexed by date; 'Period' is no longer a column.
FAWN_clean = FAWN_clean.set_index('Period')

# Strict comparisons: a reading exactly on a limit is not flagged, and a
# comparison involving NaN is False, so such rows are not flagged either.
above_upper_limit = FAWN_clean['T_min (F)'] > FAWN_clean['Two std above Temp_min Mean']
below_lower_limit = FAWN_clean['T_min (F)'] < FAWN_clean['Two std below Temp_min Mean']
Temp_min_outlier = FAWN_clean[above_upper_limit | below_lower_limit]

# Last expression of the cell, so this is what the notebook displays:
# the first 20 flagged minimum temperatures.
Temp_min_outlier['T_min (F)'].head(20)




FAWN Station                        object
Period                      datetime64[ns]
Temp_avg (F)                       float64
T_min (F)                          float64
T_max (F)                          float64
DewPt_avg (F)                      float64
RelHum_avg (pct)                     int64
Rain_tot (in)                      float64
Rain_max over 15min (in)           float64
SolRad_avg @ 2m (w/m^2)            float64
Wind_avg @ 10m (mph)               float64
Wind_min @ 10m (mph)               float64
Wind_max @ 10m (mph)               float64
Wind_Dir_avg @ 10m (deg)             int64
Barametric_Pre_avg (mb)              int64
N (#obs)                             int64
WetBulb @ 2m (F)                   float64
dtype: object

FAWN Station                0
Period                      0
Temp_avg (F)                0
T_min (F)                   0
T_max (F)                   0
DewPt_avg (F)               0
RelHum_avg (pct)            0
Rain_tot (in)               0
Rain_max over

Period
2022-03-13    35.10
2023-07-05    78.98
2023-07-06    77.79
2023-07-09    79.36
2023-07-21    79.07
2024-06-09    78.44
2024-07-12    77.79
2024-07-26    78.80
2024-08-07    81.45
2024-08-08    81.09
2022-01-01    72.46
2022-02-14    26.37
2022-03-02    34.19
2022-03-12    31.98
2022-03-13    26.66
2022-03-14    31.29
2022-04-09    39.17
2022-04-10    40.05
2022-05-08    55.22
2022-05-10    55.11
Name: T_min (F), dtype: float64