## Brainstorming
#### Analysis Questions 
- How many minimum-temperature readings in each month appear to be inaccurate (outliers)?
    - An outlier is defined as a reading more than 2 standard deviations above or below the mean for its month

#### Data Quality Assessment
- **First**:
    - Renaming all the columns to shorter, simplified names
    - Collapsing the repeated spaces inside the column names into single spaces so the spacing is uniform (this needs `.str.replace()`; `.str.strip()` only removes leading and trailing whitespace)  
- Check data types using `.info()`
    - displays: Column names, types, non-null counts, memory
- Check statistical description using `.describe()`  
- Use `.value_counts()` for frequency count of each category (**need to do**)
- Finding outliers for max and min temperatures   
    - find `std()` for max and min temperatures, by month, for all three years using `groupby()`  

#### Analysis 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# --- Load the FAWN weather report -------------------------------------------
# FAWN_raw_initial is read with default parsing (no date conversion); nothing
# below modifies it.
FAWN_raw_initial = pd.read_csv("data/FAWN_report.csv")

# Second read parses "Period" as a datetime so the .dt accessor works later.
FAWN_raw = pd.read_csv("data/FAWN_report.csv", parse_dates=["Period"], date_format="%d-%b-%y")
# NOTE: this is an alias, not a copy. The in-place column cleaning and the
# helper columns added below are therefore also visible through FAWN_raw,
# until FAWN_clean is rebound by set_index() further down.
FAWN_clean = FAWN_raw

# --- Clean the column names -------------------------------------------------
# Collapse every run of whitespace to one space and trim the ends, so the
# header always matches the keys of the rename mapping below (a plain
# replace("  ", " ") would leave triple spaces, tabs and padding behind).
FAWN_clean.columns = FAWN_clean.columns.str.replace(r"\s+", " ", regex=True).str.strip()

# Shorter names; inplace=True keeps the FAWN_raw alias in sync.
FAWN_clean.rename(
    columns={
        "2m T avg (F)": "Temp_avg (F)",
        "2m T min (F)": "T_min (F)",
        "2m T max (F)": "T_max (F)",
        "2m DewPt avg (F)": "DewPt_avg (F)",
        "RelHum avg 2m (pct)": "RelHum_avg (pct)",
        "2m Rain tot (in)": "Rain_tot (in)",
        "2m Rain max over 15min (in)": "Rain_max over 15min (in)",
        "SolRad avg2m (w/m^2)": "SolRad_avg @ 2m (w/m^2)",
        "10m Wind avg (mph)": "Wind_avg @ 10m (mph)",
        "10m Wind min (mph)": "Wind_min @ 10m (mph)",
        "10m Wind max (mph)": "Wind_max @ 10m (mph)",
        "WDir avg10m (deg)": "Wind_Dir_avg @ 10m (deg)",
        "BP avg (mb)": "Barametric_Pre_avg (mb)",
        "N (# obs)": "N (#obs)",
        "2m WetBulb (F)": "WetBulb @ 2m (F)",
    },
    inplace=True,
)

# Show every column on one line when frames are printed.
pd.set_option("display.width", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)

# --- Data-quality overview --------------------------------------------------
print(f"\n{FAWN_clean.dtypes}\n")
print(f"{FAWN_clean.isna().sum()}\n")

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; interpolating it into an f-string printed a stray "None".
print()
FAWN_clean.info()
print()
print(f"Shape/dimensions of DataFrame: {FAWN_clean.shape}\n")
print(f"Number of Elements: {FAWN_clean.size}\n")

# --- Monthly thresholds for the minimum temperature -------------------------
# Readings are grouped by month NAME, so the same calendar month of every year
# in the file falls into one group.
FAWN_clean["Month"] = FAWN_clean["Period"].dt.month_name()
FAWN_clean["Day"] = FAWN_clean["Period"].dt.day_name()

t_min_by_month = FAWN_clean.groupby("Month")["T_min (F)"]
# NOTE: despite its name, monthyly_std_T_min holds TWO standard deviations.
monthyly_std_T_min = t_min_by_month.std() * 2
monthyly_mean_T_min = t_min_by_month.mean()

print(f"Two times Std: \n{monthyly_std_T_min}\n")
print(f"Mean of each month: \n{monthyly_mean_T_min}\n")

# Upper and lower outlier limits per month: mean +/- 2 * std.
two_std_overMean = monthyly_std_T_min + monthyly_mean_T_min
two_std_underMean = monthyly_mean_T_min - monthyly_std_T_min

print(f"Two std above Mean: \n{two_std_overMean}\n")
print(two_std_underMean)

# Attach each row's monthly limits by looking up its month name.
FAWN_clean["Two std above Temp_min Mean"] = FAWN_clean["Month"].map(two_std_overMean)
FAWN_clean["Two std below Temp_min Mean"] = FAWN_clean["Month"].map(two_std_underMean)

# From here on FAWN_clean is a new frame indexed by date (no longer FAWN_raw).
FAWN_clean = FAWN_clean.set_index("Period")

# --- Flag the outliers ------------------------------------------------------
# Comparisons against NaN are False, so rows with a missing T_min are not flagged.
Temp_min_outlier = FAWN_clean[
    (FAWN_clean["T_min (F)"] > FAWN_clean["Two std above Temp_min Mean"])
    | (FAWN_clean["T_min (F)"] < FAWN_clean["Two std below Temp_min Mean"])
]

# Answer to the analysis question: number of flagged readings in each month
# (months without any outlier are reported as 0).
outliers_per_month = (
    Temp_min_outlier.groupby("Month").size().reindex(monthyly_mean_T_min.index, fill_value=0)
)
print(f"\nNumber of T_min outliers per month: \n{outliers_per_month}\n")

# Last expression of the cell: the notebook displays the first flagged readings.
Temp_min_outlier["T_min (F)"].head(20)

In [None]:
# Histogram of the daily minimum temperature with the readings outside
# mean +/- 2 standard deviations highlighted, plus a fitted normal curve.
# NOTE: the bounds in this cell come from the mean/std of the WHOLE T_min
# series (one global threshold), not from per-month statistics.

# Minimum-temperature readings, missing values removed
temp_data = FAWN_clean["T_min (F)"].dropna()

# Summary statistics of the whole series
mean_temp = temp_data.mean()
std_temp = temp_data.std()

# Outlier limits: two standard deviations either side of the mean
upper_bound = mean_temp + 2 * std_temp
lower_bound = mean_temp - 2 * std_temp

# Split the readings into those outside and those inside the limits
outliers = temp_data[(temp_data > upper_bound) | (temp_data < lower_bound)]
non_outliers = temp_data[(temp_data <= upper_bound) & (temp_data >= lower_bound)]

fig, ax = plt.subplots(figsize=(14, 7))

# 50 equal-width bins shared by both histograms
bin_edges = np.linspace(temp_data.min(), temp_data.max(), 51)

# Normalise BOTH histograms by the total number of readings. Using
# density=True on each subset would scale each one to area 1 on its own,
# which blows the few outliers up to the size of the main distribution and
# makes neither histogram comparable with the normal PDF drawn below.
bin_width = bin_edges[1] - bin_edges[0]
density_per_reading = 1.0 / (len(temp_data) * bin_width)

ax.hist(
    non_outliers,
    bins=bin_edges,
    weights=np.full(len(non_outliers), density_per_reading),
    alpha=0.6,
    color="blue",
    label="Non_outliers",
    edgecolor="black",
)
ax.hist(
    outliers,
    bins=bin_edges,
    weights=np.full(len(outliers), density_per_reading),
    alpha=0.9,
    color="red",
    label="outliers",
    edgecolor="darkred",
    linewidth=1.5,
)

# Normal PDF with the sample mean/std, evaluated at the bin edges
normal_curve = stats.norm.pdf(bin_edges, mean_temp, std_temp)
ax.plot(bin_edges, normal_curve, "green", linewidth=3, label="Normal distribution")

# Vertical markers for the mean and the two outlier limits
ax.axvline(mean_temp, color="black", linestyle="--", linewidth=2, label="mean_temp in Fahrenheit")
ax.axvline(
    upper_bound,
    color="orange",
    linestyle="--",
    linewidth=1.5,
    label="mean_temp + 2*std_temp in Fahrenheit",
)
ax.axvline(
    lower_bound,
    color="red",
    linestyle="--",
    linewidth=1.5,
    label="mean_temp - 2*std_temp  in Fahrenheit",
)

# Axis labels, title, legend and grid
ax.set_xlabel("Temperature in Fahrenheit", fontsize=12, fontweight="bold")
ax.set_ylabel("Density", fontsize=12, fontweight="bold")
ax.set_title(
    "Distribution of minimum temperature with outliers\n(2 standard deviation)",
    fontsize=14,
    fontweight="bold",
)
ax.legend(loc="upper right")
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()