# Q2: TEMPERATURE DATA ANALYSIS

In [28]:
# LIBRARIES
import os
import glob
import pandas as pd

In [29]:
# Collect every CSV in the "temperatures" folder. glob returns paths in an
# arbitrary, OS-dependent order, so sort them to keep the row order of the
# combined table reproducible across machines and re-runs.
csv_pattern = os.path.join("temperatures", "*.csv")
csv_files = sorted(glob.glob(csv_pattern))

# Fail early with a clear message; otherwise pd.concat raises the much less
# helpful "No objects to concatenate" when the folder is missing or empty.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found matching {csv_pattern!r}")

# One DataFrame per file. The list itself is kept because the validation cell
# counts how many files were loaded.
dataframes = [pd.read_csv(csv_path) for csv_path in csv_files]

# Stack all files into a single table with a fresh 0..n-1 index.
df = pd.concat(dataframes, ignore_index=True)
df.head()


Unnamed: 0,STATION_NAME,STN_ID,LAT,LON,January,February,March,April,May,June,July,August,September,October,November,December
0,ADELAIDE-KENT-TOWN,23090,-34.92,138.62,31.48,31.37,28.12,24.81,21.28,17.92,17.2,17.87,20.54,22.98,26.65,28.38
1,ALBANY-AIRPORT-COMPARISON,9741,-34.94,117.8,25.24,26.03,25.45,23.5,20.55,18.0,16.95,17.02,18.34,19.52,21.85,23.75
2,ALICE-SPRINGS-AIRPORT,15590,-23.8,133.89,38.4,37.32,35.35,31.37,25.18,21.06,20.52,23.71,29.07,31.41,34.38,36.06
3,AMBERLEY-AMO,40004,-27.63,152.71,32.9,31.87,31.21,29.2,26.06,23.38,22.88,24.12,27.58,29.5,31.04,32.28
4,BARCALDINE-POST-OFFICE,36007,-23.55,145.29,38.03,36.21,35.41,31.73,27.46,25.21,24.64,26.46,30.75,34.13,35.93,37.41


### Validation of number of files loaded

In [30]:
# Sanity check on the load: number of files read, number of distinct station
# names, and how many rows each station contributes on average.
N_documents = len(dataframes)
N_register = len(df)
num_stations = len(df["STATION_NAME"].drop_duplicates())

# Rows per station; the message below reports this figure as the number of years.
documents2 = N_register / num_stations

print(f"we loaded {N_documents} documents for a total of {num_stations} stations.This is teh information of {documents2} years")

we loaded 20 documents for a total of 112 stations.This is teh information of 20.0 years


### Normalization of table
The datasets are not normalized: values that belong to a single variable (the month) are organized as separate columns, which makes the data awkward to work with. To fix this, we transform the columns January, February, March, ... into a single column named `month`, with the corresponding values stored in a `Temp` column. This lets us work with the data and apply data-analysis operations correctly.

In [31]:
# Reshape from wide (one column per month) to long (one row per
# station/month pair): month names go to "month", their values to "Temp".
station_columns = ["STATION_NAME", "STN_ID", "LAT", "LON"]
month_columns = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

dfn = df.melt(
    id_vars=station_columns,
    value_vars=month_columns,
    var_name="month",
    value_name="Temp",
)
dfn.head()

Unnamed: 0,STATION_NAME,STN_ID,LAT,LON,month,Temp
0,ADELAIDE-KENT-TOWN,23090,-34.92,138.62,January,31.48
1,ALBANY-AIRPORT-COMPARISON,9741,-34.94,117.8,January,25.24
2,ALICE-SPRINGS-AIRPORT,15590,-23.8,133.89,January,38.4
3,AMBERLEY-AMO,40004,-27.63,152.71,January,32.9
4,BARCALDINE-POST-OFFICE,36007,-23.55,145.29,January,38.03


### Seasons
Classify months into seasons


In [32]:
# Seasons listed with the months they contain (December-February is Summer here).
months_by_season = {
    "Summer": ("December", "January", "February"),
    "Autumn": ("March", "April", "May"),
    "Winter": ("June", "July", "August"),
    "Spring": ("September", "October", "November"),
}

# Invert the grouping into the month -> season lookup used by Series.map.
season = {
    month_name: season_name
    for season_name, month_names in months_by_season.items()
    for month_name in month_names
}

# Any month value missing from the lookup maps to NaN and is labelled "unclassified".
season_labels = dfn["month"].map(season)
dfn["SEASON"] = season_labels.fillna("unclassified")
dfn.head(10)

Unnamed: 0,STATION_NAME,STN_ID,LAT,LON,month,Temp,SEASON
0,ADELAIDE-KENT-TOWN,23090,-34.92,138.62,January,31.48,Summer
1,ALBANY-AIRPORT-COMPARISON,9741,-34.94,117.8,January,25.24,Summer
2,ALICE-SPRINGS-AIRPORT,15590,-23.8,133.89,January,38.4,Summer
3,AMBERLEY-AMO,40004,-27.63,152.71,January,32.9,Summer
4,BARCALDINE-POST-OFFICE,36007,-23.55,145.29,January,38.03,Summer
5,BATHURST-AGRICULTURAL-STATION,63005,-33.43,149.56,January,30.21,Summer
6,BIRDSVILLE-AIRPORT,38026,-25.9,139.35,January,40.88,Summer
7,BOULIA-AIRPORT,38003,-22.91,139.9,January,43.31,Summer
8,BOURKE-AIRPORT-AWS,48245,-30.04,145.95,January,38.01,Summer
9,BRIDGETOWN-COMPARISON,9510,-33.96,116.14,January,30.34,Summer


In [33]:
# Mean of every Temp value falling in each season, rounded to one decimal.
temps_by_season = dfn.groupby("SEASON")["Temp"]
dfn_AVG_TEMP_SEASON = temps_by_season.mean().round(1)
dfn_AVG_TEMP_SEASON

SEASON
Autumn    27.3
Spring    27.4
Summer    32.1
Winter    21.1
Name: Temp, dtype: float64

In [34]:
file_name = "average_temp.txt"

# Mode "w" already truncates an existing file, so no prior os.remove is needed.
# The encoding is pinned because every data line contains "°": without it,
# open() falls back to the platform default, which can produce different bytes
# on different machines or raise UnicodeEncodeError on ASCII-only locales.
with open(file_name, "w", encoding="utf-8") as f:
    f.write("Season\tTemperature\n")
    # The loop variable is deliberately not called `season`, so the
    # month -> season dict of that name is not overwritten with a string.
    for season_name, avg in dfn_AVG_TEMP_SEASON.items():
        f.write(f"{season_name}: {avg}°C\n")

print("txt created successfully")
        

txt created successfully


### Temperature Range

In [35]:
# Per-station minimum, maximum and mean over every Temp value in dfn.
dfn_TEMP_RANGE = (
    dfn.groupby("STATION_NAME")["Temp"]
    .agg(Temp_min="min", Temp_max="max", Temp_avg="mean")
    .reset_index()
)

# Range = hottest value minus coldest value recorded for the station.
dfn_TEMP_RANGE["Temp_Range"] = dfn_TEMP_RANGE["Temp_max"] - dfn_TEMP_RANGE["Temp_min"]
dfn_TEMP_RANGE = dfn_TEMP_RANGE.sort_values(by="Temp_Range", ascending=False)

# Keep every station that ties for the largest range, not only the first row.
MAX_RANGE = dfn_TEMP_RANGE["Temp_Range"].max()
STATION_MAX_RANGE = dfn_TEMP_RANGE[dfn_TEMP_RANGE["Temp_Range"] == MAX_RANGE]

file_name = "largest_temp_range_station.txt"

# Mode "w" truncates any existing file, so a separate os.remove is unnecessary.
# The encoding is pinned because the output contains "°"; the platform default
# encoding is not guaranteed to be able to encode it identically everywhere.
with open(file_name, "w", encoding="utf-8") as f:
    for _, row in STATION_MAX_RANGE.iterrows():
        f.write(f"Station {row['STATION_NAME']}: Range {round(row['Temp_Range'],1)}°C (Max: {row['Temp_max']}°C, Min: {row['Temp_min']}°C)\n")

print("txt created successfully")

txt created successfully


### Temperature Stability

In [36]:
# Standard deviation of Temp per station (pandas' default sample std, ddof=1).
# NOTE: values are rounded to 3 decimals *before* the min/max selection below,
# so stations whose deviations agree to three decimals are reported as ties.
dfn_std = dfn.groupby("STATION_NAME")["Temp"].agg(
    STD="std"
).reset_index().round(3)

MIN_STD = dfn_std["STD"].min()
MAX_STD = dfn_std["STD"].max()

# Rows for the most stable (smallest STD) and most variable (largest STD) stations.
STATION_MAX_MIN_STD = dfn_std[(dfn_std["STD"] == MIN_STD) | (dfn_std["STD"] == MAX_STD)]

file_name = "temperature_stability_stations.txt"

# Mode "w" truncates any existing file, so a separate os.remove is unnecessary.
# The encoding is pinned because the output contains "°"; the platform default
# encoding is not guaranteed to be able to encode it identically everywhere.
with open(file_name, "w", encoding="utf-8") as f:
    # Most stable station(s): smallest standard deviation.
    for _, row in STATION_MAX_MIN_STD[STATION_MAX_MIN_STD["STD"] == MIN_STD].iterrows():
        f.write(f"Most Stable: Station {row['STATION_NAME']}: StdDev {row['STD']}°C\n")

    # Most variable station(s): largest standard deviation.
    for _, row in STATION_MAX_MIN_STD[STATION_MAX_MIN_STD["STD"] == MAX_STD].iterrows():
        f.write(f"Most Variable: Station {row['STATION_NAME']}: StdDev {row['STD']}°C\n")

print("txt created successfully")

txt created successfully
