In [4]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)  # Shows all columns

In [5]:
df = pd.read_csv('weatherAUS.csv')

In [6]:
print(df.head(5))

         Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  2008-12-01   Albury     13.4     22.9       0.6          NaN       NaN   
1  2008-12-02   Albury      7.4     25.1       0.0          NaN       NaN   
2  2008-12-03   Albury     12.9     25.7       0.0          NaN       NaN   
3  2008-12-04   Albury      9.2     28.0       0.0          NaN       NaN   
4  2008-12-05   Albury     17.5     32.3       1.0          NaN       NaN   

  WindGustDir  WindGustSpeed WindDir9am WindDir3pm  WindSpeed9am  \
0           W           44.0          W        WNW          20.0   
1         WNW           44.0        NNW        WSW           4.0   
2         WSW           46.0          W        WSW          19.0   
3          NE           24.0         SE          E          11.0   
4           W           41.0        ENE         NW           7.0   

   WindSpeed3pm  Humidity9am  Humidity3pm  Pressure9am  Pressure3pm  Cloud9am  \
0          24.0         71.0         22.0      

In [7]:
print(len(df))

# Convert 'Date' to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

# Sort by 'Location' and 'Date'
df.sort_values(by=['Location', 'Date'], inplace=True)

# Shift the 'MaxTemp' column to get the temperature of the next day
df['NextDayMaxTemp'] = df.groupby('Location')['MaxTemp'].shift(-1)

# Display the resulting DataFrame
print(df.head())

145460
            Date  Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
96320 2008-07-01  Adelaide      8.8     15.7       5.0          1.6       2.6   
96321 2008-07-02  Adelaide     12.7     15.8       0.8          1.4       7.8   
96322 2008-07-03  Adelaide      6.2     15.1       0.0          1.8       2.1   
96323 2008-07-04  Adelaide      5.3     15.9       0.0          1.4       8.0   
96324 2008-07-05  Adelaide      9.8     15.4       0.0          NaN       0.9   

      WindGustDir  WindGustSpeed WindDir9am WindDir3pm  WindSpeed9am  \
96320          NW           48.0         SW          W          13.0   
96321          SW           35.0        SSW         SW          13.0   
96322           W           20.0        NNE         SW           2.0   
96323         NNE           30.0        NNE         NE           6.0   
96324           N           30.0        NNE         NE           9.0   

       WindSpeed3pm  Humidity9am  Humidity3pm  Pressure9am  Pressure3pm  

In [8]:
print(df.columns)
df = df.drop('Date', axis=1)
df = df.drop('RainToday', axis=1)
df = df.drop('RainTomorrow', axis=1)
print(df.columns)

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow', 'NextDayMaxTemp'],
      dtype='object')
Index(['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'NextDayMaxTemp'],
      dtype='object')


In [9]:
print(len(df))
df = df.dropna()
print(len(df))

145460
56530


In [10]:
print(df.head(5))

            Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
136071  AliceSprings     19.6     37.6       0.0         11.2       9.8   
136072  AliceSprings     21.0     39.1       1.2          9.0      12.2   
136073  AliceSprings     22.9     40.9       0.0         11.6      12.6   
136074  AliceSprings     24.7     40.5       0.0         16.0       7.8   
136075  AliceSprings     23.4     32.4       0.2         12.2       4.1   

       WindGustDir  WindGustSpeed WindDir9am WindDir3pm  WindSpeed9am  \
136071         WNW           87.0        NNE         NE          20.0   
136072         NNW           41.0        NNW          S          24.0   
136073         NNW           48.0        ENE          N           7.0   
136074         WNW           72.0        SSW          W           2.0   
136075         SSW           46.0          S          S           9.0   

        WindSpeed3pm  Humidity9am  Humidity3pm  Pressure9am  Pressure3pm  \
136071          11.0         17.0 

In [11]:
print(df['Location'].nunique())
print(df['WindGustDir'].nunique())

26
16


In [12]:
# Function to convert wind direction to angle
def direction_to_angle(direction):
    directions = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE', 'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW']
    return directions.index(direction) * 360 / len(directions)

columns_that_contain_direction = [col for col in df.columns if 'Dir' in col]

for col in columns_that_contain_direction:
    radians = np.deg2rad(df[col].apply(direction_to_angle))
    df[col + '_x'] = np.cos(radians)
    df[col + '_y'] = np.sin(radians)
    # drop the original column
    df = df.drop(col, axis=1)

print(df.columns)

Index(['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
       'Temp9am', 'Temp3pm', 'NextDayMaxTemp', 'WindGustDir_x',
       'WindGustDir_y', 'WindDir9am_x', 'WindDir9am_y', 'WindDir3pm_x',
       'WindDir3pm_y'],
      dtype='object')


In [13]:
print(df.head(5))

            Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
136071  AliceSprings     19.6     37.6       0.0         11.2       9.8   
136072  AliceSprings     21.0     39.1       1.2          9.0      12.2   
136073  AliceSprings     22.9     40.9       0.0         11.6      12.6   
136074  AliceSprings     24.7     40.5       0.0         16.0       7.8   
136075  AliceSprings     23.4     32.4       0.2         12.2       4.1   

        WindGustSpeed  WindSpeed9am  WindSpeed3pm  Humidity9am  Humidity3pm  \
136071           87.0          20.0          11.0         17.0         16.0   
136072           41.0          24.0           9.0         18.0         13.0   
136073           48.0           7.0          24.0         17.0         11.0   
136074           72.0           2.0          22.0         29.0         24.0   
136075           46.0           9.0          17.0         58.0         43.0   

        Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  \


In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
df["Location"] = LabelEncoder().fit_transform(df["Location"])

In [16]:
# print(df.head(5))
print(df["Location"].nunique())

26


In [17]:
# Save the dataframe to a csv file
df.to_csv('weatherAUS_processed.csv', index=False)

NameError: name 'df' is not defined