In [59]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import style
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer 
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [60]:
weather_full_data = pd.read_csv("weatherAUS.csv")

In [61]:
weather_full_data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [62]:
weather_full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           142193 non-null  object 
 1   Location       142193 non-null  object 
 2   MinTemp        141556 non-null  float64
 3   MaxTemp        141871 non-null  float64
 4   Rainfall       140787 non-null  float64
 5   Evaporation    81350 non-null   float64
 6   Sunshine       74377 non-null   float64
 7   WindGustDir    132863 non-null  object 
 8   WindGustSpeed  132923 non-null  float64
 9   WindDir9am     132180 non-null  object 
 10  WindDir3pm     138415 non-null  object 
 11  WindSpeed9am   140845 non-null  float64
 12  WindSpeed3pm   139563 non-null  float64
 13  Humidity9am    140419 non-null  float64
 14  Humidity3pm    138583 non-null  float64
 15  Pressure9am    128179 non-null  float64
 16  Pressure3pm    128212 non-null  float64
 17  Cloud9am       88536 non-null

In [63]:
#Dropping the columns which will make us do the most work : ie: the date {in string format} and the columns with a lot of null values 
weather_half_full_data = weather_full_data.drop(columns = ["Date", "Evaporation", "Sunshine", "Cloud9am", "Cloud3pm"])

In [64]:
print(weather_half_full_data.head())
weather_half_full_data.shape

  Location  MinTemp  MaxTemp  Rainfall  ... Temp9am  Temp3pm RainToday RainTomorrow
0   Albury     13.4     22.9       0.6  ...    16.9     21.8        No           No
1   Albury      7.4     25.1       0.0  ...    17.2     24.3        No           No
2   Albury     12.9     25.7       0.0  ...    21.0     23.2        No           No
3   Albury      9.2     28.0       0.0  ...    18.1     26.5        No           No
4   Albury     17.5     32.3       1.0  ...    17.8     29.7        No           No

[5 rows x 18 columns]


(142193, 18)

In [65]:
#Dropping all null values and establishing the dataset we'll work with
weather_data = weather_half_full_data.dropna()
print(weather_data)
weather_data.shape

       Location  MinTemp  MaxTemp  ...  Temp3pm RainToday  RainTomorrow
0        Albury     13.4     22.9  ...     21.8        No            No
1        Albury      7.4     25.1  ...     24.3        No            No
2        Albury     12.9     25.7  ...     23.2        No            No
3        Albury      9.2     28.0  ...     26.5        No            No
4        Albury     17.5     32.3  ...     29.7        No            No
...         ...      ...      ...  ...      ...       ...           ...
142188    Uluru      3.5     21.8  ...     20.9        No            No
142189    Uluru      2.8     23.4  ...     22.4        No            No
142190    Uluru      3.6     25.3  ...     24.5        No            No
142191    Uluru      5.4     26.9  ...     26.1        No            No
142192    Uluru      7.8     27.0  ...     26.0        No            No

[112925 rows x 18 columns]


(112925, 18)

In [66]:
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112925 entries, 0 to 142192
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Location       112925 non-null  object 
 1   MinTemp        112925 non-null  float64
 2   MaxTemp        112925 non-null  float64
 3   Rainfall       112925 non-null  float64
 4   WindGustDir    112925 non-null  object 
 5   WindGustSpeed  112925 non-null  float64
 6   WindDir9am     112925 non-null  object 
 7   WindDir3pm     112925 non-null  object 
 8   WindSpeed9am   112925 non-null  float64
 9   WindSpeed3pm   112925 non-null  float64
 10  Humidity9am    112925 non-null  float64
 11  Humidity3pm    112925 non-null  float64
 12  Pressure9am    112925 non-null  float64
 13  Pressure3pm    112925 non-null  float64
 14  Temp9am        112925 non-null  float64
 15  Temp3pm        112925 non-null  float64
 16  RainToday      112925 non-null  object 
 17  RainTomorrow   112925 non-nul

In [67]:
#Making the "Yes" and "No" 
label = LabelEncoder()

In [68]:
weather_data["RainToday"] = label.fit_transform(weather_data["RainToday"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [69]:
weather_data.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Albury,13.4,22.9,0.6,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,21.8,0,No
1,Albury,7.4,25.1,0.0,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,24.3,0,No
2,Albury,12.9,25.7,0.0,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,23.2,0,No
3,Albury,9.2,28.0,0.0,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,26.5,0,No
4,Albury,17.5,32.3,1.0,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,29.7,0,No


In [70]:
weather_data["RainTomorrow"] = label.fit_transform(weather_data["RainTomorrow"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [71]:
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112925 entries, 0 to 142192
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Location       112925 non-null  object 
 1   MinTemp        112925 non-null  float64
 2   MaxTemp        112925 non-null  float64
 3   Rainfall       112925 non-null  float64
 4   WindGustDir    112925 non-null  object 
 5   WindGustSpeed  112925 non-null  float64
 6   WindDir9am     112925 non-null  object 
 7   WindDir3pm     112925 non-null  object 
 8   WindSpeed9am   112925 non-null  float64
 9   WindSpeed3pm   112925 non-null  float64
 10  Humidity9am    112925 non-null  float64
 11  Humidity3pm    112925 non-null  float64
 12  Pressure9am    112925 non-null  float64
 13  Pressure3pm    112925 non-null  float64
 14  Temp9am        112925 non-null  float64
 15  Temp3pm        112925 non-null  float64
 16  RainToday      112925 non-null  int64  
 17  RainTomorrow   112925 non-nul

In [72]:
#Turning objects into categorical data
to_be_encoded = weather_data[["Location", "WindGustDir", "WindDir9am", "WindDir3pm"]].copy()
to_be_encoded

Unnamed: 0,Location,WindGustDir,WindDir9am,WindDir3pm
0,Albury,W,W,WNW
1,Albury,WNW,NNW,WSW
2,Albury,WSW,W,WSW
3,Albury,NE,SE,E
4,Albury,W,ENE,NW
...,...,...,...,...
142188,Uluru,E,ESE,E
142189,Uluru,E,SE,ENE
142190,Uluru,NNW,SE,N
142191,Uluru,N,SE,WNW


In [73]:
weather_dummies = pd.get_dummies(to_be_encoded)

In [74]:
weather_dummies.shape

(112925, 92)

In [75]:
weather_data = weather_data = weather_data.drop(["Location", "WindGustDir", "WindDir9am", "WindDir3pm"], axis = 1)

In [76]:
weather_data.shape

(112925, 14)

In [91]:
weather_data = pd.concat([weather_data, weather_dummies], axis = 1).reset_index(drop=True)

In [93]:
weather_data.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Location_Adelaide,Location_Albury,Location_AliceSprings,Location_BadgerysCreek,Location_Ballarat,Location_Bendigo,Location_Brisbane,Location_Cairns,Location_Canberra,Location_Cobar,Location_CoffsHarbour,Location_Dartmoor,Location_Darwin,Location_GoldCoast,Location_Hobart,Location_Katherine,Location_Launceston,Location_Melbourne,Location_MelbourneAirport,Location_Mildura,Location_Moree,Location_MountGambier,Location_Nhil,Location_NorahHead,Location_NorfolkIsland,Location_Nuriootpa,...,WindGustDir_S,WindGustDir_SE,WindGustDir_SSE,WindGustDir_SSW,WindGustDir_SW,WindGustDir_W,WindGustDir_WNW,WindGustDir_WSW,WindDir9am_E,WindDir9am_ENE,WindDir9am_ESE,WindDir9am_N,WindDir9am_NE,WindDir9am_NNE,WindDir9am_NNW,WindDir9am_NW,WindDir9am_S,WindDir9am_SE,WindDir9am_SSE,WindDir9am_SSW,WindDir9am_SW,WindDir9am_W,WindDir9am_WNW,WindDir9am_WSW,WindDir3pm_E,WindDir3pm_ENE,WindDir3pm_ESE,WindDir3pm_N,WindDir3pm_NE,WindDir3pm_NNE,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,13.4,22.9,0.6,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,21.8,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,7.4,25.1,0.0,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,24.3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,12.9,25.7,0.0,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,23.2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,9.2,28.0,0.0,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,26.5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,29.7,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [94]:
#Scaling the other values
scaler = StandardScaler()

In [95]:
to_be_scaled = weather_data[["MinTemp", "MaxTemp", "Rainfall", "WindGustSpeed", "WindSpeed9am", "WindSpeed3pm", "Humidity9am", "Humidity3pm", "Pressure9am", "Pressure3pm", "Temp9am", "Temp3pm"]].copy()

In [96]:
scaled_array = scaler.fit_transform(to_be_scaled)

In [97]:
scaled = pd.DataFrame(scaled_array, columns = ["MinTemp_s", "MaxTemp_s", "Rainfall_s", "WindGustSpeed_s", "WindSpeed9am_s", "WindSpeed3pm_s", "Humidity9am_s", "Humidity3pm_s", "Pressure9am_s", "Pressure3pm_s", "Temp9am_s", "Temp3pm_s"])

In [98]:
scaled.head()

Unnamed: 0,MinTemp_s,MaxTemp_s,Rainfall_s,WindGustSpeed_s,WindSpeed9am_s,WindSpeed3pm_s,Humidity9am_s,Humidity3pm_s,Pressure9am_s,Pressure3pm_s,Temp9am_s,Temp3pm_s
0,0.117567,-0.108221,-0.206661,0.241214,0.577742,0.524408,0.19014,-1.380413,-1.382962,-1.142455,-0.088435,-0.04787
1,-0.841802,0.206845,-0.276405,0.241214,-1.339742,0.29131,-1.237561,-1.235963,-0.970598,-1.041848,-0.041228,0.317768
2,0.03762,0.292772,-0.276405,0.391345,0.4579,0.757507,-1.554828,-0.995214,-1.397181,-0.912497,0.556724,0.156887
3,-0.553991,0.622159,-0.276405,-1.260094,-0.500842,-1.223831,-1.184683,-1.669313,0.024764,-0.323229,0.100392,0.639531
4,0.773137,1.237969,-0.160165,0.016018,-0.980214,0.058211,0.771796,-0.850764,-0.942159,-1.300551,0.053185,1.107548


In [99]:
weather_data = pd.concat([weather_data, scaled], axis = 1).reset_index(drop = True)

In [100]:
weather_data.shape

(112925, 118)

In [101]:
weather_data = weather_data.drop(["MinTemp", "MaxTemp", "Rainfall", "WindGustSpeed", "WindSpeed9am", "WindSpeed3pm", "Humidity9am", "Humidity3pm", "Pressure9am", "Pressure3pm", "Temp9am", "Temp3pm"], axis = 1)

In [102]:
weather_data.shape

(112925, 106)

In [103]:
X = weather_data.drop(["RainTomorrow"], axis = 1)
y = weather_data["RainTomorrow"]

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [105]:
print("My shapes are : ", "\n"
      "X_train : ", X_train.shape, "\n"
      "X_test : ", X_test.shape, "\n"
      "y_train : ", y_train.shape, "\n"
      "y_test : ", y_test.shape)

My shapes are :  
X_train :  (90340, 105) 
X_test :  (22585, 105) 
y_train :  (90340,) 
y_test :  (22585,)


# Time for my deep learning model!

In [106]:
#Defining a function for my perceptron
def perceptron(n_params):
    #Model Instanciation
    model = Sequential()
    
    #Layer arc

    model.add(Dense(1, input_dim = n_params, activation = 'sigmoid'))

    # return the created model
    return model 

In [107]:
my_perp = perceptron(X_train.shape[-1])
my_perp.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
fitted_perp = my_perp.fit(X_train, y_train, epochs = 10, validation_data = (X_test, y_test), batch_size = 18)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [110]:
def mlp(n_params):
  model = Sequential()

  #Couches cachees
  model.add(Dense(4, input_dim = n_params, activation = 'tanh'))
  model.add(Dense(3, activation = 'relu'), kernel_regularizer=tf.keras.regularizers.l2(0.01))
  model.add(Dense(4, activation = 'relu'))

  #Couche de sortie
  model.add(Dense(1, activation = 'sigmoid'))

  return model

In [111]:
my_mlp = perceptron(X_train.shape[-1])
my_mlp.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
fitted_mult_perp = my_mlp.fit(X_train, y_train, epochs = 10, validation_data = (X_test, y_test), batch_size = 18)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [115]:
my_perp.predict(X_test[0:1])

array([[0.01089031]], dtype=float32)

In [117]:
y_test[0:1]

83764    0
Name: RainTomorrow, dtype: int64

In [116]:
my_mlp.predict(X_test[0:1])

array([[0.01126412]], dtype=float32)

In [118]:
y_test[0:1]

83764    0
Name: RainTomorrow, dtype: int64