## Import des librairies

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, accuracy_score, auc, roc_curve, confusion_matrix
from xgboost import XGBClassifier

In [2]:
data=pd.read_csv('weatherAUS.csv')
#On supprime du Dataframe les lignes où RainTomorrow est nul
liste=data.index[data['RainTomorrow'].isnull()]
data.drop(liste , inplace=True)

In [3]:
data.head(5)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [4]:
#On convertit le type de la variable Date
data['Date']=data['Date'].astype('datetime64')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142193 entries, 0 to 145458
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Date           142193 non-null  datetime64[ns]
 1   Location       142193 non-null  object        
 2   MinTemp        141556 non-null  float64       
 3   MaxTemp        141871 non-null  float64       
 4   Rainfall       140787 non-null  float64       
 5   Evaporation    81350 non-null   float64       
 6   Sunshine       74377 non-null   float64       
 7   WindGustDir    132863 non-null  object        
 8   WindGustSpeed  132923 non-null  float64       
 9   WindDir9am     132180 non-null  object        
 10  WindDir3pm     138415 non-null  object        
 11  WindSpeed9am   140845 non-null  float64       
 12  WindSpeed3pm   139563 non-null  float64       
 13  Humidity9am    140419 non-null  float64       
 14  Humidity3pm    138583 non-null  float64       
 15  

In [6]:
la = LabelEncoder()
data['RainToday'] = la.fit_transform(data['RainToday'])
data['RainTomorrow'] = la.fit_transform(data['RainTomorrow'])

# Data cleaning

In [7]:
data = pd.get_dummies(data)

In [8]:
data.fillna(data.mean(), inplace=True)

  data.fillna(data.mean(), inplace=True)


# Selection des features et entraînement du modèle

In [9]:
cor = data.corr()
cor2 = data.corr()
cor = np.where(cor <0,cor*-1 , cor)
cor=pd.DataFrame(cor)
cor.columns=cor2.columns
cor.index =cor2.index 

In [10]:
cor

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
MinTemp,1.000000,0.733919,0.103314,0.353698,0.051907,0.173331,0.174946,0.174187,0.232372,0.005913,...,0.070060,0.063042,0.008691,0.075290,0.023394,0.036248,0.044606,0.077095,0.046426,0.059027
MaxTemp,0.733919,1.000000,0.074202,0.442214,0.331505,0.066329,0.014586,0.050381,0.499777,0.499725,...,0.008435,0.008305,0.056961,0.034013,0.013940,0.061899,0.054166,0.068534,0.026008,0.060974
Rainfall,0.103314,0.074202,1.000000,0.038186,0.173011,0.127250,0.085977,0.056762,0.221392,0.249609,...,0.017905,0.005457,0.019474,0.004469,0.013165,0.026983,0.008769,0.015170,0.008546,0.013863
Evaporation,0.353698,0.442214,0.038186,1.000000,0.290375,0.147371,0.140669,0.094134,0.377594,0.289915,...,0.028171,0.019473,0.010434,0.024236,0.016020,0.006374,0.003466,0.024829,0.024452,0.009847
Sunshine,0.051907,0.331505,0.173011,0.290375,1.000000,0.022479,0.005685,0.039471,0.351980,0.446849,...,0.017130,0.009510,0.030265,0.014485,0.012853,0.017688,0.019227,0.013988,0.011490,0.007388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WindDir3pm_SSW,0.036248,0.061899,0.026983,0.006374,0.017688,0.003485,0.012755,0.005263,0.027415,0.028423,...,0.058593,0.061483,0.065735,0.069566,0.064044,1.000000,0.064194,0.066877,0.062205,0.064741
WindDir3pm_SW,0.044606,0.054166,0.008769,0.003466,0.019227,0.006677,0.002684,0.007959,0.006323,0.007170,...,0.063009,0.066116,0.070689,0.074809,0.068871,0.064194,1.000000,0.071917,0.066893,0.069621
WindDir3pm_W,0.077095,0.068534,0.015170,0.024829,0.013988,0.089157,0.028580,0.060929,0.000414,0.020193,...,0.065643,0.068880,0.073644,0.077936,0.071750,0.066877,0.071917,1.000000,0.069689,0.072531
WindDir3pm_WNW,0.046426,0.026008,0.008546,0.024452,0.011490,0.084918,0.014555,0.054412,0.001658,0.035298,...,0.061057,0.064068,0.068499,0.072491,0.066737,0.062205,0.066893,0.069689,1.000000,0.067464


In [11]:
related = cor['RainTomorrow'].sort_values(ascending = False).head(30)
x = []
for i in range (len(related)):
    x.append(related.index[i])
x

['RainTomorrow',
 'Humidity3pm',
 'Sunshine',
 'RainToday',
 'Cloud3pm',
 'Humidity9am',
 'Cloud9am',
 'Rainfall',
 'Pressure9am',
 'WindGustSpeed',
 'Pressure3pm',
 'Temp3pm',
 'MaxTemp',
 'WindSpeed9am',
 'Evaporation',
 'WindSpeed3pm',
 'MinTemp',
 'WindDir9am_N',
 'Location_Woomera',
 'Location_AliceSprings',
 'Location_Portland',
 'WindDir9am_NNW',
 'WindDir9am_E',
 'WindGustDir_E',
 'WindDir9am_ESE',
 'Location_Mildura',
 'WindDir9am_SE',
 'Location_Walpole',
 'Location_Uluru',
 'WindGustDir_ENE']

In [12]:
x = data[x]
y = data['RainTomorrow']
x.drop('RainTomorrow', inplace = True, axis = 1)
x

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Humidity3pm,Sunshine,RainToday,Cloud3pm,Humidity9am,Cloud9am,Rainfall,Pressure9am,WindGustSpeed,Pressure3pm,...,Location_Portland,WindDir9am_NNW,WindDir9am_E,WindGustDir_E,WindDir9am_ESE,Location_Mildura,WindDir9am_SE,Location_Walpole,Location_Uluru,WindGustDir_ENE
0,22.0,7.624853,0,4.503167,71.0,8.000000,0.6,1007.7,44.0,1007.1,...,0,0,0,0,0,0,0,0,0,0
1,25.0,7.624853,0,4.503167,44.0,4.437189,0.0,1010.6,44.0,1007.8,...,0,1,0,0,0,0,0,0,0,0
2,30.0,7.624853,0,2.000000,38.0,4.437189,0.0,1007.6,46.0,1008.7,...,0,0,0,0,0,0,0,0,0,0
3,16.0,7.624853,0,4.503167,45.0,4.437189,0.0,1017.6,24.0,1012.8,...,0,0,0,0,0,0,1,0,0,0
4,33.0,7.624853,0,8.000000,82.0,7.000000,1.0,1010.8,41.0,1006.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145454,27.0,7.624853,0,4.503167,59.0,4.437189,0.0,1024.7,31.0,1021.2,...,0,0,0,1,1,0,0,0,1,0
145455,24.0,7.624853,0,4.503167,51.0,4.437189,0.0,1024.6,31.0,1020.3,...,0,0,0,1,0,0,1,0,1,0
145456,21.0,7.624853,0,4.503167,56.0,4.437189,0.0,1023.5,22.0,1019.1,...,0,0,0,0,0,0,1,0,1,0
145457,24.0,7.624853,0,4.503167,53.0,4.437189,0.0,1021.0,37.0,1016.8,...,0,0,0,0,0,0,1,0,1,0


In [13]:
y.value_counts()

0    110316
1     31877
Name: RainTomorrow, dtype: int64

In [14]:
bal = SMOTE()
x, y = bal.fit_resample(x, y)
y.value_counts()

0    110316
1    110316
Name: RainTomorrow, dtype: int64

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

In [16]:
rf = RandomForestClassifier(n_estimators = 200)
rf.fit(x_train, y_train)

RandomForestClassifier(n_estimators=200)

# Evaluation du modèle

In [17]:
predicted = rf.predict(x_test)
print("La précision est de {} %".format(accuracy_score(predicted, y_test)*100))

La précision est de 90.25766537494052 %


In [18]:
pd.crosstab(predicted, y_test)

RainTomorrow,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,19955,2182
1,2117,19873
