In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
weather = pd.read_csv(r'weather.csv', index_col=0) # index_col=0: use the first CSV column as the row index

In [3]:
# Preview the first rows of the raw table
weather.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,26.0,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,9.0,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No


In [6]:
# Separate the feature matrix (every column except the last) from the label (the last column)
X, Y = weather.iloc[:, :-1], weather.iloc[:, -1]

In [7]:
# Dimensions of the feature matrix as (rows, columns)
X.shape

(142193, 21)

In [8]:
# Length of the label vector; it should match the row count of X
Y.shape

(142193,)

In [9]:
# Per-column dtype and non-null count for the feature matrix
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142193 entries, 0 to 142192
Data columns (total 21 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           142193 non-null  object 
 1   Location       142193 non-null  object 
 2   MinTemp        141556 non-null  float64
 3   MaxTemp        141871 non-null  float64
 4   Rainfall       140787 non-null  float64
 5   Evaporation    81350 non-null   float64
 6   Sunshine       74377 non-null   float64
 7   WindGustDir    132863 non-null  object 
 8   WindGustSpeed  132923 non-null  float64
 9   WindDir9am     132180 non-null  object 
 10  WindDir3pm     138415 non-null  object 
 11  WindSpeed9am   140845 non-null  float64
 12  WindSpeed3pm   139563 non-null  float64
 13  Humidity9am    140419 non-null  float64
 14  Humidity3pm    138583 non-null  float64
 15  Pressure9am    128179 non-null  float64
 16  Pressure3pm    128212 non-null  float64
 17  Cloud9am       88536 non-null

In [21]:
# Fraction of missing values in each feature column
X.isna().mean()

Date             0.000000
Location         0.000000
MinTemp          0.004480
MaxTemp          0.002265
Rainfall         0.009888
Evaporation      0.427890
Sunshine         0.476929
WindGustDir      0.065615
WindGustSpeed    0.065193
WindDir9am       0.070418
WindDir3pm       0.026570
WindSpeed9am     0.009480
WindSpeed3pm     0.018496
Humidity9am      0.012476
Humidity3pm      0.025388
Pressure9am      0.098556
Pressure3pm      0.098324
Cloud9am         0.377353
Cloud3pm         0.401525
Temp9am          0.006358
Temp3pm          0.019171
dtype: float64

In [22]:
# Fraction of missing values in the label column
Y.isna().mean()

0.0

In [11]:
np.unique(Y) # inspect the distinct classes present in the label column

array(['No', 'Yes'], dtype=object)

In [12]:
# Split into training and test sets: 30% of the rows are held out, with a fixed seed
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=420,
)
# The split pieces keep their original row labels, so renumber each one from 0
for subset in (Xtrain, Xtest, Ytrain, Ytest):
    subset.index = range(len(subset))

In [16]:
# Explore class imbalance: number of samples per class in the training labels
Ytrain.value_counts()

No     77238
Yes    22297
Name: RainTomorrow, dtype: int64

In [17]:
# Number of samples per class in the test labels
Ytest.value_counts()

No     33078
Yes     9580
Name: RainTomorrow, dtype: int64

In [18]:
# Encode the string labels as integers.
# LabelEncoder is imported in the notebook's top import cell rather than here,
# so all dependencies are declared in one place.
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both sets share the same class-to-integer mapping
# (classes are sorted, so 'No' -> 0 and 'Yes' -> 1).
encorder = LabelEncoder().fit(Ytrain)
# Wrap the encoded arrays in single-column DataFrames
Ytrain = pd.DataFrame(encorder.transform(Ytrain))
Ytest = pd.DataFrame(encorder.transform(Ytest))

In [19]:
# Distinct values present in the training labels
np.unique(Ytrain)

array([0, 1])

In [20]:
# Distinct values present in the test labels
np.unique(Ytest)

array([0, 1])

In [25]:
# Descriptive statistics with extra tail percentiles, used to look for outliers
Xtrain.describe(
    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
).transpose()

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,99%,max
MinTemp,99092.0,12.179886,6.404463,-8.5,-1.809,1.8,3.9,7.6,12.0,16.8,20.8,25.8,31.9
MaxTemp,99292.0,23.212779,7.110535,-4.1,9.1,12.8,14.5,17.9,22.6,28.2,32.9,40.1,48.1
Rainfall,98535.0,2.348739,8.438048,0.0,0.0,0.0,0.0,0.0,0.0,0.6,6.0,37.4,367.6
Evaporation,56898.0,5.455923,4.164123,0.0,0.4,1.0,1.4,2.6,4.8,7.4,10.2,18.206,86.2
Sunshine,51968.0,7.609452,3.782961,0.0,0.0,0.3,1.5,4.8,8.4,10.6,12.0,13.4,14.3
WindGustSpeed,93045.0,39.988285,13.624007,6.0,15.0,20.0,24.0,31.0,39.0,48.0,57.0,81.0,135.0
WindSpeed9am,98591.0,14.006319,8.922327,0.0,0.0,0.0,4.0,7.0,13.0,19.0,26.0,39.0,130.0
WindSpeed3pm,97698.0,18.633841,8.814635,0.0,2.0,6.0,9.0,13.0,19.0,24.0,30.0,43.0,83.0
Humidity9am,98284.0,68.845265,19.07418,0.0,17.0,34.0,44.0,57.0,70.0,83.0,94.0,100.0,100.0
Humidity3pm,96953.0,51.497798,20.776461,0.0,9.0,17.0,23.0,37.0,52.0,66.0,79.0,98.0,100.0


In [26]:
# Same descriptive statistics for the test features, for comparison with the training set
Xtest.describe(
    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
).transpose()

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,99%,max
MinTemp,42464.0,12.201599,6.400577,-7.6,-1.8,1.8,4.0,7.6,12.0,16.9,20.8,25.8,33.9
MaxTemp,42579.0,23.259442,7.134084,-4.8,9.0,12.8,14.5,18.0,22.7,28.2,33.0,40.3,47.0
Rainfall,42252.0,2.352854,8.528195,0.0,0.0,0.0,0.0,0.0,0.0,0.8,6.0,37.498,371.0
Evaporation,24452.0,5.502172,4.24471,0.0,0.4,1.0,1.4,2.6,4.8,7.4,10.2,18.6,145.0
Sunshine,22409.0,7.660569,3.778036,0.0,0.0,0.4,1.6,4.9,8.5,10.7,12.1,13.4,14.5
WindGustSpeed,39878.0,39.974974,13.506464,7.0,15.0,20.0,24.0,31.0,39.0,48.0,57.0,80.0,135.0
WindSpeed9am,42254.0,13.991882,8.825421,0.0,0.0,0.0,4.0,7.0,13.0,19.0,26.0,39.0,87.0
WindSpeed3pm,41865.0,18.646292,8.777042,0.0,2.0,6.0,9.0,13.0,19.0,24.0,30.0,43.0,87.0
Humidity9am,42135.0,68.840418,18.998023,3.0,18.0,34.0,44.0,57.0,70.0,83.0,94.0,100.0,100.0
Humidity3pm,41630.0,51.447226,20.847527,1.0,9.0,16.0,23.0,36.0,52.0,66.0,79.0,97.0,100.0


In [34]:
# Dimensions of the training feature matrix as (rows, columns)
Xtrain.shape

(99535, 21)

In [28]:
# Dimensions of the training labels; the row count should match Xtrain
Ytrain.shape

(99535, 1)

In [35]:
# Handle the date feature: work on a copy of the training features
Xtrainc = Xtrain.copy()

In [37]:
# Display the rows ordered by Location; the result is not assigned, so Xtrainc itself is unchanged
Xtrainc.sort_values(by='Location')

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
97360,2011-03-31,Adelaide,10.8,20.3,0.0,3.6,10.7,E,28.0,,...,0.0,9.0,64.0,38.0,1029.7,1026.2,,,12.8,19.8
25605,2012-11-07,Adelaide,12.8,23.3,0.2,5.0,,WNW,44.0,NW,...,9.0,20.0,64.0,38.0,1012.5,1011.2,,,16.7,22.3
31948,2015-04-03,Adelaide,12.1,23.0,0.0,,,ESE,52.0,E,...,15.0,9.0,46.0,24.0,1024.2,1020.5,,,13.1,22.6
11800,2012-10-15,Adelaide,16.2,27.6,0.0,15.2,,N,37.0,N,...,9.0,15.0,60.0,24.0,1018.3,1016.7,,,18.6,26.7
92946,2013-10-14,Adelaide,9.3,17.4,2.0,13.8,7.0,W,37.0,SSW,...,7.0,19.0,74.0,39.0,1026.1,1025.4,,,11.8,16.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92369,2015-06-06,Woomera,6.6,20.2,0.0,6.6,9.0,NNE,31.0,NE,...,11.0,17.0,65.0,39.0,1030.1,1026.3,0.0,0.0,12.7,19.4
90023,2013-12-24,Woomera,16.1,28.0,0.0,15.6,5.1,SSE,44.0,SE,...,20.0,13.0,48.0,29.0,1018.0,1015.2,7.0,7.0,18.6,26.4
83230,2016-03-31,Woomera,12.6,27.2,0.0,9.4,10.4,SW,28.0,ESE,...,15.0,9.0,61.0,21.0,1021.3,1017.8,0.0,,15.7,25.3
4512,2011-09-09,Woomera,6.7,19.3,0.2,6.8,10.7,S,56.0,SSW,...,33.0,35.0,68.0,29.0,1028.2,1026.0,0.0,3.0,11.2,18.3


In [39]:
# Preview the first rows of the working copy
Xtrainc.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
0,2015-05-08,Uluru,8.9,23.2,0.0,,,E,46.0,E,...,26.0,19.0,52.0,27.0,1025.0,1020.7,,,14.4,21.8
1,2016-10-16,Walpole,8.8,15.0,4.4,,,WNW,35.0,W,...,15.0,19.0,66.0,71.0,1024.5,1023.4,,,13.5,13.8
2,2013-12-06,Cobar,8.1,23.6,0.0,10.0,,WSW,48.0,S,...,17.0,19.0,35.0,15.0,1017.4,1016.3,0.0,0.0,14.4,21.9
3,2011-01-24,SalmonGums,15.7,26.7,3.0,,,S,52.0,ENE,...,19.0,33.0,77.0,63.0,,,,,19.4,22.5
4,2013-03-28,Mildura,15.3,23.9,10.4,9.6,6.1,SW,41.0,W,...,6.0,22.0,93.0,40.0,1019.5,1019.5,8.0,7.0,16.2,21.2
