In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
weather = pd.read_csv(r'weather.csv', index_col=0) # index_col=0:第一列为索引值

In [3]:
weather.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,26.0,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,9.0,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No


In [4]:
# 将特征矩阵和标签分开
X = weather.iloc[:, :-1]
Y = weather.iloc[:, -1]

In [5]:
X.shape

(142193, 21)

In [6]:
Y.shape

(142193,)

In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142193 entries, 0 to 142192
Data columns (total 21 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           142193 non-null  object 
 1   Location       142193 non-null  object 
 2   MinTemp        141556 non-null  float64
 3   MaxTemp        141871 non-null  float64
 4   Rainfall       140787 non-null  float64
 5   Evaporation    81350 non-null   float64
 6   Sunshine       74377 non-null   float64
 7   WindGustDir    132863 non-null  object 
 8   WindGustSpeed  132923 non-null  float64
 9   WindDir9am     132180 non-null  object 
 10  WindDir3pm     138415 non-null  object 
 11  WindSpeed9am   140845 non-null  float64
 12  WindSpeed3pm   139563 non-null  float64
 13  Humidity9am    140419 non-null  float64
 14  Humidity3pm    138583 non-null  float64
 15  Pressure9am    128179 non-null  float64
 16  Pressure3pm    128212 non-null  float64
 17  Cloud9am       88536 non-null

In [8]:
X.isnull().mean() # 缺失值

Date             0.000000
Location         0.000000
MinTemp          0.004480
MaxTemp          0.002265
Rainfall         0.009888
Evaporation      0.427890
Sunshine         0.476929
WindGustDir      0.065615
WindGustSpeed    0.065193
WindDir9am       0.070418
WindDir3pm       0.026570
WindSpeed9am     0.009480
WindSpeed3pm     0.018496
Humidity9am      0.012476
Humidity3pm      0.025388
Pressure9am      0.098556
Pressure3pm      0.098324
Cloud9am         0.377353
Cloud3pm         0.401525
Temp9am          0.006358
Temp3pm          0.019171
dtype: float64

In [9]:
Y.isnull().mean()

0.0

In [10]:
np.unique(Y) # 探索标签的分类

array(['No', 'Yes'], dtype=object)

In [11]:
# 将数据分为训练集和测试集
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=420)
for i in [Xtrain, Xtest, Ytrain, Ytest]: # 恢复索引
    i.index = range(i.shape[0])

In [12]:
# 样本不平衡问题探索
Ytrain.value_counts()

No     77238
Yes    22297
Name: RainTomorrow, dtype: int64

In [13]:
Ytest.value_counts()

No     33078
Yes     9580
Name: RainTomorrow, dtype: int64

In [14]:
# 将标签进行编码
from sklearn.preprocessing import LabelEncoder
encorder = LabelEncoder().fit(Ytrain) 
Ytrain = pd.DataFrame(encorder.transform(Ytrain))
Ytest = pd.DataFrame(encorder.transform(Ytest))

In [15]:
np.unique(Ytrain)

array([0, 1])

In [16]:
np.unique(Ytest)

array([0, 1])

In [17]:
# 描述性统计与异常值
Xtrain.describe([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,99%,max
MinTemp,99092.0,12.179886,6.404463,-8.5,-1.809,1.8,3.9,7.6,12.0,16.8,20.8,25.8,31.9
MaxTemp,99292.0,23.212779,7.110535,-4.1,9.1,12.8,14.5,17.9,22.6,28.2,32.9,40.1,48.1
Rainfall,98535.0,2.348739,8.438048,0.0,0.0,0.0,0.0,0.0,0.0,0.6,6.0,37.4,367.6
Evaporation,56898.0,5.455923,4.164123,0.0,0.4,1.0,1.4,2.6,4.8,7.4,10.2,18.206,86.2
Sunshine,51968.0,7.609452,3.782961,0.0,0.0,0.3,1.5,4.8,8.4,10.6,12.0,13.4,14.3
WindGustSpeed,93045.0,39.988285,13.624007,6.0,15.0,20.0,24.0,31.0,39.0,48.0,57.0,81.0,135.0
WindSpeed9am,98591.0,14.006319,8.922327,0.0,0.0,0.0,4.0,7.0,13.0,19.0,26.0,39.0,130.0
WindSpeed3pm,97698.0,18.633841,8.814635,0.0,2.0,6.0,9.0,13.0,19.0,24.0,30.0,43.0,83.0
Humidity9am,98284.0,68.845265,19.07418,0.0,17.0,34.0,44.0,57.0,70.0,83.0,94.0,100.0,100.0
Humidity3pm,96953.0,51.497798,20.776461,0.0,9.0,17.0,23.0,37.0,52.0,66.0,79.0,98.0,100.0


In [18]:
Xtest.describe([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,99%,max
MinTemp,42464.0,12.201599,6.400577,-7.6,-1.8,1.8,4.0,7.6,12.0,16.9,20.8,25.8,33.9
MaxTemp,42579.0,23.259442,7.134084,-4.8,9.0,12.8,14.5,18.0,22.7,28.2,33.0,40.3,47.0
Rainfall,42252.0,2.352854,8.528195,0.0,0.0,0.0,0.0,0.0,0.0,0.8,6.0,37.498,371.0
Evaporation,24452.0,5.502172,4.24471,0.0,0.4,1.0,1.4,2.6,4.8,7.4,10.2,18.6,145.0
Sunshine,22409.0,7.660569,3.778036,0.0,0.0,0.4,1.6,4.9,8.5,10.7,12.1,13.4,14.5
WindGustSpeed,39878.0,39.974974,13.506464,7.0,15.0,20.0,24.0,31.0,39.0,48.0,57.0,80.0,135.0
WindSpeed9am,42254.0,13.991882,8.825421,0.0,0.0,0.0,4.0,7.0,13.0,19.0,26.0,39.0,87.0
WindSpeed3pm,41865.0,18.646292,8.777042,0.0,2.0,6.0,9.0,13.0,19.0,24.0,30.0,43.0,87.0
Humidity9am,42135.0,68.840418,18.998023,3.0,18.0,34.0,44.0,57.0,70.0,83.0,94.0,100.0,100.0
Humidity3pm,41630.0,51.447226,20.847527,1.0,9.0,16.0,23.0,36.0,52.0,66.0,79.0,97.0,100.0


In [19]:
Xtrain.shape

(99535, 21)

In [20]:
Ytrain.shape

(99535, 1)

In [21]:
Xtrain.loc[Xtrain.loc[:, 'Cloud9am'] == 9, 'Cloud9am']

71737    9.0
Name: Cloud9am, dtype: float64

In [22]:
Xtest.loc[Xtrain.loc[:, 'Cloud9am'] == 9, 'Cloud9am']

Series([], Name: Cloud9am, dtype: float64)

In [23]:
Xtrain.drop(index = 71737)
Ytrain.drop(index = 71737)

Unnamed: 0,0
0,0
1,1
2,0
3,1
4,0
...,...
99530,0
99531,1
99532,0
99533,0


In [24]:
Xtest = Xtest.drop(index = [19646, 29632])
Ytest = Ytest.drop(index = [19646, 29632])

In [25]:
for i in [Xtrain, Xtest, Ytrain, Ytest]:
    i.index = range(i.shape[0])

In [26]:
# 处理日期特征
Xtrainc = Xtrain.copy()

In [27]:
Xtrainc.sort_values(by='Location')

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
97360,2011-03-31,Adelaide,10.8,20.3,0.0,3.6,10.7,E,28.0,,...,0.0,9.0,64.0,38.0,1029.7,1026.2,,,12.8,19.8
25605,2012-11-07,Adelaide,12.8,23.3,0.2,5.0,,WNW,44.0,NW,...,9.0,20.0,64.0,38.0,1012.5,1011.2,,,16.7,22.3
31948,2015-04-03,Adelaide,12.1,23.0,0.0,,,ESE,52.0,E,...,15.0,9.0,46.0,24.0,1024.2,1020.5,,,13.1,22.6
11800,2012-10-15,Adelaide,16.2,27.6,0.0,15.2,,N,37.0,N,...,9.0,15.0,60.0,24.0,1018.3,1016.7,,,18.6,26.7
92946,2013-10-14,Adelaide,9.3,17.4,2.0,13.8,7.0,W,37.0,SSW,...,7.0,19.0,74.0,39.0,1026.1,1025.4,,,11.8,16.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92369,2015-06-06,Woomera,6.6,20.2,0.0,6.6,9.0,NNE,31.0,NE,...,11.0,17.0,65.0,39.0,1030.1,1026.3,0.0,0.0,12.7,19.4
90023,2013-12-24,Woomera,16.1,28.0,0.0,15.6,5.1,SSE,44.0,SE,...,20.0,13.0,48.0,29.0,1018.0,1015.2,7.0,7.0,18.6,26.4
83230,2016-03-31,Woomera,12.6,27.2,0.0,9.4,10.4,SW,28.0,ESE,...,15.0,9.0,61.0,21.0,1021.3,1017.8,0.0,,15.7,25.3
4512,2011-09-09,Woomera,6.7,19.3,0.2,6.8,10.7,S,56.0,SSW,...,33.0,35.0,68.0,29.0,1028.2,1026.0,0.0,3.0,11.2,18.3


In [28]:
Xtrainc.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
0,2015-05-08,Uluru,8.9,23.2,0.0,,,E,46.0,E,...,26.0,19.0,52.0,27.0,1025.0,1020.7,,,14.4,21.8
1,2016-10-16,Walpole,8.8,15.0,4.4,,,WNW,35.0,W,...,15.0,19.0,66.0,71.0,1024.5,1023.4,,,13.5,13.8
2,2013-12-06,Cobar,8.1,23.6,0.0,10.0,,WSW,48.0,S,...,17.0,19.0,35.0,15.0,1017.4,1016.3,0.0,0.0,14.4,21.9
3,2011-01-24,SalmonGums,15.7,26.7,3.0,,,S,52.0,ENE,...,19.0,33.0,77.0,63.0,,,,,19.4,22.5
4,2013-03-28,Mildura,15.3,23.9,10.4,9.6,6.1,SW,41.0,W,...,6.0,22.0,93.0,40.0,1019.5,1019.5,8.0,7.0,16.2,21.2


In [29]:
Xtrain['Rainfall'].head(20)

0      0.0
1      4.4
2      0.0
3      3.0
4     10.4
5      0.0
6      0.0
7      0.0
8      0.0
9      1.4
10     4.4
11     0.6
12     NaN
13     NaN
14     0.0
15    11.2
16     3.0
17     0.4
18     0.0
19     1.6
Name: Rainfall, dtype: float64

In [30]:
# 将日期特征转化为'今天是否下雨'和'月份'两个特征
# 今天是否下雨的特征
Xtrain.loc[Xtrain['Rainfall'] >= 1, 'RainToday'] = 'Yes'
Xtrain.loc[Xtrain['Rainfall'] < 1, 'RainToday'] = 'No'
Xtrain.loc[Xtrain['Rainfall'] == np.nan, 'RainToday'] = np.nan
Xtest.loc[Xtest['Rainfall'] >= 1, 'RainToday'] = 'Yes'
Xtest.loc[Xtest['Rainfall'] < 1, 'RainToday'] = 'No'
Xtest.loc[Xtest['Rainfall'] == np.nan, 'RainToday'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [31]:
Xtrain.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,2015-05-08,Uluru,8.9,23.2,0.0,,,E,46.0,E,...,19.0,52.0,27.0,1025.0,1020.7,,,14.4,21.8,No
1,2016-10-16,Walpole,8.8,15.0,4.4,,,WNW,35.0,W,...,19.0,66.0,71.0,1024.5,1023.4,,,13.5,13.8,Yes
2,2013-12-06,Cobar,8.1,23.6,0.0,10.0,,WSW,48.0,S,...,19.0,35.0,15.0,1017.4,1016.3,0.0,0.0,14.4,21.9,No
3,2011-01-24,SalmonGums,15.7,26.7,3.0,,,S,52.0,ENE,...,33.0,77.0,63.0,,,,,19.4,22.5,Yes
4,2013-03-28,Mildura,15.3,23.9,10.4,9.6,6.1,SW,41.0,W,...,22.0,93.0,40.0,1019.5,1019.5,8.0,7.0,16.2,21.2,Yes


In [32]:
Xtest.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,2012-11-01,Williamtown,11.8,36.1,0.0,7.6,,S,54.0,NW,...,24.0,49.0,12.0,1010.0,1002.9,4.0,7.0,23.1,34.5,No
1,2016-08-06,Dartmoor,5.2,15.0,0.0,,,NNW,17.0,,...,4.0,94.0,56.0,1028.2,1025.0,,,6.6,13.7,No
2,2010-11-10,Penrith,17.6,29.5,0.2,,,SW,59.0,N,...,13.0,90.0,51.0,,,,,21.1,28.1,No
3,2015-08-01,Wollongong,10.8,20.7,0.0,,,NW,67.0,NW,...,17.0,49.0,37.0,1016.8,1014.2,8.0,5.0,15.3,20.5,No
4,2014-05-28,Mildura,9.6,18.8,4.2,3.6,7.2,W,31.0,NW,...,11.0,91.0,62.0,1015.9,1016.5,2.0,7.0,12.2,17.8,Yes


In [33]:
# 月份特征
Xtrain.loc[0, 'Date'].split('-')[1]

'05'

In [34]:
Xtrain['Date'] = Xtrain['Date'].apply(lambda x : int(x.split('-')[1]))
Xtrain = Xtrain.rename(columns={'Date':'Month'})
Xtest['Date'] = Xtest['Date'].apply(lambda x : int(x.split('-')[1]))
Xtest = Xtest.rename(columns={'Date':'Month'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Xtrain['Date'] = Xtrain['Date'].apply(lambda x : int(x.split('-')[1]))


In [35]:
Xtrain.head()

Unnamed: 0,Month,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,5,Uluru,8.9,23.2,0.0,,,E,46.0,E,...,19.0,52.0,27.0,1025.0,1020.7,,,14.4,21.8,No
1,10,Walpole,8.8,15.0,4.4,,,WNW,35.0,W,...,19.0,66.0,71.0,1024.5,1023.4,,,13.5,13.8,Yes
2,12,Cobar,8.1,23.6,0.0,10.0,,WSW,48.0,S,...,19.0,35.0,15.0,1017.4,1016.3,0.0,0.0,14.4,21.9,No
3,1,SalmonGums,15.7,26.7,3.0,,,S,52.0,ENE,...,33.0,77.0,63.0,,,,,19.4,22.5,Yes
4,3,Mildura,15.3,23.9,10.4,9.6,6.1,SW,41.0,W,...,22.0,93.0,40.0,1019.5,1019.5,8.0,7.0,16.2,21.2,Yes


In [36]:
Xtest.head()

Unnamed: 0,Month,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,11,Williamtown,11.8,36.1,0.0,7.6,,S,54.0,NW,...,24.0,49.0,12.0,1010.0,1002.9,4.0,7.0,23.1,34.5,No
1,8,Dartmoor,5.2,15.0,0.0,,,NNW,17.0,,...,4.0,94.0,56.0,1028.2,1025.0,,,6.6,13.7,No
2,11,Penrith,17.6,29.5,0.2,,,SW,59.0,N,...,13.0,90.0,51.0,,,,,21.1,28.1,No
3,8,Wollongong,10.8,20.7,0.0,,,NW,67.0,NW,...,17.0,49.0,37.0,1016.8,1014.2,8.0,5.0,15.3,20.5,No
4,5,Mildura,9.6,18.8,4.2,3.6,7.2,W,31.0,NW,...,11.0,91.0,62.0,1015.9,1016.5,2.0,7.0,12.2,17.8,Yes


In [37]:
# 处理地点特征
# 依据城市的经纬度来进行划分城市
cityll = pd.read_csv(r'cityll.csv', index_col=0)
city_climate = pd.read_csv(r'Cityclimate.csv')

In [38]:
cityll.head()

Unnamed: 0,City,Latitude,Longitude,Latitudedir,Longitudedir
0,Adelaide,34.9285°,138.6007°,"S,",E
1,Albany,35.0275°,117.8840°,"S,",E
2,Albury,36.0737°,146.9135°,"S,",E
3,Wodonga,36.1241°,146.8818°,"S,",E
4,AliceSprings,23.6980°,133.8807°,"S,",E


In [39]:
city_climate.head()

Unnamed: 0,City,Climate
0,Adelaide,Warm temperate
1,Albany,Mild temperate
2,Albury,"Hot dry summer, cool winter"
3,Wodonga,"Hot dry summer, cool winter"
4,AliceSprings,"Hot dry summer, warm winter"


In [40]:
# 将两张表合并一下
# 去掉度数符号
cityll['Latitudenum'] = cityll['Latitude'].apply(lambda x :float(x[:-1]))
cityll['Longitudenum'] = cityll['Longitude'].apply(lambda x :float(x[:-1]))
# 去掉经纬度
citylld = cityll.iloc[:, [0, 5, 6]]
# 将city_climate中的气候添加到citylld中
citylld['climate'] = city_climate.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  citylld['climate'] = city_climate.iloc[:, -1]


In [41]:
citylld.head()

Unnamed: 0,City,Latitudenum,Longitudenum,climate
0,Adelaide,34.9285,138.6007,Warm temperate
1,Albany,35.0275,117.884,Mild temperate
2,Albury,36.0737,146.9135,"Hot dry summer, cool winter"
3,Wodonga,36.1241,146.8818,"Hot dry summer, cool winter"
4,AliceSprings,23.698,133.8807,"Hot dry summer, warm winter"


In [42]:
# 获取到样本中所有的城市
samplecity = pd.read_csv(r'samplecity.csv', index_col=0)
samplecity['Latitudenum'] = samplecity['Latitude'].apply(lambda x:float(x[:-1]))
samplecity['Longitudenum'] = samplecity['Longitude'].apply(lambda x:float(x[:-1]))
samplecityd = samplecity.iloc[:, [0, 5, 6]]

In [43]:
samplecityd.head()

Unnamed: 0,City,Latitudenum,Longitudenum
0,Canberra,35.2809,149.13
1,Sydney,33.8688,151.2093
2,Perth,31.9505,115.8605
3,Darwin,12.4634,130.8456
4,Hobart,42.8821,147.3272


In [44]:
# 计算样本点中的城市与主要城市之间的距离
from math import radians, sin, cos, acos
# 将经纬度转换为弧度
citylld.loc[:, 'slat'] = citylld.iloc[:, 1].apply(lambda x: radians(x))
citylld.loc[:, 'slon'] = citylld.iloc[:, 2].apply(lambda x: radians(x))
samplecityd.loc[:, 'elat'] = samplecityd.iloc[:, 1].apply(lambda x: radians(x))
samplecityd.loc[:, 'elon'] = samplecityd.iloc[:, 2].apply(lambda x: radians(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [45]:
citylld.head()

Unnamed: 0,City,Latitudenum,Longitudenum,climate,slat,slon
0,Adelaide,34.9285,138.6007,Warm temperate,0.609617,2.419039
1,Albany,35.0275,117.884,Mild temperate,0.611345,2.057464
2,Albury,36.0737,146.9135,"Hot dry summer, cool winter",0.629605,2.564124
3,Wodonga,36.1241,146.8818,"Hot dry summer, cool winter",0.630484,2.563571
4,AliceSprings,23.698,133.8807,"Hot dry summer, warm winter",0.413608,2.336659


In [46]:
samplecityd.head()

Unnamed: 0,City,Latitudenum,Longitudenum,elat,elon
0,Canberra,35.2809,149.13,0.615768,2.60281
1,Sydney,33.8688,151.2093,0.591122,2.6391
2,Perth,31.9505,115.8605,0.557641,2.022147
3,Darwin,12.4634,130.8456,0.217527,2.283687
4,Hobart,42.8821,147.3272,0.748434,2.571345


In [47]:
import sys
for i in range(samplecityd.shape[0]):
    slat = citylld.loc[:, 'slat']
    slon = citylld.loc[:, 'slon']
    elat = samplecityd.loc[i, 'elat']
    elon = samplecityd.loc[i, 'elon']
    dist = 6371.01 * np.arccos(np.sin(slat) * np.sin(elat) + np.cos(slat) * np.cos(elat) * np.cos(slon.values - elon))
    city_index = np.argsort(dist)[0] # np.arsort：从小到大排序，返回索引
    samplecityd.loc[i, 'closest_city'] = citylld.loc[city_index, 'City']
    samplecityd.loc[i, 'climate'] = citylld.loc[city_index, 'climate']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [48]:
samplecityd.head()

Unnamed: 0,City,Latitudenum,Longitudenum,elat,elon,closest_city,climate
0,Canberra,35.2809,149.13,0.615768,2.60281,Canberra,Cool temperate
1,Sydney,33.8688,151.2093,0.591122,2.6391,Sydney,Warm temperate
2,Perth,31.9505,115.8605,0.557641,2.022147,Perth,Warm temperate
3,Darwin,12.4634,130.8456,0.217527,2.283687,Darwin,"High humidity summer, warm winter"
4,Hobart,42.8821,147.3272,0.748434,2.571345,Hobart,Cool temperate


In [49]:
samplecityd['climate'].value_counts()

Warm temperate                       15
Mild temperate                       10
Cool temperate                        9
Hot dry summer, cool winter           6
High humidity summer, warm winter     4
Hot dry summer, warm winter           3
Warm humid summer, mild winter        2
Name: climate, dtype: int64

In [50]:
# 取出样本城市所对应的气候并保存
locafinal = samplecityd.iloc[:, [0, -1]]
locafinal.columns = ['Location', 'Climate']
locafinal = locafinal.set_index(keys='Location')

In [51]:
locafinal.head()

Unnamed: 0_level_0,Climate
Location,Unnamed: 1_level_1
Canberra,Cool temperate
Sydney,Warm temperate
Perth,Warm temperate
Darwin,"High humidity summer, warm winter"
Hobart,Cool temperate


In [52]:
locafinal.to_csv(r'samplelocation.csv')

In [53]:
# 用气候替换原本中城市
import re
Xtrain['Location'] = Xtrain['Location'].map(locafinal.iloc[:, 0]).apply(lambda x: re.sub(',', '', x.strip()))
Xtest['Location'] = Xtest['Location'].map(locafinal.iloc[:, 0]).apply(lambda x: re.sub(',', '', x.strip()))
Xtrain = Xtrain.rename(columns={"Location":"Climate"})
Xtest = Xtest.rename(columns={"Location":"Climate"})

In [54]:
Xtrain.head()

Unnamed: 0,Month,Climate,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,5,Hot dry summer warm winter,8.9,23.2,0.0,,,E,46.0,E,...,19.0,52.0,27.0,1025.0,1020.7,,,14.4,21.8,No
1,10,Mild temperate,8.8,15.0,4.4,,,WNW,35.0,W,...,19.0,66.0,71.0,1024.5,1023.4,,,13.5,13.8,Yes
2,12,Hot dry summer cool winter,8.1,23.6,0.0,10.0,,WSW,48.0,S,...,19.0,35.0,15.0,1017.4,1016.3,0.0,0.0,14.4,21.9,No
3,1,Hot dry summer cool winter,15.7,26.7,3.0,,,S,52.0,ENE,...,33.0,77.0,63.0,,,,,19.4,22.5,Yes
4,3,Hot dry summer cool winter,15.3,23.9,10.4,9.6,6.1,SW,41.0,W,...,22.0,93.0,40.0,1019.5,1019.5,8.0,7.0,16.2,21.2,Yes


In [55]:
# 处理分类变量：填补缺失值
Xtrain.isnull().mean()

Month            0.000000
Climate          0.000000
MinTemp          0.004451
MaxTemp          0.002441
Rainfall         0.010047
Evaporation      0.428362
Sunshine         0.477892
WindGustDir      0.065645
WindGustSpeed    0.065203
WindDir9am       0.070849
WindDir3pm       0.026604
WindSpeed9am     0.009484
WindSpeed3pm     0.018456
Humidity9am      0.012568
Humidity3pm      0.025941
Pressure9am      0.098709
Pressure3pm      0.098488
Cloud9am         0.377847
Cloud3pm         0.402693
Temp9am          0.006108
Temp3pm          0.019370
RainToday        0.010047
dtype: float64

In [56]:
# 找出所有的分类特征
cate = Xtrain.columns[Xtrain.dtypes == 'object'].tolist()
cloud = ['Cloud9am', 'Cloud3pm']
cate = cate + cloud

In [57]:
cate

['Climate',
 'WindGustDir',
 'WindDir9am',
 'WindDir3pm',
 'RainToday',
 'Cloud9am',
 'Cloud3pm']

In [58]:
# 利用众数来进行填充
from sklearn.impute import SimpleImputer
si = SimpleImputer(missing_values = np.nan, strategy='most_frequent')
si.fit(Xtrain.loc[:, cate])

SimpleImputer(strategy='most_frequent')

In [59]:
Xtrain.loc[:, cate] = si.transform(Xtrain.loc[:, cate])
Xtest.loc[:, cate] = si.transform(Xtest.loc[:, cate])

In [60]:
Xtrain.loc[:, cate].isnull().mean()

Climate        0.0
WindGustDir    0.0
WindDir9am     0.0
WindDir3pm     0.0
RainToday      0.0
Cloud9am       0.0
Cloud3pm       0.0
dtype: float64

In [61]:
Xtest.loc[:, cate].isnull().mean()

Climate        0.0
WindGustDir    0.0
WindDir9am     0.0
WindDir3pm     0.0
RainToday      0.0
Cloud9am       0.0
Cloud3pm       0.0
dtype: float64

In [62]:
# 将分类型变量进行编码
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder()
oe = oe.fit(Xtrain.loc[:, cate])
Xtrain.loc[:, cate] = oe.transform(Xtrain.loc[:, cate])
Xtest.loc[:, cate] = oe.transform(Xtest.loc[:, cate])

In [63]:
Xtrain.loc[:, cate].head()

Unnamed: 0,Climate,WindGustDir,WindDir9am,WindDir3pm,RainToday,Cloud9am,Cloud3pm
0,3.0,0.0,0.0,1.0,0.0,7.0,7.0
1,4.0,14.0,13.0,13.0,1.0,7.0,7.0
2,2.0,15.0,8.0,11.0,0.0,0.0,0.0
3,2.0,8.0,1.0,10.0,1.0,7.0,7.0
4,2.0,12.0,13.0,15.0,1.0,8.0,7.0


In [64]:
Xtest.loc[:, cate].head()

Unnamed: 0,Climate,WindGustDir,WindDir9am,WindDir3pm,RainToday,Cloud9am,Cloud3pm
0,6.0,8.0,7.0,14.0,0.0,4.0,7.0
1,4.0,6.0,3.0,3.0,0.0,7.0,7.0
2,6.0,12.0,3.0,0.0,0.0,7.0,7.0
3,6.0,7.0,7.0,5.0,0.0,8.0,5.0
4,2.0,13.0,7.0,15.0,1.0,2.0,7.0


In [65]:
# 处理连续型变量：填补缺失值
col = Xtrain.columns.tolist()
for i in cate:
    col.remove(i)

In [66]:
col

['Month',
 'MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustSpeed',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Temp9am',
 'Temp3pm']

In [67]:
# 利用均值进行填充
impean = SimpleImputer(missing_values = np.nan, strategy = 'mean')
impean = impean.fit(Xtrain.loc[:, col])
Xtrain.loc[:, col] = impean.transform(Xtrain.loc[:, col])
Xtest.loc[:, col] = impean.transform(Xtest.loc[:, col])

In [68]:
Xtrain.head()

Unnamed: 0,Month,Climate,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,5.0,3.0,8.9,23.2,0.0,5.455923,7.609452,0.0,46.0,0.0,...,19.0,52.0,27.0,1025.0,1020.7,7.0,7.0,14.4,21.8,0.0
1,10.0,4.0,8.8,15.0,4.4,5.455923,7.609452,14.0,35.0,13.0,...,19.0,66.0,71.0,1024.5,1023.4,7.0,7.0,13.5,13.8,1.0
2,12.0,2.0,8.1,23.6,0.0,10.0,7.609452,15.0,48.0,8.0,...,19.0,35.0,15.0,1017.4,1016.3,0.0,0.0,14.4,21.9,0.0
3,1.0,2.0,15.7,26.7,3.0,5.455923,7.609452,8.0,52.0,1.0,...,33.0,77.0,63.0,1017.649291,1015.254827,7.0,7.0,19.4,22.5,1.0
4,3.0,2.0,15.3,23.9,10.4,9.6,6.1,12.0,41.0,13.0,...,22.0,93.0,40.0,1019.5,1019.5,8.0,7.0,16.2,21.2,1.0


In [69]:
Xtest.head()

Unnamed: 0,Month,Climate,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,11.0,6.0,11.8,36.1,0.0,7.6,7.609452,8.0,54.0,7.0,...,24.0,49.0,12.0,1010.0,1002.9,4.0,7.0,23.1,34.5,0.0
1,8.0,4.0,5.2,15.0,0.0,5.455923,7.609452,6.0,17.0,3.0,...,4.0,94.0,56.0,1028.2,1025.0,7.0,7.0,6.6,13.7,0.0
2,11.0,6.0,17.6,29.5,0.2,5.455923,7.609452,12.0,59.0,3.0,...,13.0,90.0,51.0,1017.649291,1015.254827,7.0,7.0,21.1,28.1,0.0
3,8.0,6.0,10.8,20.7,0.0,5.455923,7.609452,7.0,67.0,7.0,...,17.0,49.0,37.0,1016.8,1014.2,8.0,5.0,15.3,20.5,0.0
4,5.0,2.0,9.6,18.8,4.2,3.6,7.2,13.0,31.0,7.0,...,11.0,91.0,62.0,1015.9,1016.5,2.0,7.0,12.2,17.8,1.0


In [70]:
# 处理连续型变量：无量纲化
col.remove('Month')

In [71]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss = ss.fit(Xtrain.loc[:, col])
Xtrain.loc[:, col] = ss.transform(Xtrain.loc[:, col])
Xtest.loc[:, col] = ss.transform(Xtest.loc[:, col])

In [72]:
Xtrain.head()

Unnamed: 0,Month,Climate,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,5.0,3.0,-0.513271,-0.001799,-0.279761,-5.642218e-16,6.49864e-16,0.0,0.456391,0.0,...,0.041929,-0.888752,-1.194717,1.089573,0.8156459,7.0,7.0,-0.398283,0.01816,0.0
1,10.0,4.0,-0.52892,-1.156434,0.244328,-5.642218e-16,6.49864e-16,14.0,-0.378695,13.0,...,0.041929,-0.150115,0.95109,1.01546,1.220086,7.0,7.0,-0.537352,-1.147621,1.0
2,12.0,2.0,-0.638464,0.054524,-0.279761,1.443329,6.49864e-16,15.0,0.608225,8.0,...,0.041929,-1.785668,-1.779937,-0.03695165,0.156559,0.0,0.0,-0.398283,0.032732,0.0
3,1.0,2.0,0.550864,0.491032,0.077573,-5.642218e-16,6.49864e-16,8.0,0.911893,1.0,...,1.645067,0.430242,0.560943,-1.685145e-14,1.702943e-14,7.0,7.0,0.374326,0.120165,1.0
4,3.0,2.0,0.488268,0.096767,0.958996,1.316278,-0.5522193,12.0,0.076806,13.0,...,0.385458,1.274398,-0.560728,0.2743249,0.6358949,8.0,7.0,-0.120144,-0.069274,1.0


In [73]:
Xtest.head()

Unnamed: 0,Month,Climate,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,11.0,6.0,-0.059449,1.814638,-0.279761,0.6810203,6.49864e-16,8.0,1.063727,7.0,...,0.614478,-1.047031,-1.926242,-1.133831,-1.85066,4.0,7.0,0.946057,1.868836,0.0
1,8.0,4.0,-1.092286,-1.156434,-0.279761,-5.642218e-16,6.49864e-16,6.0,-1.745201,3.0,...,-1.675719,1.327158,0.219565,1.563899,1.459753,7.0,7.0,-1.603552,-1.162193,0.0
2,11.0,6.0,0.848196,0.885298,-0.255939,-5.642218e-16,6.49864e-16,12.0,1.443312,3.0,...,-0.64513,1.116119,-0.024277,-1.685145e-14,1.702943e-14,7.0,7.0,0.637013,0.936212,0.0
3,8.0,6.0,-0.215939,-0.353822,-0.279761,-5.642218e-16,6.49864e-16,7.0,2.050648,7.0,...,-0.187091,-1.047031,-0.707034,-0.1258878,-0.1580051,8.0,5.0,-0.259213,-0.17128,0.0
4,5.0,2.0,-0.403728,-0.621359,0.220506,-0.5894943,-0.1497943,13.0,-0.682363,7.0,...,-0.87415,1.168879,0.512175,-0.2592921,0.1865175,2.0,7.0,-0.738231,-0.564731,1.0


In [74]:
Ytrain.head()

Unnamed: 0,0
0,0
1,1
2,0
3,1
4,0


In [75]:
Ytest.head()

Unnamed: 0,0
0,1
1,0
2,1
3,0
4,0


In [76]:
# 建模与模型评估
from time import time
import datetime
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, recall_score

In [None]:
Ytrain = Ytrain.iloc[:, 0].ravel()
Ytest = Ytest.iloc[:, 0].ravel()
# for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
#     clf = SVC(kernel=kernel,
#              gamma='auto',
#              degree=1,
#              cache_size=5000
#              ).fit(Xtrain, Ytrain)
#     result = clf.predict(Xtest)
#     score = clf.score(Xtest, Ytest)
#     recall = recall_score(Ytest, result)
#     auc = roc_auc_score(Ytest, clf.decision_function(Xtest))
#     print('%s testing accuracy %f, recall is %f, auc is %f' % (kernel, score, recall, auc))

In [1]:
import matplotlib.pyplot as plt
C_range = np.linspace(0.01,20,20)
recallall = []
aucall = []
scoreall = []
for C in C_range:
    clf = SVC(kernel = "linear",C=C,cache_size = 5000
            ,class_weight = "balanced"
            ).fit(Xtrain, Ytrain)
    result = clf.predict(Xtest)
    score = clf.score(Xtest,Ytest)
    recall = recall_score(Ytest, result)
    auc = roc_auc_score(Ytest,clf.decision_function(Xtest))
    recallall.append(recall)
    aucall.append(auc)
    scoreall.append(score)
    print("under C %f, testing accuracy is %f,recall is %f', auc is %f" %
    (C,score,recall,auc))
    print(max(aucall),C_range[aucall.index(max(aucall))])
plt.figure()
plt.plot(C_range,recallall,c="red",label="recall")
plt.plot(C_range,aucall,c="black",label="auc")
plt.plot(C_range,scoreall,c="orange",label="accuracy")
plt.legend()
plt.show()

IndentationError: expected an indented block (<ipython-input-1-87e96b8f7cc1>, line 7)