In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


In [2]:
data = pd.read_csv('weather.csv')


In [3]:
data.columns


Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RISK_MM', 'RainTomorrow'],
      dtype='object')

In [4]:
data.head

<bound method NDFrame.head of      MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine WindGustDir  \
0        8.0     24.3       0.0          3.4       6.3          NW   
1       14.0     26.9       3.6          4.4       9.7         ENE   
2       13.7     23.4       3.6          5.8       3.3          NW   
3       13.3     15.5      39.8          7.2       9.1          NW   
4        7.6     16.1       2.8          5.6      10.6         SSE   
..       ...      ...       ...          ...       ...         ...   
361      9.0     30.7       0.0          7.6      12.1         NNW   
362      7.1     28.4       0.0         11.6      12.7           N   
363     12.5     19.9       0.0          8.4       5.3         ESE   
364     12.5     26.9       0.0          5.0       7.1          NW   
365     12.3     30.2       0.0          6.0      12.6          NW   

     WindGustSpeed WindDir9am WindDir3pm  WindSpeed9am  ...  Humidity3pm  \
0             30.0         SW         NW           6.

In [5]:
data[data.isnull().any(axis=1)]


Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
113,12.7,28.6,6.6,3.2,8.6,W,50.0,,W,,...,42,1008.3,1002.3,5,5,16.5,27.4,Yes,0.0,No
118,11.7,27.6,0.0,7.8,8.1,W,48.0,,W,,...,25,1013.6,1009.1,2,6,17.6,26.8,No,0.0,No
127,10.8,29.2,0.0,8.4,7.5,E,50.0,,ENE,,...,34,1021.6,1017.7,0,7,17.5,27.7,No,3.0,Yes
138,12.5,29.9,0.0,5.8,10.7,NW,43.0,,WNW,0.0,...,30,1022.7,1018.8,0,1,16.8,29.2,No,0.0,No
140,11.5,29.3,0.0,5.2,8.0,NW,46.0,,WSW,0.0,...,35,1013.8,1009.4,7,7,17.1,27.6,No,0.0,No
174,7.5,19.0,0.0,4.0,6.8,ENE,26.0,,ESE,0.0,...,43,1025.5,1022.2,6,5,12.6,18.1,No,0.0,No
176,5.0,20.9,0.0,2.0,8.9,WNW,22.0,,NW,0.0,...,42,1019.9,1014.3,7,6,10.1,20.6,No,0.2,No
180,-2.1,13.8,0.2,1.8,9.5,NNW,22.0,,NNW,0.0,...,40,1020.6,1019.6,0,1,6.3,13.2,No,0.0,No
189,-0.2,18.1,0.0,4.4,9.4,NW,24.0,,NW,0.0,...,44,1021.4,1018.9,1,1,6.7,16.9,No,0.0,No
193,9.4,19.2,0.0,2.2,7.7,,24.0,E,NNW,4.0,...,47,1024.2,1020.3,7,1,12.1,18.8,No,0.0,No


In [7]:
before_rows = data.shape[0]
print(before_rows)


366


In [8]:
after_rows = data.shape[0]
print(after_rows)


366


In [9]:
before_rows - after_rows


0

In [11]:
clean_data = data.copy()
humidity_level=24.99
clean_data['high_humidity_label'] = (clean_data['Humidity3pm'] > humidity_level)*1
print(clean_data['high_humidity_label'])


0      1
1      1
2      1
3      1
4      1
      ..
361    0
362    0
363    1
364    1
365    0
Name: high_humidity_label, Length: 366, dtype: int32


In [12]:
y=clean_data[['high_humidity_label']].copy()
y



Unnamed: 0,high_humidity_label
0,1
1,1
2,1
3,1
4,1
...,...
361,0
362,0
363,1
364,1


In [14]:
y.head()



Unnamed: 0,high_humidity_label
0,1
1,1
2,1
3,1
4,1


In [15]:
morning_features = ['WindDir9am','WindDir3pm','WindSpeed9am','Humidity3pm','Pressure9am','Pressure3pm','Cloud9am','Cloud3pm','Temp9am','Temp3pm']


In [16]:
X = clean_data[morning_features].copy()


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=324)


In [18]:
print(type(X_train),
type(X_test),
type(y_train),
type(y_test))


<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


In [19]:
X_train.head()


Unnamed: 0,WindDir9am,WindDir3pm,WindSpeed9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
186,NE,WNW,2.0,41,1019.8,1015.8,6,5,7.7,18.5
80,NW,WNW,19.0,71,1004.9,1004.0,7,8,19.4,21.9
233,E,WNW,6.0,59,1023.0,1021.0,3,6,4.2,12.1
257,,NNW,,46,1023.0,1020.1,1,7,3.9,13.3
315,NE,N,4.0,20,1018.8,1013.9,1,1,6.0,16.3


In [21]:
y_train.head()


Unnamed: 0,high_humidity_label
186,1
80,1
233,1
257,1
315,0


ValueError: could not convert string to float: 'NNW'