UCSanDiegoX: DSE200x Python for Data Science

Week 7 - Introduction to Machine Learning

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

Classification of Weather Data using scikit-learn 

In [2]:
# Load the daily weather readings from the local CSV file.
weather_csv = './weather/daily_weather.csv'
data = pd.read_csv(weather_csv)

print(data.columns)
print(data.head())

# Report the frame's shape, followed by the number of rows that contain
# at least one missing value.
has_missing = data.isnull().any(axis=1)
print(data.shape, data.loc[has_missing].shape[0])

Index(['number', 'air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')
   number  air_pressure_9am  air_temp_9am  avg_wind_direction_9am  \
0       0        918.060000     74.822000              271.100000   
1       1        917.347688     71.403843              101.935179   
2       2        923.040000     60.638000               51.000000   
3       3        920.502751     70.138895              198.832133   
4       4        921.160000     44.294000              277.800000   

   avg_wind_speed_9am  max_wind_direction_9am  max_wind_speed_9am  \
0            2.080354              295.400000            2.863283   
1            2.443009              140.471548            3.533324   
2           17.067852               63.700000           22.100967   
3            4.33736

Data Cleaning

In [3]:
# The 'number' column is just a sample id that we are not interested in.
# drop(..., errors='ignore') keeps this cell idempotent: re-running it after
# the column is already gone is a no-op, whereas `del data['number']` would
# raise a KeyError on the second run.
data = data.drop(columns=['number'], errors='ignore')

In [4]:
# Discard every row that has at least one missing value, reporting the row
# count on either side of the cleanup.
print('Rows before cleanup', len(data))
data = data.dropna(axis=0, how='any')
print('Rows after cleanup', len(data))

Rows before cleanup 1095
Rows after cleanup 1064


Task: predict whether the relative humidity at 3pm will be high (above 24.99), using only the sensor readings taken at 9am

In [5]:
# Derive the binary target: 1 when the 3pm relative humidity is above 24.99,
# otherwise 0. `assign` returns a new frame, so `data` itself is not modified.
clean_data = data.assign(
    high_humidity_label=lambda frame: frame['relative_humidity_3pm'].gt(24.99) * 1
)
print(clean_data['high_humidity_label'].head())

0    1
1    0
2    0
3    0
4    1
Name: high_humidity_label, dtype: int32


In [6]:
# The target `y` is kept as a one-column DataFrame (note the list selector).
y = clean_data.loc[:, ['high_humidity_label']].copy()

# Show the raw 3pm humidity followed by the derived label for comparison.
print(clean_data['relative_humidity_3pm'].head())
print(y.head())

0    36.160000
1    19.426597
2    14.460000
3    12.742547
4    76.740000
Name: relative_humidity_3pm, dtype: float64
   high_humidity_label
0                    1
1                    0
2                    0
3                    0
4                    1


In [7]:
# The 9am sensor readings are the features used to predict humidity at 3pm.
morning_features = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]
X = clean_data.loc[:, morning_features].copy()

# Confirm which columns ended up in the feature matrix and in the target.
print(X.columns)
print(y.columns)

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am'],
      dtype='object')
Index(['high_humidity_label'], dtype='object')


Split the Data into Training and Test Sets

In [8]:
# Hold out 33% of the samples for testing; the fixed random_state makes the
# split reproducible (train_test_split comes from sklearn.model_selection).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=324,
)

In [9]:
# Optional sanity checks on the train/test split; uncomment a line to inspect
# the container types, the first training rows, or summary statistics of the
# training target.
# print(type(X_train), type(X_test), type(y_train), type(y_test))
#X_train.head()
#y_train.describe()

In [10]:
# Train a decision tree on the training split only. The tree is capped at
# 10 leaf nodes and uses a fixed random_state.
humidity_classifier = DecisionTreeClassifier(
    max_leaf_nodes=10,
    random_state=0,
)
# Left as the cell's last expression so the fitted estimator is displayed.
humidity_classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=10,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [11]:
# Predict labels for the held-out test samples.
predictions = humidity_classifier.predict(X_test)

# Show the first ten predictions, then the first ten true labels.
print(predictions[:10])
print(y_test['high_humidity_label'].head(10))

# Accuracy: fraction of test samples whose predicted label matches the truth.
test_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print('Accuracy:', test_accuracy)

[0 0 1 1 1 1 0 0 0 1]
456     0
845     0
693     1
259     1
723     1
224     1
300     1
442     0
585     1
1057    1
Name: high_humidity_label, dtype: int32
Accuracy: 0.8153409090909091
