In [37]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

df = pd.read_csv("OccupancyTrain.csv")

In [38]:
df

Unnamed: 0,id,Temperature,Humidity,Light,CO2,HumidityRatio,Occupied
0,1,23.18,27.2720,426.0,721.250000,0.004793,yes
1,2,23.15,27.2675,429.5,714.000000,0.004783,yes
2,3,23.15,27.2450,426.0,713.500000,0.004779,yes
3,4,23.15,27.2000,426.0,708.250000,0.004772,yes
4,5,23.10,27.2000,426.0,704.500000,0.004757,yes
...,...,...,...,...,...,...,...
8138,8139,21.05,36.0975,433.0,787.250000,0.005579,yes
8139,8140,21.05,35.9950,433.0,789.500000,0.005563,yes
8140,8141,21.10,36.0950,433.0,798.500000,0.005596,yes
8141,8142,21.10,36.2600,433.0,820.333333,0.005621,yes


I need to change the categorical values in the Occupied column into numeric values. First I need to check for the unique values in the column so I know what needs to be changed.

In [39]:
df["Occupied"].unique()

array(['yes', 'no', nan, 'Yes', 'Y', 'N ', 'No', 'Y '], dtype=object)

In [40]:
df.isnull().sum()

id               0
Temperature      0
Humidity         0
Light            0
CO2              0
HumidityRatio    0
Occupied         3
dtype: int64

This code showed that there are only 3 null values in the dataset. Since they are in the Occupied column (the variable to be predicted) I am choosing to delete those 3 entries using the dropna function. Since there are over 8,000 rows of data, deleting 3 won't be a significant loss. 

In [41]:
df.dropna(axis=0, inplace=True)

In [42]:
# verify the drop worked

df.isnull().sum()

id               0
Temperature      0
Humidity         0
Light            0
CO2              0
HumidityRatio    0
Occupied         0
dtype: int64

In [43]:
# Change categorical values into numeric

df['Occupied'] = df['Occupied'].replace( {'N': 0, 'Y': 1, 'yes': 1, 'no': 0, 'Yes': 1, 'N ': 0, 'No': 0, 'Y ': 1 } ).astype(int)

  df['Occupied'] = df['Occupied'].replace( {'N': 0, 'Y': 1, 'yes': 1, 'no': 0, 'Yes': 1, 'N ': 0, 'No': 0, 'Y ': 1 } ).astype(int)


In [44]:
df["Occupied"].unique()

array([1, 0])

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8140 entries, 0 to 8142
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             8140 non-null   int64  
 1   Temperature    8140 non-null   float64
 2   Humidity       8140 non-null   float64
 3   Light          8140 non-null   float64
 4   CO2            8140 non-null   float64
 5   HumidityRatio  8140 non-null   float64
 6   Occupied       8140 non-null   int64  
dtypes: float64(5), int64(2)
memory usage: 508.8 KB


In [46]:
df.describe()

Unnamed: 0,id,Temperature,Humidity,Light,CO2,HumidityRatio,Occupied
count,8140.0,8140.0,8140.0,8140.0,8140.0,8140.0,8140.0
mean,4073.35086,20.618656,25.731379,119.502981,606.533668,0.003862,0.212285
std,2350.195031,1.016853,5.532199,194.738921,314.364389,0.000852,0.408951
min,1.0,19.0,16.745,0.0,412.75,0.002674,0.0
25%,2038.75,19.7,20.2,0.0,439.0,0.003078,0.0
50%,4073.5,20.39,26.2225,0.0,453.5,0.003801,0.0
75%,6108.25,21.39,30.533333,256.1875,638.75,0.004352,0.0
max,8143.0,23.18,39.1175,1546.333333,2028.5,0.006476,1.0


In [47]:
df.to_csv('Cleaned_Occupancy_Train.csv', index=False)

In [48]:
df.sort_values('Light', ascending=True).tail(10)

Unnamed: 0,id,Temperature,Humidity,Light,CO2,HumidityRatio,Occupied
1146,1147,22.856667,26.323333,676.0,1024.75,0.004535,0
1144,1145,22.7225,26.4725,686.25,1027.0,0.004523,1
1112,1113,22.6,26.79,688.5,1100.5,0.004544,1
1106,1107,22.39,26.79,696.5,1078.0,0.004486,1
1109,1110,22.4725,26.89,708.75,1099.0,0.004526,1
1143,1144,22.675,26.525,732.75,1038.5,0.004519,1
1108,1109,22.426667,27.0,744.0,1098.333333,0.004531,1
3833,3834,20.76,18.856667,829.0,452.666667,0.00285,0
3832,3833,20.745,18.89,1451.75,453.0,0.002853,0
3831,3832,20.7,18.89,1546.333333,455.333333,0.002845,0


In [49]:
df.corr()

Unnamed: 0,id,Temperature,Humidity,Light,CO2,HumidityRatio,Occupied
id,1.0,-0.441828,0.642842,-0.121985,0.12238,0.504866,-0.098121
Temperature,-0.441828,1.0,-0.141825,0.650179,0.560005,0.151627,0.538365
Humidity,0.642842,-0.141825,1.0,0.0379,0.439068,0.955218,0.133057
Light,-0.121985,0.650179,0.0379,1.0,0.664013,0.230527,0.907304
CO2,0.12238,0.560005,0.439068,0.664013,1.0,0.626616,0.712238
HumidityRatio,0.504866,0.151627,0.955218,0.230527,0.626616,1.0,0.300395
Occupied,-0.098121,0.538365,0.133057,0.907304,0.712238,0.300395,1.0


In [50]:
df_test = pd.read_csv('OccupancyTest.csv')
df_test

Unnamed: 0,id,Temperature,Humidity,Light,CO2,HumidityRatio,Occupied
0,140,23.700000,26.272000,585.200000,749.200000,0.004764,yes
1,141,23.718000,26.290000,578.400000,760.400000,0.004773,yes
2,142,23.730000,26.230000,572.666667,769.666667,0.004765,yes
3,143,23.722500,26.125000,493.750000,774.750000,0.004744,yes
4,144,23.754000,26.200000,488.600000,779.000000,0.004767,yes
...,...,...,...,...,...,...,...
2660,2800,24.290000,25.700000,808.000000,1150.250000,0.004829,yes
2661,2801,24.330000,25.736000,809.800000,1129.200000,0.004848,yes
2662,2802,24.330000,25.700000,817.000000,1125.800000,0.004841,yes
2663,2803,24.356667,25.700000,813.000000,1123.000000,0.004849,yes


In [51]:
df_test["Occupied"].unique()

array(['yes', 'no'], dtype=object)

In [52]:
df_test['Occupied'] = df_test['Occupied'].replace( {'yes': 1, 'no': 0} ).astype(int)
df_test

  df_test['Occupied'] = df_test['Occupied'].replace( {'yes': 1, 'no': 0} ).astype(int)


Unnamed: 0,id,Temperature,Humidity,Light,CO2,HumidityRatio,Occupied
0,140,23.700000,26.272000,585.200000,749.200000,0.004764,1
1,141,23.718000,26.290000,578.400000,760.400000,0.004773,1
2,142,23.730000,26.230000,572.666667,769.666667,0.004765,1
3,143,23.722500,26.125000,493.750000,774.750000,0.004744,1
4,144,23.754000,26.200000,488.600000,779.000000,0.004767,1
...,...,...,...,...,...,...,...
2660,2800,24.290000,25.700000,808.000000,1150.250000,0.004829,1
2661,2801,24.330000,25.736000,809.800000,1129.200000,0.004848,1
2662,2802,24.330000,25.700000,817.000000,1125.800000,0.004841,1
2663,2803,24.356667,25.700000,813.000000,1123.000000,0.004849,1


For this dataset, generate three different decision trees using Python.  You can vary the predictors to get three different trees. For each tree:    
•	state the features you considered,  
•	and state the accuracy of the model,  
•	and show the Python statements that you used.

In [59]:
X_train = df[['Light', 'CO2']]
y_train = df[['Occupied']]
X_test = df_test[['Light', 'CO2']]
y_test = df_test[['Occupied']]

In [54]:
X_test

Unnamed: 0,Light,CO2
0,585.200000,749.200000
1,578.400000,760.400000
2,572.666667,769.666667
3,493.750000,774.750000
4,488.600000,779.000000
...,...,...
2660,808.000000,1150.250000
2661,809.800000,1129.200000
2662,817.000000,1125.800000
2663,813.000000,1123.000000


In [57]:
y_test

Unnamed: 0,Occupied
0,1
1,1
2,1
3,1
4,1
...,...
2660,1
2661,1
2662,1
2663,1


In [60]:
y_train

Unnamed: 0,Occupied
0,1
1,1
2,1
3,1
4,1
...,...
8138,1
8139,1
8140,1
8141,1


In [61]:
dtree_1 = DecisionTreeClassifier(criterion='gini').fit(X_train, y_train)
score_1 = dtree_1.score(X_test, y_test)
score_1

0.924202626641651

In [62]:
dtree_2 = DecisionTreeClassifier(max_depth=5).fit(X_train, y_train)
score_2 = dtree_2.score(X_test, y_test)
score_2

0.9320825515947467

In [64]:
dtree_3 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
score_3 = dtree_3.score(X_test, y_test)
score_3

0.9782363977485928

In [65]:
dot_data = tree.export_graphviz(dtree_3, out_file='OccupiedDecisionTree.dot',
                               feature_names= ['Light', 'CO2'],
                               class_names= ['Unoccuped', 'Occupied'],
                               rounded=True,
                               filled=True)