In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer

In [3]:
# Load the daily weather readings from the CSV file into a DataFrame.
df = pd.read_csv('Data/daily_weather.csv')

In [4]:
# Print column dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   number                  1095 non-null   int64  
 1   air_pressure_9am        1092 non-null   float64
 2   air_temp_9am            1090 non-null   float64
 3   avg_wind_direction_9am  1091 non-null   float64
 4   avg_wind_speed_9am      1092 non-null   float64
 5   max_wind_direction_9am  1092 non-null   float64
 6   max_wind_speed_9am      1091 non-null   float64
 7   rain_accumulation_9am   1089 non-null   float64
 8   rain_duration_9am       1092 non-null   float64
 9   relative_humidity_9am   1095 non-null   float64
 10  relative_humidity_3pm   1095 non-null   float64
dtypes: float64(10), int64(1)
memory usage: 94.2 KB


In [5]:
# Show only the rows that contain at least one missing value.
has_missing = df.isna().any(axis='columns')
df.loc[has_missing]

Unnamed: 0,number,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
16,16,917.89,,169.2,2.192201,196.8,2.930391,0.0,0.0,48.99,51.19
111,111,915.29,58.82,182.6,15.613841,189.0,,0.0,0.0,21.5,29.69
177,177,915.9,,183.3,4.719943,189.9,5.346287,0.0,0.0,29.26,46.5
262,262,923.596607,58.380598,47.737753,10.636273,67.145843,13.671423,0.0,,17.990876,16.461685
277,277,920.48,62.6,194.4,2.751436,,3.869906,0.0,0.0,52.58,54.03
334,334,916.23,75.74,149.1,2.751436,187.5,4.183078,,1480.0,31.88,32.9
358,358,917.44,58.514,55.1,10.021491,,12.705819,0.0,0.0,13.88,25.93
361,361,920.444946,65.801845,49.823346,21.520177,61.886944,25.549112,,40.364018,12.278715,7.618649
381,381,918.48,66.542,90.9,3.467257,89.4,4.406772,,0.0,20.64,14.35
409,409,,67.853833,65.880616,4.328594,78.570923,5.216734,0.0,0.0,18.487385,20.356594


In [6]:
# Count the missing values in each column.
df.isna().sum()

number                    0
air_pressure_9am          3
air_temp_9am              5
avg_wind_direction_9am    4
avg_wind_speed_9am        3
max_wind_direction_9am    3
max_wind_speed_9am        4
rain_accumulation_9am     6
rain_duration_9am         3
relative_humidity_9am     0
relative_humidity_3pm     0
dtype: int64

In [7]:
# Select the numeric columns (float64 / int64); per df.info() every column is numeric.
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Replace each missing numeric value with the median of its column.
# NOTE(review): the imputer is fitted on the whole frame, i.e. before the
# train/test split performed later — consider fitting on training rows only.
num_imputer = SimpleImputer(strategy='median')
imputed_values = num_imputer.fit_transform(df[num_cols])
df[num_cols] = imputed_values

# Sanity check: total number of nulls left in the frame (expected to be 0).
print("Valores nulos restantes después de la imputación:")
print(df.isna().sum().sum())

Valores nulos restantes después de la imputación:
0


In [8]:
# Drop the `number` column, which is not used as a feature or target below.
# `errors='ignore'` keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising KeyError (as `del` did).
df = df.drop(columns=['number'], errors='ignore')

In [9]:
# Preview the first rows after imputation and after removing `number`.
df.head()

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,36.16
1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597
2,923.04,60.638,51.0,17.067852,63.7,22.100967,0.0,20.0,8.9,14.46
3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,12.742547
4,921.16,44.294,277.8,1.85666,136.5,2.863283,8.9,14730.0,92.41,76.74


In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) for every column.
df.describe()

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
count,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0
mean,918.882657,64.936574,142.322322,5.503799,149.03118,7.01193,0.201966,293.302277,34.241402,35.344727
std,3.179793,11.150071,69.026253,4.547373,67.16212,5.58937,1.589646,1595.960236,25.472067,22.524079
min,907.99,36.752,15.5,0.693451,28.9,1.185578,0.0,0.0,6.09,5.3
25%,916.565752,57.353,66.109634,2.25412,76.66357,3.077758,0.0,0.0,15.092243,17.395
50%,918.921045,65.715479,166.0,3.871333,177.3,4.943637,0.0,0.0,23.179259,24.38
75%,921.16,73.43592,190.9,7.325978,201.15,8.872373,0.0,0.0,45.4,52.06
max,929.32,98.906,343.4,23.554978,312.2,29.84078,24.02,17704.0,92.62,92.25


In [11]:
# Work on a copy so that `df` itself is not modified by the new column.
df_clean = df.copy()

# Binarise the 3pm relative humidity into a 0/1 label so the decision tree
# can be trained as a classifier: 1 when the reading is above 24.99, else 0.
humidity_above_cutoff = df_clean['relative_humidity_3pm'].gt(24.99)
df_clean['high_humidity_label'] = humidity_above_cutoff.astype(int)
df_clean.head()

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm,high_humidity_label
0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,36.16,1
1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597,0
2,923.04,60.638,51.0,17.067852,63.7,22.100967,0.0,20.0,8.9,14.46,0
3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,12.742547,0
4,921.16,44.294,277.8,1.85666,136.5,2.863283,8.9,14730.0,92.41,76.74,1


In [12]:
# Store the target in `y` as a one-column DataFrame, copied from df_clean.
y = df_clean.loc[:, ['high_humidity_label']].copy()

In [13]:
# Peek at the raw 3pm humidity values the label was derived from.
df_clean['relative_humidity_3pm'].head()

0    36.160000
1    19.426597
2    14.460000
3    12.742547
4    76.740000
Name: relative_humidity_3pm, dtype: float64

In [14]:
# Peek at the first target labels for comparison with the humidity values.
y.head()

Unnamed: 0,high_humidity_label
0,1
1,0
2,0
3,0
4,1


In [15]:
# List the columns currently present in `df`.
df.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')

In [16]:
# Feature columns: the 9am readings only. `relative_humidity_3pm` is not
# listed here; the target label is derived from it.
morning_features = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
    'relative_humidity_9am',
]

In [17]:
# Feature matrix: an independent copy of the morning feature columns.
X = df_clean.loc[:, morning_features].copy()
X.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am'],
      dtype='object')

In [18]:
# Confirm that the target frame holds only the label column.
y.columns

Index(['high_humidity_label'], dtype='object')

In [19]:
# Split the data set: 33% of the rows are held out for testing, and the fixed
# random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=324,
)

In [20]:
# Classifier: a decision tree capped at 15 leaf nodes, seeded for
# reproducibility, then fitted on the training split.
humidity_classifier = DecisionTreeClassifier(
    max_leaf_nodes=15,
    random_state=0,
)
humidity_classifier.fit(X_train, Y_train)

In [21]:
# Show the class of the fitted estimator object.
type(humidity_classifier)

sklearn.tree._classes.DecisionTreeClassifier

In [22]:
# Predict the label for every row of the held-out test features.
predictions = humidity_classifier.predict(X_test)
# Display the first 20 predictions for comparison with the true labels.
predictions[:20]

array([1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1])

In [23]:
# First 20 true labels of the test split, for comparison with the predictions.
Y_test['high_humidity_label'].head(20)

292     0
143     1
497     0
325     1
221     0
516     1
180     1
449     0
704     1
810     0
1063    0
588     1
599     0
952     0
49      0
785     1
761     0
456     0
214     0
670     1
Name: high_humidity_label, dtype: int64

In [24]:
# Measure accuracy: the fraction of test rows whose predicted label matches
# the true label.
accuracy_score(Y_test, predictions)

0.8839779005524862