In [15]:
# Import libraries
import os
import requests
import pandas as pd
import tensorflow as tf
import tensorflow_data_validation as tfdv
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
# Report the installed library versions so the run can be reproduced later
print('TF version:', tf.__version__)
print('TFDV version:', tfdv.version.__version__)

TF version: 2.13.0
TFDV version: 1.14.0


# 2. Descripción del Conjunto de Datos

In [3]:
## Download the dataset
# Directory of the raw data files
_data_root = './data/covertype'
# Path to the raw training data
_data_filepath = os.path.join(_data_root, 'covertype_train.csv')

os.makedirs(_data_root, exist_ok=True)
# Only fetch when no local copy exists, so re-running the cell is cheap.
if not os.path.isfile(_data_filepath):
    # See also: https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
    # The URL is built from adjacent string literals, which are joined with no
    # whitespace. A backslash continuation *inside* the quotes would embed the
    # next line's indentation in the URL.
    # NOTE(review): `{{VALUE}}` is sent literally; it looks like an unfilled
    # placeholder for the `confirm` parameter -- confirm whether it is needed.
    url = ('https://docs.google.com/uc?export=download'
           '&confirm={{VALUE}}&id=1lVF1BCWLH4eXXV_YOJzjR7xZjj-wAGj9')
    # Download to a temporary name first: a failed or interrupted transfer must
    # not leave a file at _data_filepath, or the guard above would skip the
    # download on every later run.
    _tmp_filepath = _data_filepath + '.part'
    with requests.get(url, allow_redirects=True, stream=True, timeout=60) as r:
        # Fail loudly on HTTP errors instead of saving the error page as CSV.
        r.raise_for_status()
        with open(_tmp_filepath, 'wb') as f:
            # Stream in 1 MiB chunks rather than holding the whole body in memory.
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    # Publish the finished download under its final name.
    os.replace(_tmp_filepath, _data_filepath)

In [13]:
# Load the downloaded CSV into a DataFrame
data = pd.read_csv(_data_filepath)

In [14]:
# Preview the first rows of the dataset
data.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2991,119,7,67,11,1015,233,234,133,1570,Commanche,C7202,1
1,2876,3,18,485,71,2495,192,202,144,1557,Commanche,C7757,1
2,3171,315,2,277,9,4374,213,237,162,1052,Rawah,C7745,0
3,3087,342,13,190,31,4774,193,221,166,752,Rawah,C7745,0
4,2835,158,10,212,41,3596,231,242,141,3280,Rawah,C4744,1


In [18]:
# Summarize the columns: non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116203 entries, 0 to 116202
Data columns (total 13 columns):
 #   Column                              Non-Null Count   Dtype 
---  ------                              --------------   ----- 
 0   Elevation                           116203 non-null  int64 
 1   Aspect                              116203 non-null  int64 
 2   Slope                               116203 non-null  int64 
 3   Horizontal_Distance_To_Hydrology    116203 non-null  int64 
 4   Vertical_Distance_To_Hydrology      116203 non-null  int64 
 5   Horizontal_Distance_To_Roadways     116203 non-null  int64 
 6   Hillshade_9am                       116203 non-null  int64 
 7   Hillshade_Noon                      116203 non-null  int64 
 8   Hillshade_3pm                       116203 non-null  int64 
 9   Horizontal_Distance_To_Fire_Points  116203 non-null  int64 
 10  Wilderness_Area                     116203 non-null  object
 11  Soil_Type                           116

## Inicializar el repositorio de DVC y añadir los datos

In [4]:
# Initialize a DVC repository in the current working directory
!dvc init

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[0m

In [8]:
# Commit the currently staged changes to git with the message 'Initialize DVC'
!git commit -m 'Initialize DVC'

[main 06c1f44] Initialize DVC
 3 files changed, 6 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore


In [9]:
# Put the data/ directory under DVC tracking
!dvc add data/

[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                       
![A
Collecting files and computing hashes in data         |0.00 [00:00,     ?file/s][A
                                                                                [A
![A
  0% Checking cache in '/work/.dvc/cache/files/md5'| |0/? [00:00<?,    ?files/s][A
                                                                                [A
![A
  0%|          |Adding data to cache                  0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /work/data               0/2 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 24.26file/s][A

To track the changes with git, run:

	git add .gitignore data.dvc

To enable auto staging, run:

	dvc config core.autostage true
[0m

# 3. Selección de características

In [41]:
# Restrict the dataset to its numeric columns
data_numeric_features = data.select_dtypes(include='number')
# Split the target from the predictors ('Cover_Type' is assumed to be the target)
y = data['Cover_Type']
X = data_numeric_features.drop(columns=['Cover_Type'])

# Univariate feature selection: keep the 8 predictors with the highest
# f_classif (ANOVA F-value) score against the target
selector = SelectKBest(score_func=f_classif, k=8)
X_new = selector.fit_transform(X, y)

# Names of the retained predictors, plus a {column: kept?} lookup over all predictors
selected_features = X.columns[selector.get_support()]
features_retain = dict(zip(X.columns, map(bool, selector.get_support())))

In [42]:
# Show which predictors were kept (True) or dropped (False) by the selector
features_retain

{'Elevation': True,
 'Aspect': False,
 'Slope': True,
 'Horizontal_Distance_To_Hydrology': True,
 'Vertical_Distance_To_Hydrology': True,
 'Horizontal_Distance_To_Roadways': True,
 'Hillshade_9am': True,
 'Hillshade_Noon': True,
 'Hillshade_3pm': False,
 'Horizontal_Distance_To_Fire_Points': True}

In [47]:
# Keep only the selected predictor columns together with the target column
_columns_to_keep = [*selected_features, 'Cover_Type']
data_selected_features = data[_columns_to_keep]

In [49]:
# Summarize the reduced dataset: non-null counts and dtypes per column
data_selected_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116203 entries, 0 to 116202
Data columns (total 9 columns):
 #   Column                              Non-Null Count   Dtype
---  ------                              --------------   -----
 0   Elevation                           116203 non-null  int64
 1   Slope                               116203 non-null  int64
 2   Horizontal_Distance_To_Hydrology    116203 non-null  int64
 3   Vertical_Distance_To_Hydrology      116203 non-null  int64
 4   Horizontal_Distance_To_Roadways     116203 non-null  int64
 5   Hillshade_9am                       116203 non-null  int64
 6   Hillshade_Noon                      116203 non-null  int64
 7   Horizontal_Distance_To_Fire_Points  116203 non-null  int64
 8   Cover_Type                          116203 non-null  int64
dtypes: int64(9)
memory usage: 8.0 MB
