In [1]:
import pandas as pd

# Path of the training CSV analysed in this notebook
train_csv_path = 'datasets/covertype_norm_train.csv'
# Read the whole table into a DataFrame
dataset = pd.read_csv(train_csv_path)
# Show the first rows as the cell's output
dataset.head()

Unnamed: 0,elevation,aspect,slope,horiz_dist_hydro,vert_dist_hydro,horiz_dist_road,hillshade_9,hill_shade_noon,hill_shade_15,horiz_dist_fire,...,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39,cover_type
0,-1.929805,1.477831,1.116461,-0.948019,-0.487945,-1.219708,-2.28235,-0.643397,1.322887,-0.934126,...,-0.214972,-0.205073,-0.04331,-0.079247,-0.014425,-0.041672,-0.224338,-0.218305,-0.174891,3
1,1.644997,1.640937,-0.184168,1.692577,0.475531,0.161461,-0.87414,-0.07569,0.761317,-0.947689,...,-0.214972,-0.205073,-0.04331,-0.079247,-0.014425,-0.041672,-0.224338,4.580746,-0.174891,7
2,1.404774,-0.642539,0.998222,-0.182629,-0.096023,0.350281,1.123554,-1.298443,-1.679356,0.476446,...,-0.214972,-0.205073,-0.04331,-0.079247,-0.014425,-0.041672,-0.224338,4.580746,-0.174891,7
3,-0.412357,1.668121,-0.302407,-1.091529,-0.830878,-0.281626,-0.710395,0.01165,0.69652,3.189085,...,-0.214972,-0.205073,-0.04331,-0.079247,-0.014425,-0.041672,-0.224338,-0.218305,-0.174891,2
4,0.429612,1.713428,-1.248319,-0.316572,-0.504275,2.519079,-0.120911,0.535686,0.545328,0.169918,...,-0.214972,-0.205073,-0.04331,-0.079247,-0.014425,-0.041672,-0.224338,-0.218305,-0.174891,5


In [2]:
# Separate descriptors (data) from targets.
# Class labels live in the 'cover_type' column.
targets = dataset['cover_type']
# Besides the target, drop 'Unnamed: 0'. pandas assigns that name to a
# header-less first CSV column, which is what a DataFrame index saved by
# to_csv() looks like when it is read back. It is a row identifier, not a
# descriptor, and must not be fed to PCA/LDA as a feature.
# errors='ignore' keeps the cell working if the CSV does not contain it.
non_feature_columns = ['cover_type', 'Unnamed: 0']
features_data = dataset.drop(columns=non_feature_columns, errors='ignore')
# Print info
print("Data size: " + str(features_data.shape))
print("Target size: " + str(targets.shape))

Data size: (14421, 54)
Target size: (14421,)


## PCA

In [4]:
from sklearn.decomposition import PCA

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23. Prefer the standalone `joblib` package (a scikit-learn dependency)
# and fall back to the legacy location only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Share of the total variance (in percent) that each PCA must retain
rates = [75, 85, 95]
# Run PCA once per retention rate
for rate in rates:
    # Common prefix for the model file, the result file and the log line
    run_name = 'pca_' + str(rate)
    # A float n_components in (0, 1) tells PCA to keep the smallest number of
    # components whose cumulative explained variance reaches that fraction
    pca = PCA(n_components=rate / 100)
    # Fit model
    pca.fit(features_data)
    # Persist the fitted model so the same projection can be reused later
    joblib.dump(pca, 'models/' + run_name + '.save')
    # Project the data onto the retained components
    principal_components = pca.transform(features_data)
    # Make a DataFrame of principal components; reuse the original index so
    # that the rows line up with `targets` in the concat below
    result_pca_df = pd.DataFrame(data=principal_components, index=features_data.index)
    # Concatenate with the targets (column-wise)
    final_pca_df = pd.concat([result_pca_df, targets], axis=1)
    # Save the dataset
    print('[INFO] Saving ' + run_name + '...')
    final_pca_df.to_csv('results/' + run_name + '.csv')

print('[INFO] Done.')

[INFO] Saving pca_75...
[INFO] Saving pca_85...
[INFO] Saving pca_95...
[INFO] Done.


## LDA

In [12]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23. Prefer the standalone `joblib` package (a scikit-learn dependency)
# and fall back to the legacy location only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Share of the original feature count (in percent) requested from LDA
rate_components = [25, 50, 75]

# LDA yields at most min(n_features, n_classes - 1) discriminant axes.
# Older scikit-learn releases silently clamp a larger request; 0.23+ raises
# ValueError. Clamp explicitly so the cell runs everywhere.
max_components = min(features_data.shape[1], targets.nunique() - 1)

for rate_component in rate_components:
    # Common prefix for the model file, the result file and the log lines
    run_name = 'lda_' + str(rate_component)
    # Compute the requested n_components, then cap it at the LDA limit
    requested_components = round(rate_component / 100 * features_data.shape[1])
    n_components = min(requested_components, max_components)
    if n_components < requested_components:
        # Make the cap visible: runs that hit it produce identical projections
        print('[WARN] ' + run_name + ': requested ' + str(requested_components)
              + ' components, using ' + str(n_components)
              + ' (LDA limit: min(n_features, n_classes - 1))')
    # Create LDA instance
    lda = LinearDiscriminantAnalysis(n_components=n_components)
    # Fit LDA (supervised: needs the class labels)
    lda.fit(features_data, targets)
    # Dump the fitted LDA model. The original dumped `pca`, a leftover
    # variable from the PCA cell, so the saved file held the wrong model.
    joblib.dump(lda, 'models/' + run_name + '.save')
    # Project the data onto the discriminant axes
    lda_data = lda.transform(features_data)
    # Make DataFrame; reuse the original index so rows line up with `targets`
    lda_df = pd.DataFrame(lda_data,
                          index=features_data.index)
    # Include targets
    final_lda_df = pd.concat([lda_df, targets], axis=1)
    # Save DataFrame
    print('[INFO] Saving ' + run_name + '...')
    final_lda_df.to_csv('results/' + run_name + '.csv')

print("[INFO] Done.")



[INFO] Saving lda_25...
[INFO] Saving lda_50...
[INFO] Saving lda_75...
[INFO] Done.
