In [1]:
import pandas as pd
# Read the train and test splits from their CSV files, in that order
dataset, test = (
    pd.read_csv(csv_path)
    for csv_path in ('datasets/covertype_norm_train.csv',
                     'datasets/covertype_norm_test.csv')
)
# Show the first rows of the training split
dataset.head()

Unnamed: 0,elevation,aspect,slope,horiz_dist_hydro,vert_dist_hydro,horiz_dist_road,hillshade_9,hill_shade_noon,hill_shade_15,horiz_dist_fire,...,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39,cover_type
0,-0.573753,-0.518424,-0.428658,0.436024,-0.475092,-0.979056,0.927864,0.14452,-0.534162,-0.220768,...,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986,3
1,1.656009,-0.010549,0.868502,-0.516497,-0.280544,1.81761,0.862413,0.665801,-0.534162,2.273548,...,-0.214265,4.938531,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986,7
2,0.169501,-0.799569,0.632655,0.45517,1.89191,-0.388051,0.796962,-1.245563,-1.335438,-0.687429,...,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986,5
3,-1.205043,1.268208,1.576043,0.23499,1.648725,-0.649457,-2.933743,-0.15956,1.956291,-0.501856,...,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986,6
4,-1.057345,0.152697,0.986425,0.134472,0.530073,-1.041945,0.404256,1.056762,-0.014415,-0.79477,...,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986,3


In [2]:
# Training split: the 'cover_type' column is the target, every other column a descriptor
targets = dataset['cover_type']
features_data = dataset.drop(labels='cover_type', axis='columns')

# Test split, separated the same way
targets_test = test['cover_type']
features_test_data = test.drop(labels='cover_type', axis='columns')

# Report the shape of each piece
print("Train data size: {}".format(features_data.shape))
print("Train target size: {}".format(targets.shape))
print("Test data size: {}".format(features_test_data.shape))
print("Test target size: {}".format(targets_test.shape))


Train data size: (14421, 54)
Train target size: (14421,)
Test data size: (4808, 54)
Test target size: (4808,)


## PCA

In [3]:
import os

from sklearn.decomposition import PCA

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package; fall back only on old installs that
# still ship the vendored copy.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# joblib.dump / DataFrame.to_csv do not create missing folders
os.makedirs('models', exist_ok=True)
os.makedirs('results', exist_ok=True)

# PCA selection rate: percentage of variance the kept components must explain
rates = [75, 85, 95]
# Run PCA
for rate in rates:
    # A float n_components in (0, 1) combined with svd_solver='full' keeps the
    # smallest number of components whose explained variance exceeds that share
    pca = PCA(n_components=rate / 100, svd_solver='full')
    # Fit model on the training descriptors
    pca.fit(features_data)
    # Dump the fitted model so the same projection can be reloaded later
    joblib.dump(pca, 'models/pca_' + str(rate) + '.save')
    # Transform data
    principal_components = pca.transform(features_data)
    # Make a DataFrame of principal components; reuse the source index so the
    # rows line up with `targets` in the concat below
    result_pca_df = pd.DataFrame(data=principal_components, index=features_data.index)
    # Concatenate with the targets
    final_pca_df = pd.concat([result_pca_df, targets], axis=1)
    # Save the dataset
    print('[INFO] Saving pca_' + str(rate) + '...')
    final_pca_df.to_csv('results/pca_' + str(rate) + '.csv', index=False)

print('[INFO] Done.')

[INFO] Saving pca_75...
[INFO] Saving pca_85...
[INFO] Saving pca_95...
[INFO] Done.


In [4]:
import os

from sklearn.decomposition import PCA

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package; fall back only on old installs that
# still ship the vendored copy.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# DataFrame.to_csv does not create a missing folder
os.makedirs('results', exist_ok=True)

# PCA selection rate: one persisted model is expected per value
rates = [75, 85, 95]
# Run PCA
for rate in rates:
    # Load the persisted PCA model.
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files from a trusted source.
    pca = joblib.load('models/pca_' + str(rate) + '.save')
    # Project the test descriptors with the loaded model (no refit here)
    principal_components = pca.transform(features_test_data)
    # Make a DataFrame of principal components; reuse the source index so the
    # rows line up with `targets_test` in the concat below
    result_pca_df = pd.DataFrame(data=principal_components, index=features_test_data.index)
    # Concatenate with the targets
    final_pca_df = pd.concat([result_pca_df, targets_test], axis=1)
    # Check size
    print('[INFO] PCA ' + str(rate) + ' test shape:' + str(final_pca_df.shape))
    # Save the dataset
    print('[INFO] Saving pca_test_' + str(rate) + '...')
    final_pca_df.to_csv('results/pca_test_' + str(rate) + '.csv', index=False)

print('[INFO] Done.')

[INFO] PCA 75 test shape:(4808, 30)
[INFO] Saving pca_test_75...
[INFO] PCA 85 test shape:(4808, 35)
[INFO] Saving pca_test_85...
[INFO] PCA 95 test shape:(4808, 41)
[INFO] Saving pca_test_95...
[INFO] Done.


## LDA

In [5]:
import os

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package; fall back only on old installs that
# still ship the vendored copy.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# joblib.dump / DataFrame.to_csv do not create missing folders
os.makedirs('models', exist_ok=True)
os.makedirs('results', exist_ok=True)

# Number of components to reduce to: 1 through 6.
# NOTE(review): LDA yields at most n_classes - 1 components, so the upper bound
# presumably reflects 7 cover types -- confirm against the dataset.
rate_components = range(1, 7)

for rate_component in rate_components:
    # Compute n_components
    n_components = rate_component
    # Create LDA instance
    lda = LinearDiscriminantAnalysis(n_components=n_components)
    # Fit LDA (supervised: needs the class labels)
    lda.fit(features_data, targets)
    # Dump the fitted model so the same projection can be reloaded later
    joblib.dump(lda, 'models/lda_' + str(rate_component) + '.save')
    # Project the training descriptors onto the discriminant axes
    lda_data = lda.transform(features_data)
    # Make DataFrame; reuse the source index so the rows line up with
    # `targets` in the concat below
    lda_df = pd.DataFrame(lda_data,
                          index=features_data.index)
    # Include targets
    final_lda_df = pd.concat([lda_df, targets], axis=1)
    # Save DataFrame
    print('[INFO] Saving lda_' + str(rate_component) + '...')
    final_lda_df.to_csv('results/lda_' + str(rate_component) + '.csv', index=False)

print("[INFO] Done.")



[INFO] Saving lda_1...




[INFO] Saving lda_2...




[INFO] Saving lda_3...




[INFO] Saving lda_4...




[INFO] Saving lda_5...




[INFO] Saving lda_6...
[INFO] Done.


In [6]:
import os

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package; fall back only on old installs that
# still ship the vendored copy.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# DataFrame.to_csv does not create a missing folder
os.makedirs('results', exist_ok=True)

# Number of components to reduce to: one persisted model is expected per value
rate_components = range(1, 7)

for rate_component in rate_components:
    # Load the persisted LDA model.
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files from a trusted source.
    lda = joblib.load('models/lda_' + str(rate_component) + '.save')
    # Project the test descriptors with the loaded model (no refit here)
    lda_data = lda.transform(features_test_data)
    # Make DataFrame; reuse the source index so the rows line up with
    # `targets_test` in the concat below
    lda_df = pd.DataFrame(lda_data,
                          index=features_test_data.index)
    # Include targets
    final_lda_df = pd.concat([lda_df, targets_test], axis=1)
    # Check size
    print('[INFO] LDA ' + str(rate_component) + ' test shape:' + str(final_lda_df.shape))
    # Save DataFrame
    print('[INFO] Saving lda_test_' + str(rate_component) + '...')
    final_lda_df.to_csv('results/lda_test_' + str(rate_component) + '.csv', index=False)

print("[INFO] Done.")

[INFO] LDA 1 test shape:(4808, 2)
[INFO] Saving lda_test_1...
[INFO] LDA 2 test shape:(4808, 3)
[INFO] Saving lda_test_2...
[INFO] LDA 3 test shape:(4808, 4)
[INFO] Saving lda_test_3...
[INFO] LDA 4 test shape:(4808, 5)
[INFO] Saving lda_test_4...
[INFO] LDA 5 test shape:(4808, 6)
[INFO] Saving lda_test_5...
[INFO] LDA 6 test shape:(4808, 7)
[INFO] Saving lda_test_6...
[INFO] Done.
