In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pysptools.noise as noise

In [2]:

# Both .npy files store a pickled dict wrapped in a 0-d object array, hence `.item()`.
# allow_pickle=True is required on numpy >= 1.16.3, where object arrays are no longer
# unpickled by default.
# NOTE(security): unpickling can execute arbitrary code -- only load files you produced yourself.

# Dict obtained by taking the n purest pixels.
purest_pixels_dict = np.load('./purest_pixels_dict.npy', allow_pickle=True).item()

# Dict obtained by subtracting the background out of the signal.
pure_signal_dict = np.load('./pure_signal_dict.npy', allow_pickle=True).item()

In [3]:
#http://evols.library.manoa.hawaii.edu/bitstream/handle/10524/35872/vol3-Poop-Que(LO).pdf#page=11
#Olivenza: took the average for LL chondrites
#'H' chondrites contain about 25-30% total iron (with over half in metallic form => strongly magnetic)
#'L' chondrites contain 20-25% iron (with 5-10% in the uncombined metallic state)
#'LL' chondrites contain 19-22% iron (with only 0.3-3% metallic iron)
# Classification string recorded for each specimen, keyed by specimen name.
CLASSIFICATION_PER_SPECIMEN = {
    'Abee': 'EH4',
    'Acapulco': 'Acapulcoite',
    'Allende': 'CV3',
    'Brownsfield': 'H3.7',
    'Estacado': 'H6',
    'Estacado2': 'H6',
    'Gibeon': 'IronIva',
    'Hessle': 'H5',
    'Holbrook': 'L/LL6',
    'Homestead': 'L5',
    'Homestead2': 'L5',
    'Millbilille': 'Eucrite-mmict',
    'Olivenza': 'LL5',
    'Peekshill': 'H6',
    'PutnamCounty': 'IronIva',
    'Soku': 'LL4',
    'Steinbach1': 'IronIva',
    'Steinbach2': 'IronIva',
    'Sutton': 'H5',
    'Toluca1': 'IronIAB-sLL',
    'Toluca2': 'IronIAB-sLL',
    'Toluca3': 'IronIAB-sLL',
    'TolucaBig': 'IronIAB-sLL',
}

# Mixed table: a number where one is recorded (the same numbers appear in
# IRON_PERCENTAGE_IF_AVAILABLE), otherwise a category label string.
IRON_CATEGORY_PER_SPECIMEN = {
    'Abee': 32.52,
    'Acapulco': 27.5,
    'Allende': 23.85,
    'Brownsfield': 'H',
    'Estacado': 27.88,
    'Estacado2': 27.88,
    'Gibeon': 91.8,
    'Hessle': 'H',
    'Holbrook': 'L/LL',
    'Homestead': 'L',
    'Homestead2': 'L',
    'Millbilille': 'L',
    'Olivenza': 'LL',
    'Peekshill': 'H',
    'PutnamCounty': 91.57,
    'Soku': 'LL',
    'Steinbach1': 'HH',
    'Steinbach2': 'HH',
    'Sutton': 'H',
    'Toluca1': 91,
    'Toluca2': 91,
    'Toluca3': 91,
    'TolucaBig': 91,
}

# Iron category label for every specimen, drawn from 'HH', 'H', 'L', 'L/LL' and 'LL'.
# Specimens whose classification is an iron (IronIva / IronIAB-sLL) carry 'HH'.
IRON_ALL_CATEGORIZED = {
    'Abee': 'H',
    'Acapulco': 'H',
    'Allende': 'L',
    'Brownsfield': 'H',
    'Estacado': 'H',
    'Estacado2': 'H',
    'Gibeon': 'HH',
    'Hessle': 'H',
    'Holbrook': 'L/LL',
    'Homestead': 'L',
    'Homestead2': 'L',
    'Millbilille': 'L',
    'Olivenza': 'LL',
    'Peekshill': 'H',
    # Was 'H'; PutnamCounty is an IronIva with 91.57 recorded as its iron value,
    # the same group as Gibeon, so it belongs with the other irons.
    'PutnamCounty': 'HH',
    'Soku': 'LL',
    'Steinbach1': 'HH',
    'Steinbach2': 'HH',
    'Sutton': 'H',
    'Toluca1': 'HH',
    'Toluca2': 'HH',
    'Toluca3': 'HH',
    'TolucaBig': 'HH',
}

# Two-way label for every specimen: 'HH' for the irons, 'L' for everything else.
IRON_SIMPLE_CATEGORIZED = {
    'Abee': 'L',
    'Acapulco': 'L',
    'Allende': 'L',
    'Brownsfield': 'L',
    'Estacado': 'L',
    'Estacado2': 'L',
    'Gibeon': 'HH',
    'Hessle': 'L',
    'Holbrook': 'L',
    'Homestead': 'L',
    'Homestead2': 'L',
    'Millbilille': 'L',
    'Olivenza': 'L',
    'Peekshill': 'L',
    # Was 'L'; PutnamCounty is an IronIva like Gibeon, so it is grouped with the irons.
    'PutnamCounty': 'HH',
    'Soku': 'L',
    'Steinbach1': 'HH',
    'Steinbach2': 'HH',
    # Was '' (empty label); Sutton is an H5 chondrite and the other H chondrites are 'L' here.
    'Sutton': 'L',
    'Toluca1': 'HH',
    'Toluca2': 'HH',
    'Toluca3': 'HH',
    'TolucaBig': 'HH',
}

# Numeric iron value for the specimens that have one; specimens without a value are absent.
IRON_PERCENTAGE_IF_AVAILABLE = {
    'Abee': 32.52,
    'Acapulco': 27.5,
    'Allende': 23.85,
    'Estacado': 27.88,
    'Estacado2': 27.88,
    'Gibeon': 91.8,
    'PutnamCounty': 91.57,
    'Toluca1': 91,
    'Toluca2': 91,
    'Toluca3': 91,
    'TolucaBig': 91,
}
# Matplotlib colour name used for each specimen in the per-specimen scatter plot.
# Every specimen has its own colour name so that no two specimens are drawn identically.
COLORS = {
    'Abee': 'darkslateblue',
    'Acapulco': 'green',
    'Allende': 'blue',
    'Brownsfield': 'yellow',
    'Estacado': 'purple',
    'Estacado2': 'brown',
    'Gibeon': 'black',
    'Hessle': 'lime',
    'Holbrook': 'orange',
    'Homestead': 'grey',
    'Homestead2': 'lightgreen',
    'Millbilille': 'lightcoral',
    'Olivenza': 'c',
    # Was 'cyan', which duplicated Toluca1 and is essentially the same as Olivenza's 'c'.
    'Peekshill': 'teal',
    'PutnamCounty': 'pink',
    'Soku': 'silver',
    'Steinbach1': 'maroon',
    'Steinbach2': 'fuchsia',
    'Sutton': 'lawngreen',
    # Was 'cyan' (same colour as Peekshill).
    'Toluca1': 'tan',
    # Was 'ivory', which is nearly invisible on a white plot background.
    'Toluca2': 'navy',
    'Toluca3': 'olive',
    'TolucaBig': 'red',
}


In [5]:
# Flatten the per-specimen arrays of pure_signal_dict into one list of rows
# (all_data) plus three parallel label lists, one entry per row.
simplified_classes = ['HH','L']
standard_classes = ['HH','H','L','L/LL','LL']

all_data = []                     # one measurement row per entry
all_data_simplified_classes = []  # IRON_SIMPLE_CATEGORIZED label of the row's specimen
all_data_standard_classes = []    # IRON_ALL_CATEGORIZED label of the row's specimen
all_data_meteorite_classes = []   # name of the specimen the row came from

for sample in pure_signal_dict:
    # TolucaBig is excluded from the per-row dataset (the reason is not recorded here).
    if sample == 'TolucaBig':
        continue
    for row in pure_signal_dict[sample]:
        all_data.append(row)
        all_data_simplified_classes.append(IRON_SIMPLE_CATEGORIZED[sample])
        all_data_standard_classes.append(IRON_ALL_CATEGORIZED[sample])
        all_data_meteorite_classes.append(sample)

# print(...) with a single argument produces the same text on Python 2 and Python 3.
print(np.shape(all_data))
print(np.shape(all_data_simplified_classes))

(210, 6)
(210,)


In [6]:
# One mean row per specimen (mean over axis 0 of that specimen's array).
# Every key of pure_signal_dict is included here; nothing is skipped.
all_data_mean_classes = list(pure_signal_dict)
all_data_mean = [np.mean(pure_signal_dict[sample], axis=0) for sample in all_data_mean_classes]

# print(...) with a single argument produces the same text on Python 2 and Python 3.
print(np.shape(all_data_mean))
    
    

(23, 6)


In [7]:
import itertools
# Six band values, presumably the centre wavelengths (nm) of the six spectral
# channels -- TODO confirm. NOTE(review): not used by any executable code visible here.
band = [610,680,730,760,810,860]

def add_many_variables(spectrum):
    """Extend a spectrum with pairwise absolute differences and pairwise ratios.

    Every unordered pair ``(a, b)`` of entries of ``spectrum`` is taken in the
    order produced by ``itertools.combinations(spectrum, 2)``; for each pair the
    features ``abs(b - a)`` and ``b / a`` are computed.

    Parameters
    ----------
    spectrum : 1-D sequence of numbers
        The original measurements (six values per sample in this notebook).

    Returns
    -------
    numpy.ndarray
        ``spectrum``, then all pairwise differences, then all pairwise ratios.
        An input of length ``n`` yields ``n * n`` values (36 for ``n == 6``).

    Raises
    ------
    ZeroDivisionError
        If the first element of any pair is zero.

    Notes
    -----
    Other feature sets that were tried in place of the current return value:
    the raw spectrum alone; the spectrum plus differences; the ratios alone;
    entries 0, 1 and 5 only (chosen from the expected iron changes). Sums and
    per-pair slopes were planned but never implemented.
    """
    pairs = list(itertools.combinations(spectrum, 2))

    differences = [abs(b - a) for (a, b) in pairs]
    ratios = [float(b) / float(a) for (a, b) in pairs]

    return np.concatenate((spectrum, differences, ratios))
#NOTE: tried a bunch of feature combinations. Just using the spectrum itself seemed to work best, though it would be
#nice in particular to pay attention to the ratio results, since ratios help get rid of any noise shared across all channels

In [8]:
# Engineered feature rows, filled by build_mega_feature_array below.
mega_feature_array = []

def build_mega_feature_array(original_dataset):
    """Append the engineered features of every sample to ``mega_feature_array``.

    Each element of ``original_dataset`` is passed to ``add_many_variables`` and
    the result is appended to the module-level list ``mega_feature_array``.
    The list is not cleared here, so calling this function again without
    resetting the list accumulates further rows. Returns None.
    """
    mega_feature_array.extend(add_many_variables(sample) for sample in original_dataset)

build_mega_feature_array(all_data)

# print(...) with a single argument produces the same text on Python 2 and Python 3.
print(np.shape(all_data))
print(np.shape(mega_feature_array))


(210, 6)
(210, 36)


In [9]:
from sklearn.decomposition import PCA as sklearnPCA
# Reduce the engineered feature rows to two principal components.
sklearn_pca = sklearnPCA(n_components=2)
# print(...) with a single argument produces the same text on Python 2 and Python 3.
print(np.shape(mega_feature_array))
data_r = sklearn_pca.fit_transform(mega_feature_array)
print(np.shape(data_r))
# Share of variance captured by each kept component, then the component loadings.
print(sklearn_pca.explained_variance_ratio_)
print(sklearn_pca.components_)

 
 

(210, 36)
(210, 2)
[ 0.70477734  0.16011543]
[[-0.2825636  -0.16694153 -0.26128759 -0.26361607 -0.25820146 -0.2715704
   0.08316441 -0.03216278 -0.02873968 -0.03085958 -0.02125287  0.08235836
   0.09373624  0.08355293  0.09880148 -0.00092847 -0.01661171 -0.00885368
  -0.01674159 -0.02227843  0.00322328  0.49793035 -0.00728724 -0.07665931
  -0.04758264 -0.14298851 -0.22207525 -0.24275334 -0.23173032 -0.27000519
  -0.07770653 -0.04694213 -0.15401308  0.04244999 -0.09926519 -0.12421777]
 [-0.17738475 -0.08791792 -0.21119332 -0.17878777 -0.0851785   0.01040101
   0.02942258  0.00117845 -0.00778032 -0.07252975 -0.11912745  0.09506114
   0.08600737 -0.00973696 -0.09000301 -0.01923371 -0.04221744 -0.08247341
   0.03384668  0.00910202 -0.02683339  0.0090798  -0.08267549 -0.01123034
   0.15301749  0.36163176 -0.15586922 -0.11070578  0.02622985  0.17024926
   0.06660319  0.25028132  0.47247539  0.19400612  0.45329252  0.24511387]]


In [10]:
colors = {}
# Scatter the two PCA coordinates of every row, coloured by its iron category.
# Rows whose label is not in the table are not drawn.
class_to_color = {
    'HH': 'red',
    'H': 'green',
    'L': 'blue',
    'L/LL': 'yellow',
    'LL': 'purple',
}
for idx, point in enumerate(data_r):
    label = all_data_standard_classes[idx]
    if label in class_to_color:
        plt.scatter(point[0], point[1], color=class_to_color[label])
#plt.xlim(-2,0)
plt.show()
#plt.savefig('../results/PCA_5_cat')
 
     

In [11]:
colors = {}
# Same scatter as the five-category plot, but rows labelled 'H' are left out:
# only labels present in the table are drawn.
class_to_color = {
    'HH': 'red',
    'L': 'blue',
    'L/LL': 'yellow',
    'LL': 'purple',
}
for idx, point in enumerate(data_r):
    label = all_data_standard_classes[idx]
    if label in class_to_color:
        plt.scatter(point[0], point[1], color=class_to_color[label])
#plt.xlim(-2,1)
plt.show()
#plt.savefig('../results/PCA_4_cat')

In [None]:
colors = {}
# 'HH' rows in red against the three L-type labels in blue; rows labelled 'H'
# (or anything else not in the table) are not drawn.
class_to_color = {
    'L': 'blue',
    'L/LL': 'blue',
    'LL': 'blue',
    'HH': 'red',
}
for idx, point in enumerate(data_r):
    label = all_data_standard_classes[idx]
    if label in class_to_color:
        plt.scatter(point[0], point[1], color=class_to_color[label])
#plt.xlim(-3,0)
plt.show()
#plt.savefig('../results/PCA_HHv3Ls')
 

In [40]:
# Scatter the two PCA coordinates of every row, coloured by the specimen it came from.
for idx, point in enumerate(data_r):
    specimen = all_data_meteorite_classes[idx]
    plt.scatter(point[0], point[1], color=COLORS[specimen])
plt.show()
#plt.savefig('../results/PCA_meteorite')

In [36]:
#Next: MNF
#Then: MDA

MNF = noise.MNF()
#data = np.reshape(all_data,(10,21,6))
print (mega_feature_array[0])
# The feature rows are wrapped in one extra leading axis before being handed to
# MNF.apply -- presumably because pysptools expects a 3-D cube; TODO confirm.
result = MNF.apply(np.array([mega_feature_array]))
print(result[0])
#print np.shape(result)
#print np.shape(MNF.get_components(2))
# Flatten the first two MNF components to one 2-column row per sample. -1 lets numpy
# infer the row count instead of hard-coding the current dataset size (210).
np.reshape(MNF.get_components(2),(-1,2))

#MNF.display_components(n_first=1)

[ 0.62782775  0.72484072  0.55037799  0.58893332  0.60340918  0.5545497
  0.09701297  0.07744976  0.03889443  0.02441856  0.07327805  0.17446272
  0.1359074   0.12143153  0.17029102  0.03855533  0.05303119  0.00417171
  0.01447587  0.03438362  0.04885949  1.15452163  0.87663853  0.9380492
  0.96110627  0.88328319  0.75930888  0.81250033  0.83247143  0.76506422
  1.07005245  1.09635413  1.00757971  1.02457981  0.94161712  0.9190276 ]
[[  2.88205714e+02  -4.94060282e+01   5.20976768e+01 ...,  -6.50724605e-03
   -2.26720800e-02  -1.63165500e-02]
 [  2.80503822e+02  -1.78668059e+02   1.66664221e+01 ...,  -1.69490128e-02
   -1.87429195e-02  -1.52587512e-02]
 [  2.91118664e+02  -2.60388204e+02  -1.20729824e+01 ...,  -2.09813820e-02
    8.82380439e-03   1.44782537e-02]
 ..., 
 [  2.39653428e+02  -1.65445924e+02   1.29260450e+02 ...,  -2.29214794e-02
   -7.98279378e-03   1.80802623e-02]
 [  2.27841783e+02  -5.52339667e+01  -1.25731133e+02 ...,  -8.95776674e-03
   -3.34333693e-03   8.43251761e-

array([[  2.88205714e+02,  -4.94060282e+01],
       [  2.80503822e+02,  -1.78668059e+02],
       [  2.91118664e+02,  -2.60388204e+02],
       [ -7.83956514e+02,  -1.48850386e+02],
       [ -7.75364676e+02,  -1.83860086e+02],
       [ -7.72743952e+02,  -1.82555369e+02],
       [ -8.02633227e+02,  -1.74305096e+02],
       [ -8.05849281e+02,  -1.76712222e+02],
       [ -4.27424965e+02,  -1.56019076e+02],
       [ -4.31385794e+02,  -7.16495944e+01],
       [ -4.20558410e+02,  -5.57022629e+01],
       [ -3.29433369e+02,   2.39621087e+02],
       [ -3.04234510e+02,   2.30730870e+02],
       [ -3.13294112e+02,   2.09732707e+02],
       [ -3.24042146e+02,   1.49894457e+02],
       [ -3.21556603e+02,   1.02984813e+02],
       [ -3.25720260e+02,   1.03774164e+02],
       [ -3.05042751e+02,  -3.44523081e+01],
       [ -3.08178492e+02,   7.65020572e+01],
       [ -3.55317164e+02,   1.10561189e+02],
       [ -3.28419547e+02,   1.17428837e+02],
       [ -8.79459056e+02,  -2.02636234e+02],
       [ -