In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_regression, SelectKBest


import seaborn as sns
import os

from scipy import stats

import datetime

%matplotlib inline

In [38]:
# Load UFO_cleaned.csv into a DataFrame; row 0 of the file supplies the
# column names and fields are comma-separated.
data = pd.read_csv(
    'UFO_cleaned.csv',
    sep=',',
    header=0,
)

In [39]:
# Partition the column names by dtype: anything stored as 'object' is treated
# as categorical, everything else as numerical. Column order is preserved.
numerical = [col for col, dtype in data.dtypes.items() if dtype != 'object']
categorical = [col for col, dtype in data.dtypes.items() if dtype == 'object']

In [40]:
# Display the list of object-dtype column names.
categorical

['timestamp',
 'shape',
 'notes',
 'date_reported',
 'date_event',
 'icon',
 'precipType',
 'summary',
 'ast_orbiting_body']

In [41]:
# Remove columns that are not carried into the later steps.
# NOTE(review): 'Unnamed: 0' is presumably a row index written out by an
# earlier to_csv call - confirm against the notebook that produced the file.
data = data.drop(
    columns=[
        'Unnamed: 0',
        'notes',
        'date_event',
        'date_reported',
        'summary',
        'ast_is_dangerous',
        'icon',
    ]
)

In [42]:
# Parse the timestamp strings, keep only the calendar month as a new 'month'
# column (appended last), and discard the raw timestamp column.
# This cell is not re-runnable on its own: a second run fails because
# 'timestamp' has already been dropped.
data = (
    data
    .assign(month=pd.to_datetime(data['timestamp']).dt.month)
    .drop(columns='timestamp')
)
data

Unnamed: 0,shape,duration_seconds,lat,lng,cloudCover,humidity,precipIntensity,precipProbability,precipType,pressure,temperature,visibility,windBearing,windSpeed,ast_estimated_diameter,ast_miss_distance,ast_orbiting_body,ast_relative_velocity,month
0,cylinder,2700.0,29.883056,-97.941111,0.00,0.730000,0.0,0.0,,1009.050000,25.860000,16.09,154.0,4.89,0.297879,42621696.0,Earth,13778.372043,10
1,light,7200.0,29.384210,-98.581082,0.00,0.770000,0.0,0.0,,1008.810000,26.120000,16.09,135.0,6.60,0.297879,42621696.0,Earth,13778.372043,10
2,circle,20.0,53.200000,-2.916667,0.75,0.840000,0.0,0.0,rain,1019.000000,15.560000,2.90,190.0,2.24,0.297879,42621696.0,Earth,13778.372043,10
3,circle,20.0,28.978333,-96.645833,0.12,0.710000,0.0,0.0,rain,1020.640000,22.680000,16.09,136.0,2.75,0.297879,42621696.0,Earth,13778.372043,10
4,light,900.0,21.418056,-157.803611,0.63,0.770000,0.0,0.0,rain,1015.330000,25.740000,16.09,80.0,3.60,0.297879,42621696.0,Earth,13778.372043,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78781,light,300.0,40.693611,-75.190556,0.00,0.660547,0.0,0.0,,1016.819541,13.496951,16.09,190.0,2.24,0.056760,3610758.0,Earth,32868.357670,9
78782,fireball,300.0,47.483056,-122.215833,0.00,0.660547,0.0,0.0,,1016.819541,13.496951,16.09,190.0,2.24,0.056760,3610758.0,Earth,32868.357670,9
78783,circle,10.0,38.232500,-122.635556,0.00,0.660547,0.0,0.0,,1016.819541,13.496951,16.09,190.0,2.24,0.056760,3610758.0,Earth,32868.357670,9
78784,unknown,180.0,40.499167,-74.399444,0.00,0.660547,0.0,0.0,,1016.819541,13.496951,16.09,190.0,2.24,0.056760,3610758.0,Earth,32868.357670,9


In [47]:
# List the distinct values of the 'shape' column in order of first appearance.
# NOTE(review): the recorded output contains both 'changing' and 'changed',
# and both 'circle' and 'round' - possibly duplicate categories; confirm
# whether they should be merged before encoding.
pd.unique(data['shape'])

array(['cylinder', 'light', 'circle', 'sphere', 'disk', 'fireball',
       'unknown', 'oval', 'other', 'cigar', 'rectangle', 'chevron',
       'triangle', 'formation', 'delta', 'changing', 'egg', 'diamond',
       'flash', 'teardrop', 'cone', 'cross', 'pyramid', 'round',
       'crescent', 'flare', 'hexagon', 'changed'], dtype=object)

In [44]:
# One-hot encode the remaining non-numeric columns, dropping the first level
# of each to avoid a redundant indicator.
#
# dtype is pinned to uint8 so the indicators are 0/1 integers regardless of
# the installed pandas version: pandas < 2.0 defaulted to uint8, while
# pandas >= 2.0 defaults to bool, which would change both the displayed frame
# and the values written to the CSV in the next cell (True/False, not 0/1).
#
# NOTE(review): rows whose 'precipType' is NaN get all-zero precipType_*
# indicators, which makes them indistinguishable from the dropped baseline
# level - confirm this is intended, or consider dummy_na=True.
data_dummies = pd.get_dummies(data, drop_first=True, dtype=np.uint8)

In [45]:
# Persist the encoded frame. The row index is written as well (index=True is
# the pandas default), so the file gains a leading column with an empty header.
data_dummies.to_csv('UFO_with_dummies.csv', index=True)

In [46]:
# Display the encoded frame to inspect the generated indicator columns.
data_dummies

Unnamed: 0,duration_seconds,lat,lng,cloudCover,humidity,precipIntensity,precipProbability,pressure,temperature,visibility,...,shape_oval,shape_pyramid,shape_rectangle,shape_round,shape_sphere,shape_teardrop,shape_triangle,shape_unknown,precipType_rain,precipType_snow
0,2700.0,29.883056,-97.941111,0.00,0.730000,0.0,0.0,1009.050000,25.860000,16.09,...,0,0,0,0,0,0,0,0,0,0
1,7200.0,29.384210,-98.581082,0.00,0.770000,0.0,0.0,1008.810000,26.120000,16.09,...,0,0,0,0,0,0,0,0,0,0
2,20.0,53.200000,-2.916667,0.75,0.840000,0.0,0.0,1019.000000,15.560000,2.90,...,0,0,0,0,0,0,0,0,1,0
3,20.0,28.978333,-96.645833,0.12,0.710000,0.0,0.0,1020.640000,22.680000,16.09,...,0,0,0,0,0,0,0,0,1,0
4,900.0,21.418056,-157.803611,0.63,0.770000,0.0,0.0,1015.330000,25.740000,16.09,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78781,300.0,40.693611,-75.190556,0.00,0.660547,0.0,0.0,1016.819541,13.496951,16.09,...,0,0,0,0,0,0,0,0,0,0
78782,300.0,47.483056,-122.215833,0.00,0.660547,0.0,0.0,1016.819541,13.496951,16.09,...,0,0,0,0,0,0,0,0,0,0
78783,10.0,38.232500,-122.635556,0.00,0.660547,0.0,0.0,1016.819541,13.496951,16.09,...,0,0,0,0,0,0,0,0,0,0
78784,180.0,40.499167,-74.399444,0.00,0.660547,0.0,0.0,1016.819541,13.496951,16.09,...,0,0,0,0,0,0,0,1,0,0


In [43]:
# Persist the non-encoded frame (categorical columns still as strings).
# The row index is written as well (index=True is the pandas default), so the
# file gains a leading column with an empty header.
data.to_csv('UFO_for_PCA.csv', index=True)