# UFO sightings

## Data

The original data are reports of 80+k UFO sightings from more than 50 years, originally obtained from [here](https://github.com/planetsig/ufo-reports). We are already familiar with this data because we used it to practice different data preparation and dimensionality reduction techniques.

### Final data

In this exercise, we should use dataset from the earlier dimensionality reduction exercise in Week 4 Day 1. If you didn't export the table back then, feel free to take our pre-prepared dataset from [here](https://drive.google.com/file/d/1Q0gj7_DK2Sz-se8hf5-luu2GESDCsndb/view?usp=sharing).

In [603]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import GridSearchCV
import seaborn as sns

import statsmodels.api as sm
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [604]:
data = pd.read_csv('df_prepared.csv', header = 0)
data['ast_dangerous'] = np.where((data['ast_is_dangerous']==True) | (data['ast_is_dangerous']=='True'),int(1),data['ast_is_dangerous'])
data['ast_dangerous'] = np.where((data['ast_is_dangerous']==False) | (data['ast_is_dangerous']=='False'),int(0),data['ast_dangerous'])
data['ast_dangerous'] = np.where((data['ast_is_dangerous']=='0.0'),int(0),data['ast_dangerous'])
data['ast_is_dangerous'] = (data['ast_dangerous']).astype(int)
data.drop('ast_dangerous',axis = 1, inplace = True)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [698]:
data = pd.read_csv('UFO_for_PCA.csv', header = 0)

In [699]:
data['precip'] = np.where(data['precipType']=='None',0,1)

In [700]:
shape_1 = ['light']
shape_2 = ['cylinder','cigar','rectangle', 'chevron','formation', 'delta', 'changing', 'egg', 'diamond','flash', 'teardrop', 'cone', 'cross', 'pyramid', 'round','crescent', 'flare', 'hexagon', 'changed']
shape_3 = ['circle','fireball','triangle']
shape_4 = ['sphere', 'disk','unknown', 'oval', 'other']

In [701]:
data['shapes'] = np.where(np.isin(data['shape'],shape_1),0,data['shape'])
data['shapes'] = np.where(np.isin(data['shape'],shape_2),1,data['shapes'])
data['shapes'] = np.where(np.isin(data['shape'],shape_3),2,data['shapes'])
data['shapes'] = np.where(np.isin(data['shape'],shape_4),3,data['shapes'])

In [702]:
data['shape'] = data['shapes'].astype(int)

In [703]:
data.drop(['Unnamed: 0','ast_orbiting_body','shapes','precipType'], axis =1, inplace = True)

In [706]:
data

Unnamed: 0,shape,duration_seconds,lat,lng,cloudCover,humidity,precipIntensity,precipProbability,pressure,temperature,visibility,windBearing,windSpeed,ast_estimated_diameter,ast_miss_distance,ast_relative_velocity,month,precip
0,1,2700.0,29.883056,-97.941111,0.00,0.730000,0.0,0.0,1009.050000,25.860000,16.09,154.0,4.89,0.297879,42621696.0,13778.372043,10,0
1,0,7200.0,29.384210,-98.581082,0.00,0.770000,0.0,0.0,1008.810000,26.120000,16.09,135.0,6.60,0.297879,42621696.0,13778.372043,10,0
2,2,20.0,53.200000,-2.916667,0.75,0.840000,0.0,0.0,1019.000000,15.560000,2.90,190.0,2.24,0.297879,42621696.0,13778.372043,10,1
3,2,20.0,28.978333,-96.645833,0.12,0.710000,0.0,0.0,1020.640000,22.680000,16.09,136.0,2.75,0.297879,42621696.0,13778.372043,10,1
4,0,900.0,21.418056,-157.803611,0.63,0.770000,0.0,0.0,1015.330000,25.740000,16.09,80.0,3.60,0.297879,42621696.0,13778.372043,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78781,0,300.0,40.693611,-75.190556,0.00,0.660547,0.0,0.0,1016.819541,13.496951,16.09,190.0,2.24,0.056760,3610758.0,32868.357670,9,0
78782,2,300.0,47.483056,-122.215833,0.00,0.660547,0.0,0.0,1016.819541,13.496951,16.09,190.0,2.24,0.056760,3610758.0,32868.357670,9,0
78783,2,10.0,38.232500,-122.635556,0.00,0.660547,0.0,0.0,1016.819541,13.496951,16.09,190.0,2.24,0.056760,3610758.0,32868.357670,9,0
78784,3,180.0,40.499167,-74.399444,0.00,0.660547,0.0,0.0,1016.819541,13.496951,16.09,190.0,2.24,0.056760,3610758.0,32868.357670,9,0


### Regression Task
 
Predict the **duration_in_seconds** of the UFO sighting given the predictors in the dataset.
- Use Lasso and Ridge regression and find optimal **alpha** using GridSearch method.

### PCA only

In [611]:
#features = data[['pca_1','pca_2','pca_3','pca_4']]
#targets = data['duration_seconds']

### All independent variables

In [801]:
features = data.drop('duration_seconds', axis = 1)
targets = data['duration_seconds']

In [804]:
data.head()

Unnamed: 0,shape,duration_seconds,lat,lng,cloudCover,humidity,precipIntensity,precipProbability,pressure,temperature,visibility,windBearing,windSpeed,ast_estimated_diameter,ast_miss_distance,ast_relative_velocity,month,precip
0,1,2700.0,29.883056,-97.941111,0.0,0.73,0.0,0.0,1009.05,25.86,16.09,154.0,4.89,0.297879,42621696.0,13778.372043,10,0
1,0,7200.0,29.38421,-98.581082,0.0,0.77,0.0,0.0,1008.81,26.12,16.09,135.0,6.6,0.297879,42621696.0,13778.372043,10,0
2,2,20.0,53.2,-2.916667,0.75,0.84,0.0,0.0,1019.0,15.56,2.9,190.0,2.24,0.297879,42621696.0,13778.372043,10,1
3,2,20.0,28.978333,-96.645833,0.12,0.71,0.0,0.0,1020.64,22.68,16.09,136.0,2.75,0.297879,42621696.0,13778.372043,10,1
4,0,900.0,21.418056,-157.803611,0.63,0.77,0.0,0.0,1015.33,25.74,16.09,80.0,3.6,0.297879,42621696.0,13778.372043,10,1


In [None]:
x1 = features.values
y1 = targets.values

### without PCA

In [488]:
#features = data.drop(['pca_1','pca_2','pca_3','pca_4','duration_seconds'], axis = 1)
#targets = data['duration_seconds']

### splitting into two sets

In [798]:
x1 = features.values
y1 = targets.values

In [800]:
x1.head

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [778]:
y1.shape

(78786,)

# Rdige regression using RidgeCV

In [711]:
ridge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1,10,20,30,40,45,50,60,70,80,90,100,573,51234121,100000000]).fit(x1, y1)

In [712]:
ridge.alpha_

573.0

In [713]:
ridge.score(x1, y1)

-0.18647642124324437

In [714]:
ridge.coef_

array([-2.59670388e+01, -2.91843494e+00, -8.51578469e-01, -2.99147826e+01,
       -1.13247551e+01,  3.79137763e+01, -6.84815480e+01, -6.80441366e-01,
       -2.33667415e+00, -1.14428063e+01, -3.14639166e-02, -6.26275092e+00,
       -1.82711930e+01,  3.54647636e-05, -2.67461321e-04, -3.39907922e-01,
        1.27455564e+01])

# Lasso regression using LassoCV

## higher the alpha value, more restriction on the coefficients; 

## low alpha > more generalization, coefficients are barely restricted and in this case linear and ridge regression resembles

In [715]:
lasso = LassoCV(cv = 5,random_state=0)

In [716]:
lasso_res = lasso.fit(x1,y1)

In [717]:
lasso.score(x1,y1)

0.0

In [718]:
lasso.alpha_

177806659.6621479

In [719]:
lasso.coef_

array([-0., -0., -0., -0.,  0.,  0.,  0.,  0., -0., -0., -0., -0., -0.,
       -0., -0.,  0., -0.])

# Lasso

In [771]:
alphas = np.logspace(-5, 50, 50)
parameter_candidates = [{'alpha':alphas}]
n_folds = 10

In [772]:
lasso = Lasso(random_state=0)

In [773]:
clf = GridSearchCV(estimator=lasso, param_grid=parameter_candidates, cv = n_folds)

In [774]:
clf.fit(x1, y1)

GridSearchCV(cv=10, estimator=Lasso(random_state=0),
             param_grid=[{'alpha': array([1.00000000e-05, 1.32571137e-04, 1.75751062e-03, 2.32995181e-02,
       3.08884360e-01, 4.09491506e+00, 5.42867544e+01, 7.19685673e+02,
       9.54095476e+03, 1.26485522e+05, 1.67683294e+06, 2.22299648e+07,
       2.94705170e+08, 3.90693994e+09, 5.17947468e+10, 6.86648845e+11,
       9.10298178e+12, 1.20679264e+14, 1...
       8.68511374e+21, 1.15139540e+23, 1.52641797e+24, 2.02358965e+25,
       2.68269580e+26, 3.55648031e+27, 4.71486636e+28, 6.25055193e+29,
       8.28642773e+30, 1.09854114e+32, 1.45634848e+33, 1.93069773e+34,
       2.55954792e+35, 3.39322177e+36, 4.49843267e+37, 5.96362332e+38,
       7.90604321e+39, 1.04811313e+41, 1.38949549e+42, 1.84206997e+43,
       2.44205309e+44, 3.23745754e+45, 4.29193426e+46, 5.68986603e+47,
       7.54312006e+48, 1.00000000e+50])}])

In [781]:
clf.best_score_

-0.0002561605068575612

In [782]:
clf.best_params_

{'alpha': 4.094915062380419}

# Ridge

In [791]:
alphas = np.logspace(-5, 50, 500)
parameter_candidates = [{'alpha':alphas}]
n_folds = 10

In [792]:
ridge = Ridge(random_state=0)

In [793]:
clf_ridge = GridSearchCV(estimator=ridge, param_grid=parameter_candidates, cv = n_folds)

In [795]:
clf_ridge.fit(x1, y1)

In [783]:
clf_ridge.best_score_

-0.00500706745863283

In [784]:
clf_ridge.best_params_

{'alpha': 995396230.9984244}

# Regression

In [767]:
lin_reg = sm.OLS(targets,temp)

In [768]:
model = lin_reg.fit()
reg = model.summary()
reg

0,1,2,3
Dep. Variable:,duration_seconds,R-squared (uncentered):,0.141
Model:,OLS,Adj. R-squared (uncentered):,0.141
Method:,Least Squares,F-statistic:,2155.0
Date:,"Tue, 18 Aug 2020",Prob (F-statistic):,0.0
Time:,01:40:11,Log-Likelihood:,-708050.0
No. Observations:,78786,AIC:,1416000.0
Df Residuals:,78780,BIC:,1416000.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
shape,-26.7813,6.014,-4.453,0.000,-38.570,-14.993
lat,-2.3090,0.718,-3.215,0.001,-3.717,-0.902
lng,-0.7819,0.195,-4.003,0.000,-1.165,-0.399
pressure,0.9986,0.049,20.413,0.000,0.903,1.094
visibility,-10.1260,2.871,-3.527,0.000,-15.754,-4.498
ast_estimated_diameter,-20.5689,8.028,-2.562,0.010,-36.304,-4.834

0,1,2,3
Omnibus:,83482.795,Durbin-Watson:,1.918
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5263111.304
Skew:,5.482,Prob(JB):,0.0
Kurtosis:,41.51,Cond. No.,1190.0


In [769]:
corr_mat = temp.corr().abs()
indices = np.where(corr_mat > 0.8) 
indices = [(df_corr.index[x], df_corr.columns[y]) for x, y in zip(*indices) if x != y and x < y]

In [765]:
import copy
temp = copy.deepcopy(features)
targets = data['duration_seconds']

In [766]:
temp.drop(['month','humidity','windBearing','ast_relative_velocity','cloudCover','precip','windSpeed','precipProbability','precipIntensity','temperature','ast_miss_distance'],axis = 1, inplace= True)