# UFO sightings

## Data

The original data are reports of 80k+ UFO sightings spanning more than 50 years, originally obtained from [here](https://github.com/planetsig/ufo-reports). We are already familiar with this data because we used it to practice different data preparation and dimensionality reduction techniques.

### Final data

In this exercise, we should use the dataset from the earlier dimensionality reduction exercise in Week 4 Day 1. If you didn't export the table back then, feel free to use our pre-prepared dataset from [here](https://drive.google.com/file/d/1Q0gj7_DK2Sz-se8hf5-luu2GESDCsndb/view?usp=sharing).

In [427]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso

In [48]:
# Load the prepared UFO table; the first CSV row supplies the column names.
data = pd.read_csv('df_prepared.csv', header=0)

# `ast_is_dangerous` is compared against booleans and against the strings
# 'True', 'False' and '0.0', and re-encoded as 1 / 0 in `ast_dangerous`.
raw_flag = data['ast_is_dangerous']
is_dangerous = (raw_flag == True) | (raw_flag == 'True')
not_dangerous = (raw_flag == False) | (raw_flag == 'False') | (raw_flag == '0.0')

# Entries that match neither mask keep their original value.
data['ast_dangerous'] = np.select([is_dangerous, not_dangerous], [1, 0], default=raw_flag)

# Replace the raw column with its re-encoded counterpart.
data = data.drop(columns='ast_is_dangerous')

In [225]:
data.head()

Unnamed: 0,pca_1,pca_2,pca_3,pca_4,lat,lng,ast_absolute_magnitude_h,hour,day_of_the_week,month,...,shape_other,shape_oval,shape_rectangle,shape_round,shape_sphere,shape_teardrop,shape_triangle,shape_unknown,duration_seconds,ast_dangerous
0,2.586827,-0.691122,0.383378,-0.392817,29.883056,-97.941111,21.5,20.0,0.0,10.0,...,0,0,0,0,0,0,0,0,2700.0,1
1,2.705866,-0.500283,0.805411,-0.762964,29.38421,-98.581082,21.5,21.0,0.0,10.0,...,0,0,0,0,0,0,0,0,7200.0,1
2,0.828969,3.089038,-1.496677,-2.100431,53.2,-2.916667,21.5,17.0,0.0,10.0,...,0,0,0,0,0,0,0,0,20.0,1
3,1.499912,-0.667672,-0.96277,0.447635,28.978333,-96.645833,21.5,21.0,2.0,10.0,...,0,0,0,0,0,0,0,0,20.0,1
4,2.461555,0.334459,-0.601018,-0.502296,21.418056,-157.803611,21.5,20.0,0.0,10.0,...,0,0,0,0,0,0,0,0,900.0,1


### Regression Task
 
Predict the **duration_seconds** of the UFO sighting given the predictors in the dataset.
- Use Lasso and Ridge regression and find the optimal **alpha** using the GridSearch method.

### PCA only

In [341]:
#features = data[['pca_1','pca_2','pca_3','pca_4']].values
#targets = data['duration_seconds'].values

### All independent variables

In [157]:
#features = data.drop('duration_seconds', axis = 1).values
#targets = data['duration_seconds'].values

### without PCA

In [356]:
# Predictors for the "without PCA" variant: every column except the PCA
# components and the regression target.
# 'Unnamed: 0' (shown by data.head() with values 0, 1, 2, ...) looks like the
# row index that was saved with the CSV; it describes the file layout, not the
# sighting, so it is excluded from the predictors as well.  errors='ignore'
# keeps this cell working if the table was loaded without that column.
features = (
    data
    .drop(columns=['pca_1', 'pca_2', 'pca_3', 'pca_4', 'duration_seconds'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .values
)
# Regression target: the `duration_seconds` column.
targets = data['duration_seconds'].values

### splitting into two sets

In [371]:
# Sequential (unshuffled) split: the first 64000 rows form set 1,
# every remaining row forms set 2.
split_at = 64000
x1, x2 = features[:split_at], features[split_at:]
y1, y2 = targets[:split_at], targets[split_at:]

In [372]:
x1.shape

(64000, 35)

In [373]:
y1.shape

(64000,)

# Ridge regression using RidgeCV

In [376]:
# Candidate regularisation strengths handed to RidgeCV, which picks one of them.
ridge_alphas = [
    1e-3, 1e-2, 1e-1, 1, 10, 20, 30, 40, 45, 50, 60, 70, 80, 90, 100,
    573, 51234121, 100000000,
]
ridge = RidgeCV(alphas=ridge_alphas).fit(x1, y1)

In [377]:
ridge.alpha_

1000.0

In [362]:
# Score of the fitted ridge model on x1/y1, i.e. the same rows it was fitted on.
# NOTE(review): the x2/y2 split is not scored in the cells visible here — confirm
# whether a held-out evaluation was intended.
ridge.score(x1, y1)

0.00010375355129210728

In [369]:
ridge.coef_

array([ 5.14690989e+01,  9.69739475e+01,  9.12612738e+00,  1.37420704e+01,
        1.87514009e+00,  4.30246407e+00,  1.32495942e+00,  2.04759937e-02,
        1.94507837e+00, -3.80993285e+00, -1.78085368e-01, -1.07233718e-01,
       -1.94202684e-01, -3.04556430e-01,  4.65971536e-01,  3.89474865e-04,
       -2.76296678e-02, -8.42007214e-02, -6.57999339e-04, -1.23569827e-01,
       -5.42974274e-01, -7.70551404e-02, -4.91021890e-01, -9.10097083e-02,
       -2.52451494e-01, -1.21137231e+00,  1.53435256e+00, -2.24968842e-01,
       -1.42945072e-01, -2.54355728e-04,  1.59465686e+00, -8.45336294e-02,
       -7.40298554e-01, -1.22480763e-01,  1.41107195e+00])

# Lasso regression using LassoCV

## The higher the alpha value, the stronger the restriction on the coefficients.

## Low alpha > less regularization: the coefficients are barely restricted, and in this case the regularized (lasso or ridge) model resembles plain linear regression

In [428]:
# Lasso estimator created with its default arguments.
# NOTE(review): `las` is not referenced by any later cell visible here — confirm
# whether it is still needed.
las = Lasso()

In [363]:
# Lasso whose alpha is chosen by 5-fold cross-validation; the seed is fixed.
lasso = LassoCV(random_state=0, cv=5)
lasso = lasso.fit(x1, y1)

In [364]:
# `lasso` was already fitted on x1/y1 when it was created, and `fit` returns the
# estimator itself.  Calling `lasso.fit(x1, y1)` a second time only repeated the
# whole 5-fold search (seeded, so with the same outcome) to hand back the same
# object.  Bind the alias without refitting.
lasso_res = lasso

In [365]:
# Score of the fitted LassoCV model on x1/y1, i.e. the same rows it was fitted on.
# NOTE(review): the x2/y2 split is not scored in the cells visible here — confirm
# whether a held-out evaluation was intended.
lasso.score(x1,y1)

0.00018958273272706716

In [366]:
lasso.alpha_

10539.14915739557

In [367]:
lasso.coef_

array([446.54661127, 181.49042517,   0.        ,  23.89584787,
         0.        ,   0.        ,   0.        ,  -0.        ,
         0.        ,  -0.        ,  -0.        ,  -0.        ,
        -0.        ,  -0.        ,   0.        ,   0.        ,
        -0.        ,  -0.        ,  -0.        ,  -0.        ,
        -0.        ,  -0.        ,  -0.        ,  -0.        ,
        -0.        ,  -0.        ,   0.        ,  -0.        ,
        -0.        ,  -0.        ,   0.        ,  -0.        ,
        -0.        ,  -0.        ,   0.        ])

In [368]:
data.head()

Unnamed: 0,pca_1,pca_2,pca_3,pca_4,lat,lng,ast_absolute_magnitude_h,hour,day_of_the_week,month,...,shape_other,shape_oval,shape_rectangle,shape_round,shape_sphere,shape_teardrop,shape_triangle,shape_unknown,duration_seconds,ast_dangerous
0,2.586827,-0.691122,0.383378,-0.392817,29.883056,-97.941111,21.5,20.0,0.0,10.0,...,0,0,0,0,0,0,0,0,2700.0,1
1,2.705866,-0.500283,0.805411,-0.762964,29.38421,-98.581082,21.5,21.0,0.0,10.0,...,0,0,0,0,0,0,0,0,7200.0,1
2,0.828969,3.089038,-1.496677,-2.100431,53.2,-2.916667,21.5,17.0,0.0,10.0,...,0,0,0,0,0,0,0,0,20.0,1
3,1.499912,-0.667672,-0.96277,0.447635,28.978333,-96.645833,21.5,21.0,2.0,10.0,...,0,0,0,0,0,0,0,0,20.0,1
4,2.461555,0.334459,-0.601018,-0.502296,21.418056,-157.803611,21.5,20.0,0.0,10.0,...,0,0,0,0,0,0,0,0,900.0,1


# Lasso

In [418]:
# Search space for the Lasso penalty: 500 log-spaced values from 1e-5 to 1e50.
alphas = np.logspace(start=-5, stop=50, num=500)
# Grid definition later passed to GridSearchCV as `param_grid`.
parameter_candidates = [dict(alpha=alphas)]
# Number of cross-validation folds for the grid search.
n_folds = 10

In [419]:
alphas.shape

(500,)

In [420]:
# Base Lasso estimator handed to the grid search; the seed is fixed to 0.
# Note: this rebinds `lasso`, which until now held the fitted LassoCV model.
lasso = Lasso(random_state=0)

In [421]:
# Cross-validated search over the alpha candidates, using `n_folds` folds.
clf = GridSearchCV(lasso, parameter_candidates, cv=n_folds)

In [422]:
# Run the grid search on x1/y1: every alpha candidate is evaluated on every fold
# (500 candidates x 10 folds with the settings defined above).
clf.fit(x1, y1)

GridSearchCV(cv=10, estimator=Lasso(random_state=0),
             param_grid=[{'alpha': array([1.00000000e-05, 1.28890361e-05, 1.66127252e-05, 2.14122015e-05,
       2.75982639e-05, 3.55715021e-05, 4.58482375e-05, 5.90939590e-05,
       7.61664172e-05, 9.81711702e-05, 1.26533176e-04, 1.63089068e-04,
       2.10206088e-04, 2.70935387e-04, 3.49209598e-04, 4.50097513e-04,
       5.80132310e-04, 7.47734631e-04, 9...
       2.91695588e+47, 3.75967497e+47, 4.84585864e+47, 6.24584471e+47,
       8.05029181e+47, 1.03760502e+48, 1.33737286e+48, 1.72374471e+48,
       2.22174078e+48, 2.86360972e+48, 3.69091691e+48, 4.75723614e+48,
       6.13161884e+48, 7.90306568e+48, 1.01862899e+49, 1.31291459e+49,
       1.69222035e+49, 2.18110892e+49, 2.81123917e+49, 3.62341632e+49,
       4.67023439e+49, 6.01948197e+49, 7.75853206e+49, 1.00000000e+50])}])

In [423]:
clf.best_score_

-0.0062267990595622266

In [426]:
clf.best_estimator_

Lasso(alpha=229467.6367231941, random_state=0)

In [425]:
clf.best_params_

{'alpha': 229467.6367231941}

In [332]:
clf.cv_results_['param_alpha']

masked_array(data=[1e-05, 2.592943797404667e-05, 6.723357536499335e-05,
                   0.00017433288221999874, 0.00045203536563602405,
                   0.0011721022975334804, 0.0030391953823131978,
                   0.007880462815669913, 0.020433597178569417,
                   0.05298316906283707, 0.1373823795883264,
                   0.35622478902624444, 0.9236708571873865,
                   2.395026619987486, 6.2101694189156165,
                   16.102620275609393, 41.753189365604, 108.2636733874054,
                   280.72162039411756, 727.8953843983146,
                   1887.3918221350996, 4893.900918477499,
                   12689.610031679234, 32903.44562312671,
                   85316.78524172814, 221221.629107045, 573615.2510448681,
                   1487352.1072935117, 3856620.4211634723, 10000000.0],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
         