# 1. Imports

In [1]:
#General
import pandas as pd
import numpy as np

#Visualization
import matplotlib.pyplot as plt

#Preprocessing
from sklearn.model_selection        import train_test_split
from sklearn.preprocessing          import StandardScaler
from sklearn                        import preprocessing

#Metrics
from sklearn.metrics                import r2_score, mean_squared_error
from sklearn.model_selection        import cross_val_score, cross_val_predict
from sklearn.metrics                import confusion_matrix
from sklearn.metrics                import explained_variance_score

#Models
from sklearn.linear_model           import LinearRegression
from sklearn                        import linear_model
from sklearn.linear_model           import Ridge
from sklearn.tree                   import DecisionTreeRegressor
from sklearn.svm                    import SVR
from sklearn.ensemble               import RandomForestRegressor

# 2. Load Data

In [2]:
#Load the file generated from EDA.
file = 'Asteroid_EDA_Clean.csv'
df = pd.read_csv(file, sep=',', index_col=0)

In [3]:
#Verify that the data loaded correctly.
df.head().T

Unnamed: 0,0,1,2,3,4
semi-major_axis(au),2.76917,2.77247,2.66915,2.36142,2.57425
eccentricity,0.076009,0.230337,0.256942,0.0887215,0.191095
x-y_inclination(deg),10.5941,34.8362,12.9889,7.14177,5.36699
longitude_asc_node,80.3055,173.08,169.853,103.811,141.577
argument_perihelion,73.5977,310.049,248.139,150.729,358.688
perihelion_dist(au),2.55868,2.13386,1.98333,2.15191,2.08232
aphelion_dist(au),2.97965,3.41107,3.35497,2.57093,3.06617
data_arc(d),8822,72318,72684,24288,63507
n_obs_used,1002,8490,7104,9325,2916
diameter,939.4,545,246.596,525.4,106.699


# 3. Feature Engineering

In [4]:
#Check for categorical variables.
df.select_dtypes(include=['object']).head().T

Unnamed: 0,0,1,2,3,4
class,MBA,MBA,MBA,MBA,MBA


In [5]:
#Create dummy variables for class.
df = pd.get_dummies(df, columns=['class'])

In [6]:
df.columns

Index(['semi-major_axis(au)', 'eccentricity', 'x-y_inclination(deg)',
       'longitude_asc_node', 'argument_perihelion', 'perihelion_dist(au)',
       'aphelion_dist(au)', 'data_arc(d)', 'n_obs_used', 'diameter',
       'mean_motion(deg/d)', 'orbital_period(d)', 'mean_anomaly(deg)',
       'class_AMO', 'class_APO', 'class_AST', 'class_ATE', 'class_CEN',
       'class_IMB', 'class_MBA', 'class_MCA', 'class_OMB', 'class_TJN',
       'class_TNO'],
      dtype='object')

# 4. Standarize Data and Split into Training and Test Sets

In [7]:
#Split data into features and target.
y = df['diameter']
X = df.drop(columns = 'diameter')

In [8]:
#Create training and test data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [9]:
#Standarize the features

scaler = preprocessing.StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)

X_test_scaled = scaler.transform(X_test)

# 5. Initial Models

### 5.1 Linear Regression

In [12]:
#Initial Fit
lm = LinearRegression()
lm.fit(X_train, y_train)
lm_score = lm.score(X_train, y_train)
lm_pred = lm.predict(X_test)

print(lm_score)

0.45267188148068405


In [15]:
lm_mse = mean_squared_error(y_test, lm_pred)
lm_rmse = np.sqrt(lm_mse)
lm_score_test = lm.score(X_test, y_test)

print('Test Score:, ', lm_score_test)
print('MSE: ',round(lm_mse,2))
print('RMSE: ',round(lm_rmse,2))
print('Explained Variance Score: ',round(explained_variance_score(y_test, lm_pred),2))

Test Score:,  0.3008869955408313
MSE:  68.58
RMSE:  8.28
Explained Variance Score:  0.3


In [44]:
#Cross Validation
cvx, cvxt, cvy, cvyt = train_test_split(X, y, test_size = 0.2)
lm = LinearRegression()
scores = cross_val_score(lm, X_train, y_train, cv = 5)
print('Scores: ',scores)
print('Mean Scores:',np.mean(scores))
print('Standard Deviation of Scores: ',np.std(scores))

Scores:  [0.3860002  0.4965816  0.40301491 0.42120102 0.42772953]
Mean Scores: 0.4269054516407055
Standard Deviation of Scores:  0.037767665763228084


### Lasso Regression

In [16]:
lasso = linear_model.Lasso()
lasso.fit(X_train, y_train)
lasso_score = lasso.score(X_train, y_train)
lasso_pred = lasso.predict(X_test)

print(lasso_score)

0.3581969631181513


In [39]:
lasso_mse = mean_squared_error(y_test, lasso_pred)
lasso_rmse = np.sqrt(lasso_mse)

print('MSE: ',round(lasso_mse,2))
print('RMSE: ',round(lasso_rmse,2))
print('Explained Variance Score: ',round(explained_variance_score(y_test, lasso_pred),2))

MSE:  71.62
RMSE:  8.46
Explained Variance Score:  0.27


### Ridge Regression

In [17]:
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge_score = lasso.score(X_train, y_train)
ridge_pred = ridge.predict(X_test)

print(ridge_score)

0.3581969631181513


In [31]:
ridge_mse = mean_squared_error(y_test, ridge_pred)
ridge_rmse = np.sqrt(ridge_mse)

print(ridge_mse)
print(ridge_rmse)

68.58347970183819
8.281514336269556


### Decision Tree Regression

In [18]:
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
dtr_score = dtr.score(X_train, y_train)
dtr_pred = dtr.predict(X_test)

print(dtr_score)

1.0


In [20]:
dtr_score_test = dtr.score(X_test, y_test)
print(dtr_score_test)

0.36808975421881274


In [22]:
dtr_mse = mean_squared_error(y_test, dtr_pred)
dtr_rmse = np.sqrt(dtr_mse)

print(dtr_mse)
print(dtr_rmse)

62.658323682759374
7.915701085991017


### Random Forest Regressor

In [19]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
rf_score = rf.score(X_train, y_train)
rf_pred = rf.predict(X_test)

print(rf_score)



0.9434294365952536


In [22]:
rf_score_test = rf.score(X_test, y_test)
print(rf_score_test)

0.5316273531330014


In [14]:
rf_mse = mean_squared_error(y_test, rf_pred)
rf_rmse = np.sqrt(rf_mse)

print(rf_mse)
print(rf_rmse)

46.568036643949796
6.824077713797653


### Support Vector Machines

In [15]:
svr = svm.SVR()
svr.fit(X_train, y_train)
svr_pred = svr.predict(X_test)



In [21]:
svr_mse = mean_squared_error(y_test, svr_pred)
svr_rmse = np.sqrt(svr_mse)

print(svr_mse)
print(svr_rmse)

100.22233898492509
10.011110776778224
