In [1]:
import numpy as np
from datetime import timedelta, date, datetime
import pandas as pd


In [2]:
df=pd.read_csv('RandomDFTraits.csv.gz',compression='gzip')

In [3]:
current=datetime.now()
def add_days_birth(row):
    date=pd.to_datetime(row['DateTime'])
    diff=current-date
    return diff.days

In [4]:
df['DaysBirth']=df.apply(lambda row: add_days_birth(row), axis=1)

In [5]:
df.to_csv('RandomDFTraits.csv.gz', compression='gzip')

In [6]:
df.head()

Unnamed: 0,DateTime,Sun,Moon,Mercury,Venus,Mars,Jupiter,Saturn,Uranus,Neptune,Independent1,Indpendent2,Loosely1,Loosely2,Strongly1,Strongly2,DaysBirth
0,1820-01-01,10,4,9,10,5,11,12,9,9,0.605985,0.786374,6.195561,39.052275,9,19.512277,74650
1,1820-01-01,10,4,9,10,5,11,12,9,9,0.88994,1.248274,9.70608,94.065416,3,7.446833,74650
2,1820-01-01,10,4,9,11,5,11,12,9,9,0.934184,2.68655,4.039443,9.062088,5,11.724304,74650
3,1820-01-01,10,4,9,11,5,11,12,9,9,0.327426,4.438963,3.914944,-1.209793,7,15.782803,74650
4,1820-01-01,10,4,9,11,5,11,12,9,9,0.086337,3.508798,8.693338,74.19089,6,13.214151,74650


In [7]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error


## Predicting from all planets

In [16]:
X=df[['Sun','Moon','Mercury','Venus','Mars','Jupiter','Saturn','Uranus','Neptune']]
y=df['DaysBirth']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
reg=DecisionTreeRegressor(max_depth=16).fit(X_train,y_train)

In [17]:
mean_squared_error(y_test,reg.predict(X_test))

4.765919972522525

In [18]:
mean_squared_error(y_train,reg.predict(X_train))

4.694620597589843

In [19]:
reg.feature_importances_

array([2.44312699e-06, 3.92671331e-08, 2.61514513e-06, 3.59972864e-06,
       2.41167168e-05, 3.00342425e-02, 1.85723599e-01, 4.60720561e-01,
       3.23488783e-01])

## Hyper Parameter Tuning

In [20]:
##Hyper parameter tuning
param_grid = {
    'max_depth': np.arange(10,20),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [ '1.0', 'sqrt', 'log2']
}

In [21]:
model=DecisionTreeRegressor()
tuned_model = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
tuned_model.fit(X_train,y_train)


KeyboardInterrupt: 

In [None]:
best_params = tuned_model.best_params_
best_score = tuned_model.best_score_

In [None]:
best_model = tuned_model.best_estimator_
mean_squared_error(y_test,best_model.predict(X_test))

In [None]:
print(best_params)

## Testing error for different combinations of planets

### random combos

In [37]:
combos=[['Uranus', 'Neptune'],['Saturn', 'Uranus', 'Neptune'], ['Jupiter','Saturn', 'Uranus', 'Neptune' ], ['Sun', 'Mercury', 'Venus'],
        ['Sun', 'Mercury', 'Venus', 'Moon'], ['Moon', 'Mars', 'Jupiter'], ['Moon', 'Mercury', 'Jupiter','Neptune'], ['Sun','Moon','Mercury','Venus','Mars','Jupiter','Saturn','Uranus','Neptune'],
       ['Sun','Moon','Mercury','Venus','Mars','Jupiter','Saturn','Uranus','Neptune']]
error={}

In [None]:
for combo in combos:
    X=df[combo]
    y=df['DaysBirth']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    reg=DecisionTreeRegressor(max_depth=17).fit(X_train,y_train)
    joined=" ".join(combo)
    error[joined]=mean_squared_error(y_test,reg.predict(X_test))
print (error)

### Removing one at a time

In [54]:
full=['Sun','Moon','Mercury','Venus','Mars','Jupiter','Saturn','Uranus','Neptune']
error2={}
len(full)

9

In [55]:
for i in range(9):
    new_arr = full[:i] + full[i+1:]
    X=df[new_arr]
    y=df['DaysBirth']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    reg=DecisionTreeRegressor(max_depth=17).fit(X_train,y_train)
    joined=" ".join(new_arr)
    missing=full[i]
    error2[missing]=mean_squared_error(y_test,reg.predict(X_test))

In [41]:
print (error2)

{'Uranus Neptune': 312667021.4912211, 'Saturn Uranus Neptune': 6093428.867743207, 'Jupiter Saturn Uranus Neptune': 10724.025876939895, 'Sun Mercury Venus': 456939178.79797864, 'Sun Mercury Venus Moon': 439401563.38403255, 'Moon Mars Jupiter': 427288370.2895509, 'Moon Mercury Jupiter Neptune': 210769734.767343, 'Sun Moon Mercury Venus Mars Jupiter Saturn Uranus Neptune': 1.541871975680351}


### Removing one at a time wo Mercury and Venus

In [57]:
NoMercuryVenus=['Sun','Moon','Mars','Jupiter','Saturn','Uranus','Neptune']
error3={}
for i in range(7):
    new_arr = NoMercuryVenus[:i] + NoMercuryVenus[i+1:]
    X=df[new_arr]
    y=df['DaysBirth']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    reg=DecisionTreeRegressor(max_depth=17).fit(X_train,y_train)
    joined=" ".join(new_arr)
    missing=NoMercuryVenus[i]
    error3[missing]=mean_squared_error(y_test,reg.predict(X_test))

In [58]:
print (error3)

{'Sun': 918.9713156209515, 'Moon': 47.82121096144048, 'Mars': 1802.0074800160098, 'Jupiter': 11532.717007274648, 'Saturn': 10257794.382123677, 'Uranus': 16.183034683277704, 'Neptune': 25.19716354858993}


### Removing one at a time wo Mercury, Venus and Uranus

In [61]:
NoMerVU=['Sun','Moon','Mars','Jupiter','Saturn','Neptune']
error4={}
for i in range(6):
    new_arr = NoMerVU[:i] + NoMerVU[i+1:]
    X=df[new_arr]
    y=df['DaysBirth']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    reg=DecisionTreeRegressor(max_depth=17).fit(X_train,y_train)
    joined=" ".join(new_arr)
    missing=NoMerVU[i]
    error4[missing]=mean_squared_error(y_test,reg.predict(X_test))

In [62]:
print (error4)

{'Sun': 927.6699378606401, 'Moon': 48.55265138321328, 'Mars': 76764.5388425026, 'Jupiter': 11847855.230189698, 'Saturn': 40469893.685300454, 'Neptune': 31655874.29302083}


### Removing one at a time wo Mercury, Venus, Uranus, and Neptune

In [64]:
NoMMerVU=['Sun','Mars','Jupiter','Saturn','Neptune']
error5={}
for i in range(5):
    new_arr = NoMMerVU[:i] + NoMMerVU[i+1:]
    X=df[new_arr]
    y=df['DaysBirth']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    reg=DecisionTreeRegressor(max_depth=17).fit(X_train,y_train)
    joined=" ".join(new_arr)
    missing=NoMMerVU[i]
    error5[missing]=mean_squared_error(y_test,reg.predict(X_test))

In [65]:
print (error5)

{'Sun': 1105.349755000284, 'Mars': 213117.84787207659, 'Jupiter': 16532122.700493826, 'Saturn': 39067955.824230604, 'Neptune': 32594058.996773854}


### BEST MODEL FOR NUMBER OF PLANETS

In [66]:
X=df[['Sun','Mars','Jupiter','Saturn','Neptune']]
y=df['DaysBirth']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
reg=DecisionTreeRegressor(max_depth=17).fit(X_train,y_train)
mean_squared_error(y_test,reg.predict(X_test))  

49.604758664197306