## Assignment 1: Build a Predictor
## Dhruval Bhatt (Collaborated with Keertana Chidambaram)

Instructions from assignment:

Build a predictor. Once you have built a predictor, you should use the test data set to submit a
csv file on canvas. That csv file should only have two columns: diag_id, y_hat.

y_hat = drinks per day = 'U1031900'
diag_id = 'diag.id'

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

In [2]:
# Read the provided training and test CSV files into DataFrames

train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/nlsy training set.csv', '../Data/nlsy test set.csv')
)

print("Size of Training Set", train_df.shape)
print("Size of Test Set", test_df.shape)

# Drop the first (unnamed) column of each frame by position; per the original
# note it is a repeat of the id column.
train_df = train_df.drop(columns=train_df.columns[0])
test_df = test_df.drop(columns=test_df.columns[0])

print("Size of Training Set after Removing Col 1", train_df.shape)
print("Size of Test Set after Removing Col 1", test_df.shape)

Size of Training Set (7187, 4887)
Size of Test Set (1797, 4886)
Size of Training Set after Removing Col 1 (7187, 4886)
Size of Test Set after Removing Col 1 (1797, 4885)


In [3]:
# Use the 'diag.id' identifier column as the row index of the training frame.
# Guarded on the index name so that re-running this cell is a no-op: after the
# first run 'diag.id' is the index, not a column, and an unguarded set_index
# would raise KeyError. A genuinely missing column still raises.
if train_df.index.name != 'diag.id':
    train_df = train_df.set_index('diag.id')

### Data Prep

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Keep only the rows whose target 'U1031900' is >= 0; per the original note
# this removes the rows with missing target data.
has_valid_target = train_df['U1031900'].ge(0)
train_df = train_df.loc[has_valid_target]

In [6]:
# Predictor matrix: every column of the training frame except the target 'U1031900'
train_X = train_df.drop(columns='U1031900')
print("Size of Training Set X variables", train_X.shape)

Size of Training Set X variables (3545, 4884)


In [7]:
# Target vector: the 'U1031900' column (drinks per day, per the assignment text)
train_y = train_df.loc[:, 'U1031900']
print("Size of Training Set y variables", train_y.shape)

Size of Training Set y variables (3545,)


In [8]:
# Hold out 20% of the labelled rows for evaluation; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_X,
    train_y,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Sanity check: (rows, columns) of the training split of the predictors
X_train.shape

(2836, 4884)

In [10]:
# Sanity check: number of target values in the held-out split
y_test.shape

(709,)

In [11]:
# Variance of the target over all labelled rows and over each split; compared
# against the MSE values reported by the models further down.
target_variances = {
    "The variance of y_all:": train_df['U1031900'].var(),
    "The variance of y_train:": y_train.var(),
    "The variance of y_test:": y_test.var(),
}
for message, variance in target_variances.items():
    print(message, variance)

The variance of y_all: 15.280341115678567
The variance of y_train: 14.793511864340045
The variance of y_test: 17.250743069334526


## Feature Selection with Lasso (Applied to RF and XGB model)

In [25]:
from sklearn.linear_model import Lasso, LogisticRegression, LassoCV
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

# Standardise the zero-filled training predictors; the scaler is fitted on the
# training split only.
scaler = StandardScaler().fit(X_train.fillna(0))

# Lasso acts as the scoring model for SelectFromModel, with 0.25 passed as the
# selection threshold.
clf = Lasso(alpha=0.001)
sel_ = SelectFromModel(estimator=clf, threshold=0.25)

# Fit the selector on the scaled training data (last expression, so the fitted
# selector is displayed as the cell output).
sel_.fit(scaler.transform(X_train.fillna(0)), y_train)



SelectFromModel(estimator=Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
        norm_order=1, prefit=False, threshold=0.25)

In [26]:
# Column names retained by the fitted selector
selected_feat = X_train.columns[sel_.get_support()]

# Summary counts: all columns, retained columns, and Lasso coefficients that
# are exactly zero.
n_total_features = X_train.shape[1]
n_selected_features = len(selected_feat)
n_zero_coefficients = (sel_.estimator_.coef_ == 0).sum()

print('total features: {}'.format(n_total_features))
print('selected features: {}'.format(n_selected_features))
print('features with coefficients shrank to zero: {}'.format(n_zero_coefficients))

total features: 4884
selected features: 1112
features with coefficients shrank to zero: 1955


In [27]:
# Reduce both splits to the columns kept by the fitted selector.
# The scaler is not applied here, so the selected columns keep their original
# (zero-filled) values rather than standardised ones.
# Two leading bare expressions were removed: their values were neither
# displayed nor assigned, so they had no effect.
X_train_selected = sel_.transform(X_train.fillna(0))
X_test_selected = sel_.transform(X_test.fillna(0))

# Displayed as the cell output: shapes of the reduced train / held-out matrices
X_train_selected.shape, X_test_selected.shape

((2836, 1112), (709, 1112))

In [16]:
# NOTE(review): the saved output of this cell carries execution count 16, i.e. it was produced
# before the In [25]-In [27] cells above were last run, so it need not reflect the current
# `selected_feat` -- re-run to refresh.
selected_feat

Index(['E5270800'], dtype='object')

In [None]:
# Feature Extraction with RFE (recursive feature elimination)
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# feature extraction: rank columns with a logistic-regression estimator and keep 3
model = LogisticRegression(solver='lbfgs')
# n_features_to_select is given by keyword: newer scikit-learn releases make it
# keyword-only, so the positional form RFE(model, 3) raises TypeError there.
rfe = RFE(model, n_features_to_select=3)
# Zero-fill missing values, as done for the Lasso selector, so the estimator is
# not handed NaN (assumes X_train can contain NaN -- TODO confirm).
# NOTE(review): with the default step this refits once per eliminated column,
# which may be slow on this many columns.
fit = rfe.fit(X_train.fillna(0), y_train)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

In [None]:
# NOTE(review): `feature_idx` is not assigned in any cell visible in this notebook; on a fresh
# kernel this cell would raise NameError -- TODO confirm the intended variable or drop the cell.
feature_idx

### Preliminary Models: Decision Tree, no tuning and Feature Importance

In [29]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import make_scorer
import numpy

  from numpy.core.umath_tests import inner1d


In [None]:
# Decision Tree on all predictors, default hyper-parameters

# random_state is fixed so the fitted tree (and the MSE) is reproducible, in
# line with the seeded split and ensemble models elsewhere in this notebook.
model_1 = tree.DecisionTreeClassifier(random_state=0)
# Missing values are zero-filled, matching the preprocessing used when the
# Lasso selector was fitted (assumes X_train can contain NaN -- TODO confirm).
model_1.fit(X_train.fillna(0), y_train)
pred_1 = model_1.predict(X_test.fillna(0))
# Score the predicted class labels against the held-out target with MSE
mse_1 = mean_squared_error(y_test, pred_1)
print('MSE: %f' % mse_1)

In [30]:
# Decision Tree on the Lasso-selected columns, default hyper-parameters

# random_state is fixed so the fitted tree (and the MSE) is reproducible, in
# line with the seeded split and ensemble models elsewhere in this notebook.
model_3 = tree.DecisionTreeClassifier(random_state=0)
model_3.fit(X_train_selected, y_train)
pred_3 = model_3.predict(X_test_selected)
# Score the predicted class labels against the held-out target with MSE
mse_3 = mean_squared_error(y_test, pred_3)
print('MSE: %f' % mse_3)

MSE: 27.550071


In [None]:
# Importance of every predictor column according to the fitted tree `model_1`
feat_importances = pd.Series(data=model_1.feature_importances_, index=X_train.columns)
# To visualise the top entries:
#feat_importances.nlargest(10).plot(kind='barh')
#plt.show()

# Names of the 2000 columns with the largest importance values
selected_features_1 = feat_importances.nlargest(2000).index.tolist()

In [None]:
# Decision Tree with Selected (Just kept the top 2000 predictors)

# Restrict both splits to the columns listed in `selected_features_1` and
# zero-fill missing values, matching the preprocessing used when the Lasso
# selector was fitted (assumes the frames can contain NaN -- TODO confirm).
X_train_sel = X_train.filter(selected_features_1).fillna(0)
X_test_sel = X_test.filter(selected_features_1).fillna(0)

# random_state is fixed so the fitted tree (and the MSE) is reproducible, in
# line with the seeded split and ensemble models elsewhere in this notebook.
model_2 = tree.DecisionTreeClassifier(random_state=0)
model_2.fit(X_train_sel, y_train)
pred_2 = model_2.predict(X_test_sel)
# Score the predicted class labels against the held-out target with MSE
mse_2 = mean_squared_error(y_test, pred_2)
print('MSE: %f' % mse_2)

### Ensemble Meta Methods Applied - RF, XGB

In [None]:
# Random Forest classifier on all predictors

rf_c = RandomForestClassifier(max_depth=10, min_samples_split=10, min_samples_leaf=10, 
                                 n_estimators=25, max_features=4000, random_state=0)
# Missing values are zero-filled, matching the preprocessing behind the
# selected-feature variant of this model (assumes X_train can contain NaN --
# TODO confirm).
rf_c.fit(X_train.fillna(0), y_train)
pred_rfc = rf_c.predict(X_test.fillna(0))

# Last expression: MSE of the predicted labels on the held-out split
mean_squared_error(y_test, pred_rfc)

In [None]:
# XGBoost regressor on all predictors
xgb_r = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=15,
    random_state=0,
)
xgb_r.fit(X_train, y_train)

# Round the continuous predictions to the nearest whole number before scoring
pred_xgbr = numpy.around(xgb_r.predict(X_test))

# Last expression: MSE of the rounded predictions on the held-out split
mean_squared_error(y_test, pred_xgbr)

## RF and XGB Models on Lasso-Selected Features

In [32]:
# Random Forest classifier on the Lasso-selected columns
fs_rf_c = RandomForestClassifier(
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=10,
    n_estimators=25,
    random_state=0,
)
fs_rf_c.fit(X_train_selected, y_train)
fs_pred_rfc = fs_rf_c.predict(X_test_selected)

# Last expression: MSE of the predicted labels on the held-out split
mean_squared_error(y_test, fs_pred_rfc)

18.64174894217207

In [33]:
# XGBoost regressor on the Lasso-selected columns
fs_xgb_r = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=15,
    random_state=0,
)
fs_xgb_r.fit(X_train_selected, y_train)

# Round the continuous predictions to the nearest whole number before scoring
fs_pred_xgbr = numpy.around(fs_xgb_r.predict(X_test_selected))

# Last expression: MSE of the rounded predictions on the held-out split
mean_squared_error(y_test, fs_pred_xgbr)

  if getattr(data, 'base', None) is not None and \


16.88293370944993

## Ensemble Method 

## Apply Final Model to Given Test Data

In [None]:
## Apply Final Model to Test Data: 


In [None]:
# TODO:
# Add - RF Regressor 
# Neural Network
# Linear Regression 
# SVM
# Ensemble 
#Final Results