## Load necessary modules

In [2]:
!pip install sklearn_pandas

Collecting sklearn_pandas
  Downloading https://files.pythonhosted.org/packages/1f/48/4e1461d828baf41d609efaa720d20090ac6ec346b5daad3c88e243e2207e/sklearn_pandas-1.8.0-py2.py3-none-any.whl
Installing collected packages: sklearn-pandas
Successfully installed sklearn-pandas-1.8.0


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import Pipeline

## Load necessary data

In [5]:
# Read the drug-use-by-age table from the project's raw-data folder.
# NOTE(review): the preview shows "-" placeholders in some *-frequency columns,
# so those columns are presumably loaded as strings — confirm before using them
# as numeric features.
drugs = pd.read_csv(filepath_or_buffer="raw_data/drug-use-by-age.csv")

# Preview the first five rows.
drugs.head(n=5)

Unnamed: 0,age,n,alcohol-use,alcohol-frequency,marijuana-use,marijuana-frequency,cocaine-use,cocaine-frequency,crack-use,crack-frequency,...,oxycontin-use,oxycontin-frequency,tranquilizer-use,tranquilizer-frequency,stimulant-use,stimulant-frequency,meth-use,meth-frequency,sedative-use,sedative-frequency
0,12,2798,3.9,3.0,1.1,4.0,0.1,5.0,0.0,-,...,0.1,24.5,0.2,52.0,0.2,2.0,0.0,-,0.2,13.0
1,13,2757,8.5,6.0,3.4,15.0,0.1,1.0,0.0,3.0,...,0.1,41.0,0.3,25.5,0.3,4.0,0.1,5.0,0.1,19.0
2,14,2792,18.1,5.0,8.7,24.0,0.1,5.5,0.0,-,...,0.4,4.5,0.9,5.0,0.8,12.0,0.1,24.0,0.2,16.5
3,15,2956,29.2,6.0,14.5,25.0,0.5,4.0,0.1,9.5,...,0.8,3.0,2.0,4.5,1.5,6.0,0.3,10.5,0.4,30.0
4,16,3058,40.1,10.0,22.5,30.0,1.0,7.0,0.0,1.0,...,1.1,4.0,2.4,11.0,1.8,9.5,0.3,36.0,0.2,3.0


## Clean `age` such that it is an integer

In [6]:
# Keep only the first two characters of each age label and store them as integers.
# Anything after the second character is discarded.
# NOTE(review): if the raw labels include ranges or a trailing "+", this keeps
# only the leading two digits — confirm that is the intended encoding.
#
# Casting to str first makes this cell safe to re-run: once `age` has already
# been converted to int, calling `.str` on it directly raises an AttributeError.
drugs["age"] = drugs["age"].astype(str).str[0:2].astype(int)

## Train Test Split Data

In [7]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    drugs.drop(columns=["age"]),  # predictors: every column except the target
    drugs["age"],                 # target: the cleaned integer age
    test_size=0.3,
    random_state=624,
)

## Note about EDA

Traditionally, it is best practice to visually explore the contents of 
`X_train` through a process known as Exploratory Data Analysis (EDA). 
EDA shows you the distributions, anomalies, and trends in your data. This
ultimately leads to a checklist of preprocessing steps to complete before you
begin modeling:
    
* outliers that need to be removed or imputed
* features that need to be created (i.e., feature engineering)
* numerical features that need to be scaled
* categorical features that need to be one-hot encoded (OHE)

## Let's create a `mapper` object that stores our preprocessing steps

In [12]:
# Preprocessing recipe: only `n` and `alcohol-use` are listed, and both are
# passed together through a single StandardScaler.
mapper = DataFrameMapper(
    features=[
        (["n", "alcohol-use"], StandardScaler()),
    ],
)

## Let's create a `Pipeline` object that will apply our preprocessing steps before the data goes into our model

In [13]:
# Chain the preprocessing mapper and the estimator so that a single fit/predict
# call runs both steps in order.
pipe = Pipeline([
    ("preprocessing", mapper),
    ("linear_reg", LinearRegression()),
])

## Let's fit our pipeline on `X_train` and `y_train`

In [14]:
# Fit the preprocessing step and the regression on the training split.
# The call is the cell's last expression, so the fitted pipeline is displayed.
pipe.fit(X_train, y=y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 DataFrameMapper(default=False, df_out=False,
                                 features=[(['n', 'alcohol-use'],
                                            StandardScaler(copy=True,
                                                           with_mean=True,
                                                           with_std=True))],
                                 input_df=False, sparse=False)),
                ('linear_reg',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [16]:
# Predict ages for the held-out rows using the fitted pipeline.
y_pred = pipe.predict(X=X_test)

# Bare name as the last expression so the predictions are displayed.
y_pred

array([18.30356752, 22.94562821, 19.68924863, 22.68341963, 21.11594882,
       15.40749093])

In [17]:
# Display the true ages of the held-out rows for comparison with `y_pred`.
y_test

16    65
12    26
6     18
14    35
15    50
3     15
Name: age, dtype: int64