In [1]:
from pydataset import data
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import wrangle

In [2]:
# Enumerate every dataset bundled with pydataset: calling data() with no
# arguments returns the catalog, whose first column holds the dataset names.
catalog = data()
dataset_names = catalog.iloc[:, 0].tolist()
for name in dataset_names:
    print(name)

AirPassengers
BJsales
BOD
Formaldehyde
HairEyeColor
InsectSprays
JohnsonJohnson
LakeHuron
LifeCycleSavings
Nile
OrchardSprays
PlantGrowth
Puromycin
Titanic
ToothGrowth
UCBAdmissions
UKDriverDeaths
UKgas
USAccDeaths
USArrests
USJudgeRatings
USPersonalExpenditure
VADeaths
WWWusage
WorldPhones
airmiles
airquality
anscombe
attenu
attitude
austres
cars
chickwts
co2
crimtab
discoveries
esoph
euro
faithful
freeny
infert
iris
islands
lh
longley
lynx
morley
mtcars
nhtemp
nottem
npk
occupationalStatus
precip
presidents
pressure
quakes
randu
rivers
rock
sleep
stackloss
sunspot.month
sunspot.year
sunspots
swiss
treering
trees
uspop
volcano
warpbreaks
women
acme
aids
aircondit
aircondit7
amis
aml
bigcity
brambles
breslow
calcium
cane
capability
catsM
cav
cd4
channing
city
claridge
cloth
co.transfer
coal
darwin
dogs
downs.bc
ducks
fir
frets
grav
gravity
hirose
islay
manaus
melanoma
motor
neuro
nitrofen
nodal
nuclear
paulsen
poisons
polar
remission
salinity
survival
tau
tuna
urine
wool
aids
alloauto


### 1. Select a dataset with a continuous target variable.

In [3]:
# Load the nuclear power station construction data from pydataset; its `cost`
# column is the continuous target used when splitting X and y further down.
df = data('nuclear')
df.head()  # preview the first rows

Unnamed: 0,cost,date,t1,t2,cap,pr,ne,ct,bw,cum.n,pt
1,460.05,68.58,14,46,687,0,1,0,0,14,0
2,452.99,67.33,10,73,1065,0,0,1,0,1,0
3,443.22,67.33,10,85,1065,1,0,1,0,1,0
4,652.32,68.0,11,67,1065,0,1,1,0,12,0
5,642.23,68.0,11,78,1065,1,1,1,0,12,0


In [4]:
# Distinct values per column, to tell the two-valued flag columns from the continuous measures
df.nunique()

cost     32
date     19
t1       14
t2       25
cap      22
pr        2
ne        2
ct        2
bw        2
cum.n    17
pt        2
dtype: int64

In [5]:
# Print the documentation pydataset ships for this dataset (column definitions and units)
data('nuclear', show_doc=True)

nuclear

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

##  Nuclear Power Station Construction Data

### Description

The `nuclear` data frame has 32 rows and 11 columns.

The data relate to the construction of 32 light water reactor (LWR) plants
constructed in the U.S.A in the late 1960's and early 1970's. The data was
collected with the aim of predicting the cost of construction of further LWR
plants. 6 of the power plants had partial turnkey guarantees and it is
possible that, for these plants, some manufacturers' subsidies may be hidden
in the quoted capital costs.

### Usage

    nuclear

### Format

This data frame contains the following columns:

`cost`

The capital cost of construction in millions of dollars adjusted to 1976 base.

`date`

The date on which the construction permit was issued. The data are measured in
years since January 1 1990 to the nearest month.

`t1`

The time between application for and issue of the construction pe

### 2. Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.

In [6]:
# Per-column count of missing values; every column should report 0 before modelling
df.isna().sum()

cost     0
date     0
t1       0
t2       0
cap      0
pr       0
ne       0
ct       0
bw       0
cum.n    0
pt       0
dtype: int64

In [7]:
# Check the dtypes: all columns need to be numeric before they can be fed to a regressor
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, 1 to 32
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   cost    32 non-null     float64
 1   date    32 non-null     float64
 2   t1      32 non-null     int64  
 3   t2      32 non-null     int64  
 4   cap     32 non-null     int64  
 5   pr      32 non-null     int64  
 6   ne      32 non-null     int64  
 7   ct      32 non-null     int64  
 8   bw      32 non-null     int64  
 9   cum.n   32 non-null     int64  
 10  pt      32 non-null     int64  
dtypes: float64(2), int64(9)
memory usage: 3.0 KB


### 3. Work through all of the steps outlined in the lesson, from setting the baseline to selecting a model and evaluating the final model on your test data.

In [8]:
# Split the rows into train / validation / test samples with the project's wrangle helper.
# NOTE(review): the split ratio and any random seed live inside wrangle.train_val_test;
# confirm a fixed seed is set there so the samples are reproducible on a fresh kernel.
train, val, test = wrangle.train_val_test(df)

In [9]:
# Sanity-check the size of the training sample as (rows, columns)
train.shape

(22, 11)

In [12]:
# Separate the target column ('cost') from the rest for the train and validation samples.
# NOTE(review): what exactly ends up in X_* is decided inside wrangle.X_y_split; confirm.
X_train, y_train, X_val, y_val = wrangle.X_y_split(train, val, 'cost')

In [13]:
# Inspect the training target to confirm it is the `cost` series
y_train

22    665.99
29    284.88
31    217.38
26    697.14
25    473.64
18    289.66
30    280.36
16    423.32
13    412.18
32    270.71
15    394.36
4     652.32
17    712.27
23    621.45
2     452.99
28    288.48
24    608.80
6     345.39
9     457.12
8     317.21
27    207.51
14    495.58
Name: cost, dtype: float64