In [1]:
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the dataset CSV, then drop the columns where all values are null.
# (The drop was described in this cell's comment but never actually performed.)
df = pd.read_csv("../bee_data/machine_learning_dataset.csv").dropna(
    axis="columns", how="all"
)
df.head()

Unnamed: 0,year,state,deadout,cc_syn,pesticides,lbs_of_honey,count_colonies,extreme_temp_days
0,2019,ALABAMA,2300,430,5.4,44000,6166,16
1,2019,ARIZONA,11500,5670,50.9,201000,27333,24
2,2019,ARKANSAS,10200,1870,113.6,176000,22333,51
3,2019,CALIFORNIA,464000,44500,39.6,3216000,830000,0
4,2019,COLORADO,17820,4600,25.5,500000,21666,136


In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['year', 'state', 'deadout', 'cc_syn', 'pesticides', 'lbs_of_honey',
       'count_colonies', 'extreme_temp_days'],
      dtype='object')

# Select your features (columns)

In [4]:
# This will also be used as your x values.
selected_features = df[['deadout', 'cc_syn', 'pesticides',
       'lbs_of_honey', 'count_colonies', 'extreme_temp_days']]

In [5]:
# Target is lbs_of_honey; every other selected column is a predictor.
y = selected_features["lbs_of_honey"]
X = selected_features.drop(columns=["lbs_of_honey"])
X.head()

Unnamed: 0,deadout,cc_syn,pesticides,count_colonies,extreme_temp_days
0,2300,430,5.4,6166,16
1,11500,5670,50.9,27333,24
2,10200,1870,113.6,22333,51
3,464000,44500,39.6,830000,0
4,17820,4600,25.5,21666,136


# Create a Train Test Split

In [6]:
from sklearn.model_selection import train_test_split

# Seeded split using the library's default train/test proportions.
split_parts = train_test_split(X, y, random_state=5)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,deadout,cc_syn,pesticides,count_colonies,extreme_temp_days
0,2300,430,5.4,6166,16
26,7800,2700,3.5,16333,115
2,10200,1870,113.6,22333,51
34,570,50,0.8,6833,149
38,18700,4520,33.3,41833,158


# Pre-processing

Scale the data using the StandardScaler (fitted on the training split only) and perform some feature selection

In [8]:
# Scale your data to fit data around 0.
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test rows are used during fitting.
# NOTE(review): X_train_scaled / X_test_scaled are not referenced by any later
# cell visible in this notebook — the model cells below use the unscaled frames.
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler()
X_scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)


# Train the Model



In [10]:
# Linear regression estimator with default settings.
model = LinearRegression()

In [11]:
# Fit on the complete dataset (no hold-out); the next cell scores this fit on the same rows.
model.fit(X, y)

LinearRegression()

In [12]:
from sklearn.metrics import mean_squared_error, r2_score

# In-sample evaluation: the model is scored on the same rows it was fitted on
# (no train/test split is involved in this cell).
predicted = model.predict(X)

# Compute both metrics against the true targets in one pass over the scorers.
mse, r2 = (metric(y, predicted) for metric in (mean_squared_error, r2_score))

print(f'Mean Squared Error MSE: {mse}')
print(f'R2 Value: {r2}')

Mean Squared Error MSE: 1143912025866.7712
R2 Value: 0.6310640375590354


In [20]:
from sklearn.model_selection import train_test_split

# Re-split with a 20% hold-out. The seed is fixed (same value as the earlier
# split) so that the partition, and every score computed from it, is identical
# on each Restart & Run All instead of changing from run to run.
# Note: this rebinds X_train / X_test / y_train / y_test, so X_train_scaled and
# X_test_scaled from the pre-processing cell still correspond to the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=5
)

In [21]:
# Per-column minimum of the training features.
X_train.min()

deadout               570.0
cc_syn                 50.0
pesticides              0.2
count_colonies       5600.0
extreme_temp_days       0.0
dtype: float64

In [22]:
# Refit the same estimator on the training split only (replaces the earlier full-data fit).
model.fit(X_train, y_train)

LinearRegression()

In [23]:
# Score the refitted model on the held-out rows (for a regressor this is R^2).
# A negative value means the model does worse than always predicting the mean of y_test.
model.score(X_test, y_test)

-2.146772088914075

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Create the GridSearchCV model

In [None]:
# Train the model with GridSearch
# The search object is created here as well as fitted: without this cell
# `grid2` is never defined and the results cell below raises NameError.
from sklearn.model_selection import GridSearchCV

# Candidate settings to compare; extend this dict to search more options.
param_grid = {"fit_intercept": [True, False]}

# 5-fold cross-validation on the training split, ranked by R^2.
grid2 = GridSearchCV(LinearRegression(), param_grid, cv=5, scoring="r2")
grid2.fit(X_train, y_train)

In [None]:
# Report the best parameter combination and its score, one per line.
# Requires `grid2` to be a fitted search object from the preceding cells.
print(grid2.best_params_, grid2.best_score_, sep="\n")

# Save the Model

In [None]:
# Persist the fitted estimator to disk with joblib.
# Replace "your_name" in the filename with your own name,
# and be sure to turn this in to BCS.
# If joblib fails to import, try running the command to install in terminal/git-bash.
import joblib

filename = 'your_name.sav'
# `model` is the estimator fitted earlier in this notebook; the template's
# `your_model` placeholder was never defined and raised NameError here.
joblib.dump(model, filename)