### Starting with Linear Regression in scikit-learn, then comparing it with Random Forest Regressor and Neural Network models.

(Features) **X-variables**: region, date, units sold, production data, climate data.

(Target) **y-variable:** avocado price 

### Importing data from the database

In [1]:
#Code here

## Linear Regression with scikit-learn

In [3]:
# Import dependencies.
import pandas as pd
import numpy
from collections import Counter
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the avocado price/production dataset from ./Data/prices_prod.csv.
# pd.read_csv already returns a brand-new DataFrame, so no extra .copy() is needed.
avocado_df = pd.read_csv(Path('./Data/prices_prod.csv'))
# Preview the first rows to sanity-check the load.
avocado_df.head()

Unnamed: 0,year_month,date,geography,type,avg_price,price_total_volume,four_zero_four_six_units,four_two_two_five_units,four_seven_seven_zero_units,total_bags,...,status,prod_total_volume,ratio price_total_volume vs prod_total_volume,california,ratio price_total_volume vs california,prod,chile,mexico,peru,columbia
0,1/1/2018,1/14/2018,Albany,conventional,1.42,95246.38,2897.41,76570.67,44.0,15734.3,...,actual,52758905,0.001805,830821,0.114641277,"(2018-01-01,1/14/2018,actual,52758905,830821,4...",449003,51479081,0,0
1,1/1/2018,1/14/2018,Albany,organic,1.47,4140.95,7.3,301.87,0.0,3831.78,...,actual,52758905,7.8e-05,830821,0.004984166,"(2018-01-01,1/14/2018,actual,52758905,830821,4...",449003,51479081,0,0
2,1/1/2018,1/21/2018,Albany,conventional,1.69,135196.35,3133.37,116520.88,88.78,15453.32,...,actual,54698266,0.002472,2913663,0.046400819,"(2018-01-01,1/21/2018,actual,54698266,2913663,...",445127,51339476,0,0
3,1/1/2018,1/21/2018,Albany,organic,1.54,3346.54,14.67,253.01,0.0,3078.86,...,actual,54698266,6.1e-05,2913663,0.001148568,"(2018-01-01,1/21/2018,actual,54698266,2913663,...",445127,51339476,0,0
4,1/1/2018,1/28/2018,Albany,conventional,1.57,93625.03,3101.17,74627.23,55.59,15841.04,...,actual,45299201,0.002067,2698356,0.034697064,"(2018-01-01,1/28/2018,actual,45299201,2698356,...",95491,42505354,0,0


In [4]:
# Drop the non-beneficial columns.
# Reassign the result instead of mutating with inplace=True: the step then reads
# as a plain transformation and can be chained with later cleaning steps.
avocado_df = avocado_df.drop(columns=["year_month", "prod", "status"])
avocado_df.head()

Unnamed: 0,date,geography,type,avg_price,price_total_volume,four_zero_four_six_units,four_two_two_five_units,four_seven_seven_zero_units,total_bags,s_bags,l_bags,xl_bags,prod_total_volume,ratio price_total_volume vs prod_total_volume,california,ratio price_total_volume vs california,chile,mexico,peru,columbia
0,1/14/2018,Albany,conventional,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,5721.5,0.0,52758905,0.001805,830821,0.114641277,449003,51479081,0,0
1,1/14/2018,Albany,organic,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,0.0,0.0,52758905,7.8e-05,830821,0.004984166,449003,51479081,0,0
2,1/21/2018,Albany,conventional,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,5429.53,0.0,54698266,0.002472,2913663,0.046400819,445127,51339476,0,0
3,1/21/2018,Albany,organic,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,0.0,0.0,54698266,6.1e-05,2913663,0.001148568,445127,51339476,0,0
4,1/28/2018,Albany,conventional,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,4159.58,66.67,45299201,0.002067,2698356,0.034697064,95491,42505354,0,0


In [5]:
# Check the datatype pandas inferred for every remaining column.
# NOTE(review): the recorded output lists 'ratio price_total_volume vs california'
# as object rather than float64, which suggests it holds non-numeric entries --
# confirm and clean it before using that column as a numeric feature.
avocado_df.dtypes

date                                              object
geography                                         object
type                                              object
avg_price                                        float64
price_total_volume                               float64
four_zero_four_six_units                         float64
four_two_two_five_units                          float64
four_seven_seven_zero_units                      float64
total_bags                                       float64
s_bags                                           float64
l_bags                                           float64
xl_bags                                          float64
prod_total_volume                                  int64
ratio price_total_volume vs prod_total_volume    float64
california                                         int64
ratio price_total_volume vs california            object
chile                                              int64
mexico                         

In [6]:
# Convert the date strings to datetime64 and confirm the resulting dtype.
import datetime  # not used in this cell; kept in case later cells rely on it
# pd.to_datetime on the whole Series is vectorized, whereas
# Series.apply(pd.to_datetime) parses one value at a time in Python.
# Re-running this cell is harmless: an already-converted column passes through unchanged.
avocado_df["date"] = pd.to_datetime(avocado_df["date"])
print(avocado_df["date"].dtype)

datetime64[ns]


In [7]:
# Extract the calendar month number from the parsed date column
# and store it as a new 'month' feature.
avocado_df['month'] = avocado_df['date'].dt.month
avocado_df.head()

Unnamed: 0,date,geography,type,avg_price,price_total_volume,four_zero_four_six_units,four_two_two_five_units,four_seven_seven_zero_units,total_bags,s_bags,...,xl_bags,prod_total_volume,ratio price_total_volume vs prod_total_volume,california,ratio price_total_volume vs california,chile,mexico,peru,columbia,month
0,2018-01-14,Albany,conventional,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,...,0.0,52758905,0.001805,830821,0.114641277,449003,51479081,0,0,1
1,2018-01-14,Albany,organic,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,...,0.0,52758905,7.8e-05,830821,0.004984166,449003,51479081,0,0,1
2,2018-01-21,Albany,conventional,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,...,0.0,54698266,0.002472,2913663,0.046400819,445127,51339476,0,0,1
3,2018-01-21,Albany,organic,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,...,0.0,54698266,6.1e-05,2913663,0.001148568,445127,51339476,0,0,1
4,2018-01-28,Albany,conventional,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,...,66.67,45299201,0.002067,2698356,0.034697064,95491,42505354,0,0,1


In [8]:
# Confirm that the derived month column is stored as an integer type.
print(avocado_df['month'].dtype)

int64


In [9]:
#Check the months
#avocado_df.month.value_counts()

In [10]:
# Plot the data with encoded date grouped by month. - check also for scatter plot
#avocado_df.groupby('month')['ratio CA sold vs prod'].mean().plot()

In [11]:
# One-hot encode the text features (geography and type) with get_dummies().
# NOTE(review): the feature list built later selects both 'type_conventional'
# and 'type_conventional ' (trailing space), which suggests the raw `type`
# values are not normalized -- confirm, and strip whitespace before encoding
# (updating the feature list at the same time).
avocado_cat_encoded = pd.get_dummies(
    data=avocado_df,
    columns=["geography", "type"],
)
avocado_cat_encoded.head()

Unnamed: 0,date,avg_price,price_total_volume,four_zero_four_six_units,four_two_two_five_units,four_seven_seven_zero_units,total_bags,s_bags,l_bags,xl_bags,...,geography_Spokane,geography_St. Louis,geography_Syracuse,geography_Tampa,geography_Total U.S.,geography_West,geography_West Tex/New Mexico,type_conventional,type_conventional.1,type_organic
0,2018-01-14,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,5721.5,0.0,...,0,0,0,0,0,0,0,1,0,0
1,2018-01-14,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,2018-01-21,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,5429.53,0.0,...,0,0,0,0,0,0,0,1,0,0
3,2018-01-21,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,2018-01-28,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,4159.58,66.67,...,0,0,0,0,0,0,0,1,0,0


In [12]:
# Show every column name of the encoded frame as a plain Python list.
list(avocado_cat_encoded.columns)

['date',
 'avg_price',
 'price_total_volume',
 'four_zero_four_six_units',
 'four_two_two_five_units',
 'four_seven_seven_zero_units',
 'total_bags',
 's_bags',
 'l_bags',
 'xl_bags',
 'prod_total_volume',
 'ratio price_total_volume vs prod_total_volume',
 'california',
 'ratio price_total_volume vs california',
 'chile',
 'mexico',
 'peru',
 'columbia',
 'month',
 'geography_Albany',
 'geography_Atlanta',
 'geography_Baltimore/Washington',
 'geography_Boise',
 'geography_Boston',
 'geography_Buffalo/Rochester',
 'geography_California',
 'geography_Charlotte',
 'geography_Chicago',
 'geography_Cincinnati/Dayton',
 'geography_Columbus',
 'geography_Dallas/Ft. Worth',
 'geography_Denver',
 'geography_Detroit',
 'geography_Grand Rapids',
 'geography_Great Lakes',
 'geography_Harrisburg/Scranton',
 'geography_Hartford/Springfield',
 'geography_Houston',
 'geography_Indianapolis',
 'geography_Jacksonville',
 'geography_Las Vegas',
 'geography_Los Angeles',
 'geography_Louisville',
 'geograp

In [13]:
# Drop the non-beneficial columns - drop "prod_total_volume" after the ratio calculations.
# Reassign the result instead of mutating with inplace=True, so the step reads
# as a plain transformation of the encoded frame.
avocado_cat_encoded = avocado_cat_encoded.drop(columns=["date", "prod_total_volume"])
avocado_cat_encoded.head()

Unnamed: 0,avg_price,price_total_volume,four_zero_four_six_units,four_two_two_five_units,four_seven_seven_zero_units,total_bags,s_bags,l_bags,xl_bags,ratio price_total_volume vs prod_total_volume,...,geography_Spokane,geography_St. Louis,geography_Syracuse,geography_Tampa,geography_Total U.S.,geography_West,geography_West Tex/New Mexico,type_conventional,type_conventional.1,type_organic
0,1.42,95246.38,2897.41,76570.67,44.0,15734.3,10012.8,5721.5,0.0,0.001805,...,0,0,0,0,0,0,0,1,0,0
1,1.47,4140.95,7.3,301.87,0.0,3831.78,3831.78,0.0,0.0,7.8e-05,...,0,0,0,0,0,0,0,0,0,1
2,1.69,135196.35,3133.37,116520.88,88.78,15453.32,10023.79,5429.53,0.0,0.002472,...,0,0,0,0,0,0,0,1,0,0
3,1.54,3346.54,14.67,253.01,0.0,3078.86,3078.86,0.0,0.0,6.1e-05,...,0,0,0,0,0,0,0,0,0,1
4,1.57,93625.03,3101.17,74627.23,55.59,15841.04,11614.79,4159.58,66.67,0.002067,...,0,0,0,0,0,0,0,1,0,0


In [14]:
# Build the feature matrix X: the volume ratio, the month, and the one-hot
# encoded geography and type columns.
# NOTE(review): every dummy level of both categorical groups appears to be
# selected while the model also fits an intercept; the ~1e10 coefficients in the
# recorded output are consistent with perfectly collinear columns. Consider
# dropping one reference level per group -- confirm the impact on later models first.
X = avocado_cat_encoded.loc[:, [
    # Numeric features
    'ratio price_total_volume vs prod_total_volume',
    'month',
    # Geography dummies
    'geography_Albany',
    'geography_Atlanta',
    'geography_Baltimore/Washington',
    'geography_Boise',
    'geography_Boston',
    'geography_Buffalo/Rochester',
    'geography_California',
    'geography_Charlotte',
    'geography_Chicago',
    'geography_Cincinnati/Dayton',
    'geography_Columbus',
    'geography_Dallas/Ft. Worth',
    'geography_Denver',
    'geography_Detroit',
    'geography_Grand Rapids',
    'geography_Great Lakes',
    'geography_Harrisburg/Scranton',
    'geography_Hartford/Springfield',
    'geography_Houston',
    'geography_Indianapolis',
    'geography_Jacksonville',
    'geography_Las Vegas',
    'geography_Los Angeles',
    'geography_Louisville',
    'geography_Miami/Ft. Lauderdale',
    'geography_Midsouth',
    'geography_Nashville',
    'geography_New Orleans/Mobile',
    'geography_New York',
    'geography_Northeast',
    'geography_Northern New England',
    'geography_Orlando',
    'geography_Philadelphia',
    'geography_Phoenix/Tucson',
    'geography_Pittsburgh',
    'geography_Plains',
    'geography_Portland',
    'geography_Raleigh/Greensboro',
    'geography_Richmond/Norfolk',
    'geography_Roanoke',
    'geography_Sacramento',
    'geography_San Diego',
    'geography_San Francisco',
    'geography_Seattle',
    'geography_South Carolina',
    'geography_South Central',
    'geography_Southeast',
    'geography_Spokane',
    'geography_St. Louis',
    'geography_Syracuse',
    'geography_Tampa',
    'geography_Total U.S.',
    'geography_West',
    'geography_West Tex/New Mexico',
    # Type dummies (the second name carries a trailing space, as in the data)
    'type_conventional',
    'type_conventional ',
    'type_organic',
]]

In [15]:
# Define the regression target y: the average avocado price.
y = avocado_df["avg_price"]

In [16]:
# Split the data into training (75%) and testing (25%) sets.
# A fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [17]:
# Create the linear regression estimator and train it on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression()

In [18]:
# Use the fitted model to predict prices for the held-out test features.
y_pred = model.predict(X=X_test)

In [19]:
# Show the fitted coefficients (one per feature column) followed by the intercept,
# each on its own line.
print(model.coef_, model.intercept_, sep="\n")

[-1.19008847e-01  8.55748309e-03  1.25526147e+10  1.25526147e+10
  1.25526147e+10  1.25526147e+10  1.25526147e+10  1.25526147e+10
  1.25526147e+10  1.25526147e+10  1.25526147e+10  1.25526147e+10
  1.25526147e+10  1.25526147e+10  1.25526147e+10  1.25526147e+10
  1.25526147e+10  1.25526147e+10  1.25526147e+10  1.25526147e+10
  1.25526147e+10  1.25526147e+10  1.25526147e+10  1.25526147e+10
  1.25526147e+10  1.25526147e+10  1.25526147e+10  1.25526147e+10
  1.25526147e+10  1.25526147e+10  1.25526147e+10  1.25526147e+10
  1.25526147e+10  1.25526147e+10  1.25526147e+10  1.25526147e+10
  1.25526147e+10  1.25526147e+10  1.25526147e+10  1.25526147e+10
  1.25526147e+10  1.25526147e+10  1.25526147e+10  1.25526147e+10
  1.25526147e+10  1.25526147e+10  1.25526147e+10  1.25526147e+10
  1.25526147e+10  1.25526147e+10  1.25526147e+10  1.25526147e+10
  1.25526147e+10  1.25526147e+10  1.25526147e+10  1.25526147e+10
  6.97492144e+10  6.97492144e+10  6.97492144e+10]
-82301829137.59505


In [20]:
# Report test-set performance: R2 and mean squared error, rounded to two decimals.
print("R2 score : %.2f" % r2_score(y_true=y_test, y_pred=y_pred))
print("Mean squared error: %.2f" % mean_squared_error(y_true=y_test, y_pred=y_pred))

R2 score : 0.62
Mean squared error: 0.04


In [21]:
#pd.Series(y_pred)

In [21]:
#pd.Series(y_pred).plot()