In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

FILE="avocado.csv"

df = pd.read_csv(FILE)
df = df.drop('region', axis=1)
df = df.drop('Date', axis=1)
df = df.drop('Unnamed: 0', axis=1)
print(df.head(5))

   AveragePrice  Total Volume     4046       4225    4770  Total Bags  \
0          1.33      64236.62  1036.74   54454.85   48.16     8696.87   
1          1.35      54876.98   674.28   44638.81   58.33     9505.56   
2          0.93     118220.22   794.70  109149.67  130.50     8145.35   
3          1.08      78992.15  1132.00   71976.41   72.58     5811.16   
4          1.28      51039.60   941.48   43838.39   75.78     6183.95   

   Small Bags  Large Bags  XLarge Bags          type  year  
0     8603.62       93.25          0.0  conventional  2015  
1     9408.07       97.49          0.0  conventional  2015  
2     8042.21      103.14          0.0  conventional  2015  
3     5677.40      133.76          0.0  conventional  2015  
4     5986.26      197.69          0.0  conventional  2015  


In [2]:
print(df.isna().sum())

AveragePrice    0
Total Volume    0
4046            0
4225            0
4770            0
Total Bags      0
Small Bags      0
Large Bags      0
XLarge Bags     0
type            0
year            0
dtype: int64


In [3]:
print(df['type'].value_counts())

type
conventional    9126
organic         9123
Name: count, dtype: int64


In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_cols = df.columns.to_list()
scaled_cols.remove('type')
df_scaled = pd.DataFrame(scaler.fit_transform(df[scaled_cols].to_numpy()), columns=scaled_cols)
df_scaled['type'] = df['type'].apply(lambda x: 1 if (x=='organic') else 0)
print(df_scaled.head(5))

   AveragePrice  Total Volume      4046      4225      4770  Total Bags  \
0     -0.188689     -0.227716 -0.230816 -0.199902 -0.212091   -0.234170   
1     -0.139020     -0.230427 -0.231103 -0.208054 -0.211997   -0.233350   
2     -1.182069     -0.212085 -0.231007 -0.154478 -0.211325   -0.234730   
3     -0.809551     -0.223444 -0.230741 -0.185350 -0.211864   -0.237096   
4     -0.312861     -0.231538 -0.230891 -0.208719 -0.211834   -0.236718   

   Small Bags  Large Bags  XLarge Bags      year  type  
0   -0.232647   -0.222352     -0.17558 -1.221282     0  
1   -0.231568   -0.222335     -0.17558 -1.221282     0  
2   -0.233399   -0.222311     -0.17558 -1.221282     0  
3   -0.236568   -0.222186     -0.17558 -1.221282     0  
4   -0.236154   -0.221924     -0.17558 -1.221282     0  


In [19]:
from sklearn.model_selection import train_test_split
df_train, df_valid_test = train_test_split(df_scaled, test_size=0.2, random_state=42)
df_valid, df_test = train_test_split(df_valid_test, test_size=0.5, random_state=42)

target = 'AveragePrice'
y_train, y_valid, y_test = df_train[target], df_valid[target], df_test[target]
X_train, X_valid, X_test = df_train.drop(target, axis=1), df_valid.drop(target, axis=1), df_test.drop(target, axis=1)

In [20]:
from sklearn.neighbors import KNeighborsRegressor

best_score = 0
best_k = 0
for k in range(2,16):
    knr = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    score = knr.score(X_valid, y_valid)
    if score > best_score:
        best_score, best_k = score, k

print("VALIDATION:The best coefficient of determination is %f with neighbors = %d"%(best_score, best_k))
knr = KNeighborsRegressor(n_neighbors=best_k).fit(X_train, y_train)
test_score = knr.score(X_test, y_test)
print("TESTING: The test coefficient of determination is %f with neighbors = %d"%(test_score, best_k))

VALIDATION:The best coefficient of determination is 0.756045 with neighbors = 4
TESTING: The test coefficient of determination is 0.716717 with neighbors = 4


In [None]:
from sklearn.ensemble import RandomForestRegressor

best_score = 0
best_n = 0
best_depth = 0
for n in range(15,22):
    for d in range(16,20):
        rfr = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=42).fit(X_train, y_train)
        score = rfr.score(X_valid, y_valid)
        if score > best_score:
            best_score, best_n, best_depth = score, n, d

print("VALIDATION: The best coefficient of determination is %f with n_estimators = %d and max_depth = %d"%(best_score, best_n, best_depth))
rfr = RandomForestRegressor(n_estimators=best_n, max_depth=best_depth, random_state=0).fit(X_test, y_test)
test_score = rfr.score(X_test, y_test)
print("TESTING: The test coefficient of determination is %f with n_estimators = %d and max_depth = %d"%(test_score, best_n, best_depth))

''' The results for random forest look great, but there's a big danger of overfitting in the validation phase. The test results look amazing but
that appears to be coincidence.
'''

VALIDATION: The best coefficient of determination is 0.804576 with n_estimators = 18 and max_depth = 19
TESTING: The test coefficient of determination is 0.938085 with n_estimators = 18 and max_depth = 19


" The results for random forest look great, but there's a big danger of overfitting in the validation phase. The test results look amazing but\nthat appears to be coincidence. Rerunning everything with a different random state should give different results.\n"