<a href="https://colab.research.google.com/github/dkurbatovv/Python/blob/main/Avocado_Price_Predict_by_PyCaret.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install pycaret

In [None]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import pycaret
import warnings

from sklearn.preprocessing import LabelEncoder
from scipy import stats
from scipy.stats import *
from pycaret.regression import *

In [None]:
# Load the avocado dataset from the working directory; `df` is the frame used
# by every later cell.
# NOTE(review): if this CSV carries an unnamed leading index column, pandas
# will load it as 'Unnamed: 0' and setup() will treat it as a feature —
# confirm, and consider index_col=0 if so.
df = pd.read_csv('avocado.csv')

In [None]:
# Preview the first rows to check that the file parsed as expected.
df.head()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Distinct non-null avocado types, in order of first appearance in the data.
labels = df.loc[df['type'].notna(), 'type'].unique()
labels

In [None]:
# --- Share and count of each avocado type ---
# The wedge labels are taken from the value_counts() index so that every wedge
# carries the label of the value it counts. unique() returns first-appearance
# order, which is not guaranteed to match value_counts() (count-descending)
# order, and relying on it also tied this cell to a variable from another cell.
type_counts = df['type'].value_counts()

plt.figure(figsize=(18, 10))

# Left panel: percentage share per type.
plt.subplot(1, 2, 1)
plt.title('Avocado Type Percentage', fontweight='bold', fontsize='14',
          fontfamily='sans-serif')
plt.pie(type_counts, labels=type_counts.index,
        wedgeprops=dict(alpha=0.8), autopct='%.2f%%')

# Right panel: raw row count per type.
countplt = plt.subplot(1, 2, 2)
plt.title('Avocado Type Bar Chart', fontweight='bold', fontsize='14',
          fontfamily='sans-serif')
ax = sns.countplot(x='type', data=df, alpha=0.85)

In [None]:
# --- Share and count of observations per year ---
# Counts are sorted chronologically and the labels are derived from the same
# Series, so each wedge is labelled with the year it actually counts.
# (unique() order and value_counts() order are not guaranteed to agree.)
year_counts = df['year'].value_counts().sort_index()
labels = year_counts.index.astype(str)

plt.figure(figsize=(18, 10))

# Left panel: percentage share per year.
plt.subplot(1, 2, 1)
plt.title('Avocado Year Percentage', fontweight='bold', fontsize='14',
          fontfamily='sans-serif')
plt.pie(year_counts, labels=labels,
        wedgeprops=dict(alpha=0.8), autopct='%.2f%%')

# Right panel: raw row count per year.
countplt = plt.subplot(1, 2, 2)
plt.title('Avocado Year Bar Chart', fontweight='bold', fontsize='14',
          fontfamily='sans-serif')
ax = sns.countplot(x='year', data=df, alpha=0.85)

In [None]:
# Transposed summary statistics of the continuous columns, rendered as a
# colour-graded table.
continuous_cols = ['AveragePrice', 'Total Volume', '4046', '4225',
                   '4770', 'Total Bags', 'Small Bags', 'Large Bags',
                   'XLarge Bags']
(
    df[continuous_cols]
    .describe()
    .T
    .style
    .background_gradient(cmap='YlOrBr')
    .set_properties(**{'font-family': 'Segoe UI'})
)

In [None]:
# --- Histogram (with KDE overlay) of continuous columns on a 4 x 2 grid ---
# (column, colour) pairs listed in row-major panel order.
hist_panels = [
    ('AveragePrice', '#004D25'), ('Total Volume', '#EDDE30'),  # row 1
    ('4046', '#48BF53'),         ('4225', '#837A0B'),          # row 2
    ('4770', '#664228'),         ('Total Bags', '#A4E637'),    # row 3
    ('Small Bags', '#BF865D'),   ('XLarge Bags', '#557F0F'),   # row 4
]

fig, axs = plt.subplots(4, 2, figsize=(18, 18))

# --- General Title ---
fig.subplots_adjust(top=0.95)
fig.suptitle('Histogram of Continuous Columns', fontweight='bold',
             fontsize='14', fontfamily='sans-serif')

# axs.flat walks the grid row by row, matching the order of hist_panels.
for panel, (column, shade) in zip(axs.flat, hist_panels):
    sns.histplot(data=df, x=column, kde=True, ax=panel, color=shade)

In [None]:
# --- Box plots of continuous columns on a 4 x 2 grid ---
# (column, colour) pairs listed in row-major panel order.
box_panels = [
    ('AveragePrice', '#004D25'), ('Total Volume', '#EDDE30'),  # row 1
    ('4046', '#48BF53'),         ('4225', '#837A0B'),          # row 2
    ('4770', '#664228'),         ('Total Bags', '#A4E637'),    # row 3
    ('Small Bags', '#BF865D'),   ('XLarge Bags', '#557F0F'),   # row 4
]

fig, axs = plt.subplots(4, 2, figsize=(18, 18))

# --- General Title ---
# The figure shows box plots, so the title says so (it previously read
# "Histogram", carried over from the histogram cell).
fig.subplots_adjust(top=0.95)
fig.suptitle('Box Plot of Continuous Columns', fontweight='bold',
             fontsize='14', fontfamily='sans-serif')

# axs.flat walks the grid row by row, matching the order of box_panels.
for panel, (column, shade) in zip(axs.flat, box_panels):
    sns.boxplot(data=df, x=column, ax=panel, color=shade)

In [None]:
# --- Creating Box Plot based on Type ---
# Average price per year, with one box per avocado type (hue).
fig, ax = plt.subplots()
fig.set_size_inches(17, 7)
plt.title('Average Price Distribution based on Types from 2015-2018', fontweight='bold',
          fontsize='14', fontfamily='sans-serif')
sns.boxplot(x='year', y='AveragePrice', hue='type', data=df, ax=ax,
            boxprops=dict(alpha=0.9), linewidth=1.5)
plt.xlabel('Year', fontweight='bold', fontsize='11', fontfamily='sans-serif')
plt.ylabel('Average Price', fontweight='bold', fontsize='11',
           fontfamily='sans-serif')
plt.xticks(fontsize='8')
plt.yticks(fontsize='8')

# Rebuild the legend from the handles seaborn registered for the hue levels.
# Passing only `labels` makes matplotlib pair them with whichever artists come
# first on the axes, which need not be the coloured boxes, and hard-coded
# labels can silently disagree with the actual hue order.
handles, hue_levels = ax.get_legend_handles_labels()
ax.legend(handles, [str(level).capitalize() for level in hue_levels],
          title='$\\bf{Type}$', fontsize='8',
          title_fontsize='9', loc='upper right', frameon=True)

plt.grid(axis='y', alpha=0.4)
plt.show()

In [None]:
# Split the two plotted columns by avocado type.
is_conventional = df['type'] == 'conventional'
is_organic = df['type'] == 'organic'
x0 = df.loc[is_conventional, 'Total Volume']
x1 = df.loc[is_organic, 'Total Volume']
y0 = df.loc[is_conventional, 'Total Bags']
y1 = df.loc[is_organic, 'Total Bags']

# --- Creating Scatter Plot ---
# The title names the variables actually plotted (Total Volume on x, Total
# Bags on y); it previously mentioned Avg. Price, which is not on this chart.
plt.figure(figsize=(10, 8))
plt.title('Scatter Plot between Total Volume and Total Bags', fontweight='bold',
          fontsize='14', fontfamily='sans-serif')
# Labels are attached to the artists so the legend cannot drift out of order.
plt.scatter(x=x0, y=y0, alpha=0.6, linewidths=1, label='Conventional')
plt.scatter(x=x1, y=y1, alpha=0.6, linewidths=1, label='Organic')
plt.legend(title='$\\bf{Type}$', fontsize='7',
           title_fontsize='8', loc='upper left', frameon=True)
plt.xlabel('Total Volume', fontweight='bold', fontsize='11',
           fontfamily='sans-serif')
plt.ylabel('Total Bags', fontweight='bold', fontsize='11',
           fontfamily='sans-serif')
# Show full numbers on the axes instead of scientific notation.
plt.ticklabel_format(style='plain', axis='both')
plt.grid(axis='both', alpha=0.5, lw=0.5)
plt.show()

In [None]:
# Parse `Date` into datetimes so calendar parts can be extracted from it.
df['Date'] = pd.to_datetime(df['Date'])

# --- Extracting Month Number from `Date`  ---
df['month'] = df['Date'].dt.month

In [None]:
# Preview again to confirm the parsed `Date` and the new `month` column.
df.head()

In [None]:
# Initialise the PyCaret regression experiment with AveragePrice as target:
# 80% of rows for training, the listed columns forced to categorical,
# robust scaling of numeric features, and a fixed session_id for
# reproducible results.
avc = setup(
    data=df,
    target='AveragePrice',
    train_size=0.8,
    categorical_features=['type', 'year', 'region', 'month'],
    normalize=True,
    normalize_method='robust',
    silent=True,
    ignore_low_variance=True,
    session_id=123,
)

In [None]:
# List the regression estimators available in PyCaret's model library.
models()

In [None]:
# Train and evaluate the available models, ranked by R2; with default
# arguments this returns the single top-ranked model.
best_models = compare_models(sort='R2')

In [None]:
# Default diagnostic plot for the top-ranked model (no plot type is passed).
plot_model(best_models)

In [None]:
# Prediction-error plot for the top-ranked model.
plot_model(best_models, plot='error')

In [None]:
# Feature-importance plot for the top-ranked model.
plot_model(best_models, plot='feature')

In [None]:
# Train a Random Forest regressor (model ID 'rf') and keep it as `rf` for the
# diagnostics, hold-out prediction and finalization below.
rf = create_model('rf')

In [None]:
# Default diagnostic plot for the random forest (no plot type is passed).
plot_model(rf)

In [None]:
# Prediction-error plot for the random forest.
plot_model(rf, plot='error')

In [None]:
# Feature-importance plot for the random forest.
plot_model(rf, plot='feature')

In [None]:
# Score the random forest; no `data` is passed, so PyCaret predicts on the
# hold-out split created by setup().
predict_model(rf)

In [None]:
# Refit the random forest with finalize_model() to obtain the model intended
# for deployment.
final_best = finalize_model(rf)

# --- Final Best Model Parameters for Deployment ---
# Inspect the finalized estimator itself; the plot previously targeted the
# pre-finalization `rf`, leaving `final_best` unused.
plot_model(final_best, plot='parameter')