In [None]:
#import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import calendar
import scipy.stats as st
from scipy.stats import linregress

In [None]:
#load the avocado sales csv into a dataframe
csv_file = 'Main Avocado Tree (Resources)/avocado-updated-2020.csv'
avocado_df = pd.read_csv(filepath_or_buffer=csv_file)
avocado_df


In [None]:
#load the per-metropolitan-area income csv into a dataframe and preview it
csv_income = 'Main Avocado Tree (Resources)/income per metropolitan area.csv'
income_df = pd.read_csv(filepath_or_buffer=csv_income)
income_df.head()

In [None]:
#Lower-case the two key columns so they match the avocado column names
income_column_names = {"Geography": "geography", "Year": "year"}
rename_income_df = income_df.rename(income_column_names, axis="columns")
rename_income_df.head()

In [None]:
#convert date into month and day columns; the leading piece of the date is
#skipped due to redundancy with the year column. assign() replaces month/day
#if they already exist, so re-running this cell does not duplicate columns
date_parts = avocado_df['date'].str.split('-', expand=True)
avocado_df = avocado_df.assign(month=date_parts[1], day=date_parts[2])

#better geography match
#used https://hassavocadoboard.com/category-data to confirm state match to income data
#Drop Cities with same name, wrong state from income data
wrong_state_metros = [
    #Drop all Springfields except Springfield MA
    ('Springfield', 'MO'),
    ('Springfield', 'IL'),
    ('Springfield', 'OH'),
    ('Eugene-Springfield', 'OR'),
    #Drop Albanies not NY
    ('Albany-Lebanon', 'OR'),
    ('Albany', 'GA'),
    #Drop all Columbuses not OH
    ('Columbus', 'GA-AL'),
    ('Columbus', 'IN'),
    #Drop all Jacksonvilles not FL
    ('Jacksonville', 'NC'),
    #Drop all Portlands not OR
    ('Portland-South Portland', 'ME'),
    #Drop all Rochesters not NY
    ('Rochester', 'MN'),
]
#compare on the stripped state code so a leading/trailing space in the State
#column (e.g. ' OR') cannot make a filter silently match nothing
state_code = rename_income_df['State'].str.strip()
is_wrong_state = pd.Series(False, index=rename_income_df.index)
for metro, state in wrong_state_metros:
    is_wrong_state |= (rename_income_df['geography'] == metro) & (state_code == state)
rename_income_df = rename_income_df[~is_wrong_state]

#text to column cities in income and avocado data
#income names are '-' separated (up to four cities), avocado names '/' separated (up to two)
columnTOtext1 = rename_income_df['geography'].str.split('-',expand=True)
columnTOtext1.columns = [f'geography-{i}' for i in range(4)]
new_income = rename_income_df.join(columnTOtext1).drop(['geography'], axis=1)
columnTOtext = avocado_df['geography'].str.split('/',expand=True)
columnTOtext.columns = [f'geography-{i}' for i in range(2)]
new_avocado = avocado_df.join(columnTOtext).drop(['geography'], axis=1)

#how many cities each income row names: a missing geography-N means "no such city".
#testing for missing values directly avoids relying on where the missing value
#happens to sit in the list returned by unique()
has_second_city = new_income['geography-1'].notna()
has_third_city = new_income['geography-2'].notna()
has_fourth_city = new_income['geography-3'].notna()

#keep rows that only have one city, merge, then remove already merged cities from new_avocado
new_income_first = new_income[~has_second_city].drop(['geography-1', 'geography-2', 'geography-3'], axis=1)
first = new_avocado.merge(new_income_first, how='inner', on=['geography-0', 'year'])
first_geo_0 = first['geography-0'].unique().tolist()
new_avocado = new_avocado[~new_avocado['geography-0'].isin(first_geo_0)]

#keep rows that have exactly two cities, merge, then remove already merged cities
new_income_second = new_income[has_second_city & ~has_third_city].drop(['geography-2', 'geography-3'], axis=1)
second = new_avocado.merge(new_income_second, how='inner', on=['geography-0', 'year'])
#both sides carry geography-1; keep the avocado one (_x)
second = second.drop(['geography-1_y'], axis=1).rename(columns={'geography-1_x':'geography-1'})
second_geo_0 = second['geography-0'].unique().tolist()
new_avocado = new_avocado[~new_avocado['geography-0'].isin(second_geo_0)]

#keep rows that have exactly three cities, merge, then remove already merged cities
new_income_third = new_income[has_third_city & ~has_fourth_city].drop(['geography-3'], axis=1)
third = new_avocado.merge(new_income_third, how='inner', on=['geography-0', 'year'])
third = third.drop(['geography-1_y', 'geography-2'], axis=1).rename(columns={'geography-1_x':'geography-1'})
third_geo_0 = third['geography-0'].unique().tolist()
new_avocado = new_avocado[~new_avocado['geography-0'].isin(third_geo_0)]

#keep rows that have four cities and merge
new_income_fourth = new_income[has_fourth_city]
fourth = new_avocado.merge(new_income_fourth, how='inner', on=['geography-0', 'year'])
fourth = fourth.drop(['geography-1_y', 'geography-2', 'geography-3'], axis=1).rename(columns={'geography-1_x':'geography-1'})

#combine merged dataframes into one; ignore_index gives the result a fresh unique
#index (DataFrame.append no longer exists in pandas 2.x, concat works everywhere)
income_avocado = pd.concat([first, second, third, fourth], ignore_index=True)

#get list of cities in dataframe
unique_cities = income_avocado['geography-0'].unique().tolist()
income_avocado






In [None]:
#average every numeric column per (year, month). numeric_only=True skips text
#columns such as date, which make a plain .mean() raise on pandas 2.x
grouped_by_month_year = income_avocado.groupby(["year", "month"]).mean(numeric_only=True).round(2)
grouped_by_month_year

In [None]:
#average every numeric column per (city, year). numeric_only=True skips text
#columns such as date, which make a plain .mean() raise on pandas 2.x
grouped_year_city = income_avocado.groupby(["geography-0", "year"]).mean(numeric_only=True).round(2)
grouped_year_city

In [None]:
# Question 1: Which cities love avocados most? Arianne
#Task 1: popularity of avocado bar graph of volume purchased per city
#one row per city, one column (and one bar colour) per year
volume_by_city_year = grouped_year_city['total_volume'].unstack()
ax = volume_by_city_year.plot.bar(color=('lightgreen', 'mediumspringgreen', 'mediumseagreen'))
ax.set_xlabel("City")
plt.xticks(rotation=90)
ax.set_ylabel("Volume Purchased")
ax.set_title("Cities Who Love Avocados by Volume purchased")
plt.savefig("Output/Cities_who_love_Avocados.png")
plt.show()

In [None]:
#keep the 2019 rows only. Selecting the wanted year directly (instead of
#excluding 2017 and 2018) guarantees no other year can end up in Only_2019
Only_2019 = income_avocado[income_avocado['year'] == 2019]
Only_2019

In [None]:
#call the first-city column "city" from here on
Only_2019 = Only_2019.rename({"geography-0": "city"}, axis="columns")

In [None]:
#only the city and the volume sold are needed for the 2019 popularity chart
essential_columns = ['city', 'total_volume']
essential_2019 = Only_2019[essential_columns]
essential_2019

In [None]:
#total 2019 volume per city as a bar chart
volume_2019_by_city = essential_2019.groupby('city').sum()
ax = volume_2019_by_city.plot.bar(color='forestgreen')
ax.set_xlabel("City")
plt.xticks(rotation=90)
ax.set_ylabel("Volume Purchased")
ax.set_title("Cities Who Love Avocados by Volume purchased 2019")
plt.savefig("Output/2019_loves_only.png")
plt.show()

In [None]:
#read the world cities csv into a dataframe
csv_file2 = 'Main Avocado Tree (Resources)/worldcities.csv'
pop_df = pd.read_csv(filepath_or_buffer=csv_file2)
pop_df

In [None]:
#Attach the world-cities columns to the 2019 avocado rows, matching on city name only
#NOTE(review): if pop_df lists several cities under one name, every match repeats
#the avocado rows for that city - confirm against the csv
avocado_pop = Only_2019.merge(pop_df, how='inner', on=['city'])
avocado_pop

In [None]:
#columns carried forward into the population analysis
population_analysis_columns = [
    'month', 'year', 'city', 'population', 'average_price',
    'total_volume', '4046', '4225', '4770', 'type',
]
cleaned_avocado_pop = avocado_pop[population_analysis_columns]
cleaned_avocado_pop

In [None]:
#first look at population sizes:
essential_population = cleaned_avocado_pop[["city", "population"]]
#the merge put a city's population on every one of its avocado rows, so take a
#single value per city; summing would multiply the population by the row count
#NOTE(review): max() also picks the largest entry if several same-named cities matched - confirm
essential_population.groupby(['city']).max().plot(kind='bar', color='limegreen')
plt.xlabel("City")
plt.xticks(rotation=90)
plt.ylabel("Population")
plt.title("2019 Population for Avocado loving-Cities")
plt.savefig("Output/2019_Population.png")
plt.show()

In [None]:
#distinct cities in the 2019 data, in order of first appearance
unique = pd.unique(essential_2019['city'])
unique

In [None]:
#Side by side comparison
#total 2019 volume per city
volume_by_city = essential_2019.groupby('city')['total_volume'].sum()
#population is repeated on every row of a city, so take one value instead of summing
population_by_city = essential_population.groupby('city')['population'].max()
#put both series on one shared, sorted city index so each point and each tick
#label refer to the same city; a city missing from one series shows as a gap
comparison = pd.concat([volume_by_city, population_by_city], axis=1).sort_index()
ax = comparison.plot(color=['forestgreen', 'limegreen'])
plt.xlabel("City")
#label every position with the city actually plotted there
plt.xticks(np.arange(len(comparison)), comparison.index, rotation=90)
plt.ylabel("Volume")
plt.title("Population Comparison VS Volume of Avocados Sold")
plt.savefig("Output/2019_comparison.png")

plt.show()

In [None]:
#Question 2: Do states with higher income per capita buy more avocados? (income per capita from 2017-2019) Erica
#Task 2: line graph (x-values states, y-values income per capita and avocado revenue)

#mean income and mean volume for every (city, year) pair
income_avocado_g = income_avocado.groupby(['geography-0', 'year'])
income = income_avocado_g['Per capita personal income'].mean()
purchased = income_avocado_g['total_volume'].mean()

#linear regression of volume purchased on income
x_values, y_values = income, purchased
(slope, intercept, rvalue, pvalue, stderr) = st.mstats.linregress(x_values, y_values)
regress_values = intercept + slope*x_values
line_eq = f"y = {round(slope,2)!s}x + {round(intercept, 2)!s}"

#scatter of the pairs with the fitted line and its equation on top
fig, ax1 = plt.subplots()
fig.set_size_inches(7, 6)
ax1.scatter(x_values, y_values, color='green')
ax1.plot(x_values, regress_values, color='brown')
ax1.annotate(line_eq, (70000,3000000), fontsize=15, color="darkgreen")
#plain tick labels: no offset and no scientific notation
ax1.ticklabel_format(useOffset=False, style='plain')
ax1.set_xlabel('Income Per Capita')
ax1.set_ylabel('Avocados Purchased')
plt.show()
fig.savefig('./Output/IncomevsVolume.png')

correlation = st.mstats.pearsonr(income, purchased)
print(f"The correlation between both factors is {round(correlation[0], 2)}")

In [None]:
#Question 3: Which type of avocado is the biggest source of revenue for suppliers? Erica
#Task 3: Three pie charts by type (value = revenue) for each year on one png

#create revenue column
income_avocado['revenue'] = (income_avocado['average_price']*income_avocado['total_volume']).round(2)

#list of unique years in data
years = income_avocado['year'].unique().tolist()
#list of unique types in data
types = income_avocado['type'].unique().tolist()

#seperate into dataframes for each year
typeANDrevenue_2017 = income_avocado.loc[income_avocado['year']==2017, ['type', 'revenue']]
typeANDrevenue_2018 = income_avocado.loc[income_avocado['year']==2018, ['type', 'revenue']]
typeANDrevenue_2019 = income_avocado.loc[income_avocado['year']==2019, ['type', 'revenue']]
#sum of revenues per type
revenues_2017 = typeANDrevenue_2017.groupby('type').sum()
revenues_2018 = typeANDrevenue_2018.groupby('type').sum()
revenues_2019 = typeANDrevenue_2019.groupby('type').sum()

colors = ['green', 'lightgreen']
#pull the first wedge out slightly (one entry per wedge, so this expects two types)
explode = (0.1,0)
#create subplots
figure, (ax1, ax2, ax3) = plt.subplots(1,3,subplot_kw={'aspect':'equal'})
yearly_pies = (
    (ax1, '2017', revenues_2017),
    (ax2, '2018', revenues_2018),
    (ax3, '2019', revenues_2019),
)
for pie_ax, year_label, revenues in yearly_pies:
    #label wedges from the grouped index: it is in the same order as the values,
    #whereas the order of unique() depends on which type appears first in the data
    pie_ax.pie(revenues['revenue'], explode=explode, labels=list(revenues.index), colors=colors,
               autopct="%1.1f%%", shadow=True, startangle=140)
    pie_ax.set_xlabel(year_label)
ax2.set_title("Proportions of Revenue by Avocado Type for Each Year")

#save the pie figure itself, and do it before show() hands the figure to the backend
figure.savefig('./Output/RevenueByTypeByYear.png')
plt.show()

In [None]:
#Question 4: Is there a time of year that sees a spike in revenue? Erica
#Task 4: bar chart (x-values months, y-values avocado revenue)

#take month and day from the date column rather than converting the month
#column in place, so the conversion still works when the cell is run again
sale_dates = pd.to_datetime(income_avocado['date'])
income_avocado['day'] = sale_dates.dt.day
#month abbreviations in calendar order (entry 0 of calendar.month_abbr is an empty string)
months = list(calendar.month_abbr)[1:]
#make categorical, so grouping and pivoting follow the order of the months list
income_avocado['month'] = pd.Categorical(
    sale_dates.dt.month.apply(lambda x: calendar.month_abbr[x]),
    categories=months,
)

#pivot data, so there's a column for each month, each row has year and sum of revenue column
#thousands separator plus two decimals; a space inside the spec is not a valid format
pd.options.display.float_format = '{:,.2f}'.format
income_avocado_pivot = pd.pivot_table(income_avocado, values='revenue', index='year',
                                      columns='month', aggfunc='sum', observed=False)

#plot a bar chart using the pivoted data
ax1 = income_avocado_pivot.plot(kind="bar")
#get a Matplotlib figure from the axes object for formatting purposes
fig = ax1.get_figure()
#change the plot dimensions (width, height)
fig.set_size_inches(7, 6)
#change the axes labels
ax1.set_xlabel("Years")
ax1.set_ylabel("Total Revenues Per Month ($100 Millions)")
#legend outside the plot area, to the right
ax1.legend(fancybox=True, bbox_to_anchor=(1.05, 1.0), loc='upper left')
ax1.grid(True)
# Export the plot as a PNG file before showing it
fig.savefig('./Output/RevenueByMonthByYear.png')
plt.show()

In [None]:
#Question 5: How has the average avocado price changed over time?  What is the predicted avocado price for 2021 based on our data? Bitty
#Task 5: scatter plot with regression to predict 2021 average price
#monthly averages of the numeric columns for 2017
data_2017 = income_avocado[income_avocado["year"] == 2017]
#numeric_only skips text columns (a plain .mean() raises on them in pandas 2.x);
#observed=False keeps a row for every month category, even one with no data
month_2017 = data_2017.groupby("month", observed=False).mean(numeric_only=True).round(2)
#number each month from its own label (Jan -> 1 ... Dec -> 12) instead of assuming
#the rows are exactly twelve and already in calendar order
month_2017['month'] = [list(calendar.month_abbr).index(label) for label in month_2017.index]
month_2017 = month_2017[["year", "average_price", "month"]]
month_2017

In [None]:
#monthly averages of the numeric columns for 2018
data_2018 = income_avocado[income_avocado["year"] == 2018]
#numeric_only skips text columns (a plain .mean() raises on them in pandas 2.x);
#observed=False keeps a row for every month category, even one with no data
month_2018 = data_2018.groupby("month", observed=False).mean(numeric_only=True).round(2)
#number each month from its own label (Jan -> 1 ... Dec -> 12) instead of assuming
#the rows are exactly twelve and already in calendar order
month_2018['month'] = [list(calendar.month_abbr).index(label) for label in month_2018.index]
month_2018 = month_2018[["year", "average_price", "month"]]
month_2018

In [None]:
#monthly averages of the numeric columns for 2019
data_2019 = income_avocado[income_avocado["year"] == 2019]
#numeric_only skips text columns (a plain .mean() raises on them in pandas 2.x);
#observed=False keeps a row for every month category, even one with no data
month_2019 = data_2019.groupby("month", observed=False).mean(numeric_only=True).round(2)
#number each month from its own label (Jan -> 1 ... Dec -> 12) instead of assuming
#the rows are exactly twelve and already in calendar order
month_2019['month'] = [list(calendar.month_abbr).index(label) for label in month_2019.index]
month_2019 = month_2019[["year", "average_price", "month"]]
month_2019

In [None]:
#look at bar graphs comparing months
n = 12
width = 0.3
#left edge position of each month's group of bars
month_names2 = np.arange(n)

bar_2017 = month_2017["average_price"]
bar_2018 = month_2018["average_price"]
bar_2019 = month_2019["average_price"]

fig = plt.figure()
ax = fig.add_subplot(111)
#shift each year by one bar width so the three bars of a month sit side by side
#instead of being drawn on top of each other
rects1 = ax.bar(month_names2, bar_2017, width, color='royalblue')
rects2 = ax.bar(month_names2+width, bar_2018, width, color='seagreen')
rects3 = ax.bar(month_names2+2*width, bar_2019, width, color='g')
plt.ylabel('Average Price (dollars)')
plt.title('Average Avocado Price vs. Month')
labels = ('2017', '2018', '2019')
plt.legend(labels=labels,loc='best')
#centre each tick under the middle (2018) bar of its group
plt.xticks(month_names2 + width, ('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'), rotation=90)
plt.savefig('Output/question5graphics.png')
plt.show()

In [None]:
#bar graph of the monthly average price, one column (bar colour) per year
yearly_price_columns = {'2017': bar_2017, '2018': bar_2018, '2019': bar_2019}
df = pd.DataFrame(yearly_price_columns)
ax = df.plot.bar(stacked=False)
ax.set_title('Average Avocado Price vs. Month')
ax.set_ylabel('Price (dollars)')

In [None]:
#bar graph of average price per month, 2017
months_2017, prices_2017 = month_2017['month'], month_2017['average_price']
plt.bar(x=months_2017, height=prices_2017, color='green', alpha=0.75)
plt.xlabel('months')
plt.ylabel('Price (dollars)')
plt.title('Average Avocado Price vs. Month 2017')
plt.savefig('Output/question5graphics1.png')
plt.show()

In [None]:
#bar graph of average price per month, 2018
months_2018, prices_2018 = month_2018['month'], month_2018['average_price']
plt.bar(x=months_2018, height=prices_2018, color='green', alpha=0.6)
plt.xlabel('months')
plt.ylabel('Price (dollars)')
plt.title('Average Avocado Price vs. Month 2018')
plt.savefig('Output/question5graphics2.png')
plt.show()

In [None]:
#bar graph of average price per month, 2019
months_2019, prices_2019 = month_2019['month'], month_2019['average_price']
plt.bar(x=months_2019, height=prices_2019, color='green', alpha=0.5)
plt.xlabel('months')
plt.ylabel('Price (dollars)')
plt.title('Average Avocado Price vs. Month 2019')
plt.savefig('Output/question5graphics3.png')
plt.show()

In [None]:
#one scatter series per year: month number against that month's average price
labels= ('2017', '2018', '2019')
yearly_frames = (month_2017, month_2018, month_2019)
for year_label, monthly_prices in zip(labels, yearly_frames):
    plt.scatter(monthly_prices['month'], monthly_prices['average_price'], label=year_label)
plt.title('Average Price of Avocados vs. Months 2017-2019')
plt.xlabel('Months 2017-2019')
plt.ylabel('average price ($)')
plt.grid()
plt.legend(loc='best')
plt.savefig('Output/question5graphicsscatter.png')
plt.show()

In [None]:
#linear regress 2017
x_values = month_2017['month']
y_values = month_2017['average_price']
#fit once: the result unpacks into the five values and also exposes them as attributes
res = st.linregress(x_values, y_values)
(slope, intercept, rvalue, pvalue, stderr) = res
line_eq = f"y= {round(slope,4)!s}x + {round(intercept,2)!s}"

print(f'The r-squared value is: {res.rvalue**2}')
print(f'The r-value is: {res.rvalue}')

#monthly prices with the fitted line and its equation
plt.scatter(x_values, y_values, edgecolors='black')
plt.plot(x_values, res.intercept + res.slope*x_values, 'r', label='fitted line')
plt.annotate(line_eq, (2,1.7), fontsize=12, color='red')
plt.title('Average Price of Avocados vs. Months 2017')
plt.xlabel('Months in 2017')
plt.ylabel('average price ($)')
plt.grid()
plt.savefig('Output/question5graphicsscatter2017.png')
plt.show()

In [None]:
#linear regress 2018
x_values = month_2018['month']
y_values = month_2018['average_price']
#fit once: the result unpacks into the five values and also exposes them as attributes
res = st.linregress(x_values, y_values)
(slope, intercept, rvalue, pvalue, stderr) = res
line_eq = f"y= {round(slope,4)!s}x + {round(intercept,2)!s}"

print(f'The r-squared value is: {res.rvalue**2}')
print(f'The r-value is: {res.rvalue}')

#monthly prices with the fitted line and its equation
plt.scatter(x_values, y_values, edgecolors='black')
plt.plot(x_values, res.intercept + res.slope*x_values, 'r', label='fitted line')
plt.annotate(line_eq, (2,1.40), fontsize=12, color='red')
plt.title('Average Price of Avocados vs. Months 2018')
plt.xlabel('Months in 2018')
plt.ylabel('average price ($)')
plt.grid()
plt.savefig('Output/question5graphicsscatter2018.png')
plt.show()

In [None]:
#linear regress 2019
#Task 5: scatter plot with regression to predict 2021 average price
x_values = month_2019['month']
y_values = month_2019['average_price']
#fit once: the result unpacks into the five values and also exposes them as attributes
res = st.linregress(x_values, y_values)
(slope, intercept, rvalue, pvalue, stderr) = res
line_eq = "y= " + str(round(slope,4)) + "x + " + str(round(intercept,2))

print(f'The r-squared value is: {res.rvalue**2}')
print(f'The r-value is: {res.rvalue}')

#forecast for May 2021. The within-2019 line above evaluated at month 5 only
#gives the fitted May 2019 price, so the forecast uses a trend fitted over all
#three years on a continuous month count and evaluated at the target month
desired_prediction_month = 5
desired_prediction_year = 2021
monthly_history = pd.concat([month_2017, month_2018, month_2019]).dropna(subset=['average_price'])
first_year = monthly_history['year'].min()
#months elapsed since the start of the first year (January of the first year is 1)
elapsed_months = (monthly_history['year'] - first_year) * 12 + monthly_history['month']
trend = st.linregress(elapsed_months, monthly_history['average_price'])
target_month = (desired_prediction_year - first_year) * 12 + desired_prediction_month
predicted_price = trend.slope*target_month + trend.intercept
print(f'Based on our linear regression we expect avocados to cost $ {round(predicted_price, 2)} in May 2021')

#2019 monthly prices with the 2019 fitted line and its equation
plt.scatter(x_values, y_values, edgecolors='black')
plt.title('Average Price of Avocados vs. Months 2019')
plt.xlabel('Months in 2019')
plt.ylabel('average price ($)')
plt.grid()
plt.annotate(line_eq, (2,1.6), fontsize=12, color='red')
plt.plot(x_values, res.intercept + res.slope*x_values, 'r', label='fitted line')
plt.savefig('Output/question5graphicsscatter2019.png')
plt.show()

In [None]:
#Question 6: Do different types of avocados have different price trends? Arianne

#Task 6: facet (multiple lines on one) line graph (x-values = year, y-values = average price) for each type

In [None]:
#mean volume of PLU 4046 for each (year, month)
ax = grouped_by_month_year['4046'].plot(marker='^', color='forestgreen')
ax.set_xlabel("Month by Year")
plt.xticks(rotation=45)
ax.set_ylabel("Volume of items purchased")
ax.set_title("Volume over time for PLU 4046")
plt.savefig("Output/PLU4046.png")
plt.show()

In [None]:
#mean volume of PLU 4225 for each (year, month)
ax = grouped_by_month_year['4225'].plot(marker='8', color='lime')
ax.set_xlabel("Month by Year")
plt.xticks(rotation=45)
ax.set_ylabel("Volume of items purchased")
ax.set_title("Volume over time for PLU 4225")
plt.savefig("Output/PLU4225.png")
plt.show()

In [None]:
#mean volume of PLU 4770 for each (year, month)
ax = grouped_by_month_year['4770'].plot(marker='d', color='mediumseagreen')
ax.set_xlabel("Month by Year")
plt.xticks(rotation=45)
ax.set_ylabel("Volume of items purchased")
ax.set_title("Volume over time for PLU 4770")
plt.savefig("Output/PLU4770.png")
plt.show()

In [None]:
#all three PLU volume lines drawn on one shared set of axes
ax = grouped_by_month_year['4046'].plot(marker='^', color='forestgreen')
grouped_by_month_year['4225'].plot(ax=ax, marker='8', color='lime')
grouped_by_month_year['4770'].plot(ax=ax, marker='d', color='mediumseagreen')
ax.legend()
ax.set_title("Volume Comparison over PLU")
ax.set_xlabel("Month by Year")
plt.xticks(rotation=45)
ax.set_ylabel("Volume of Items purchased")
plt.savefig("Output/combined_volume.png")
plt.show()