In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# Linear program to optimize the selection and purchase of top-rated wines

**The goal of this notebook is to develop and apply a linear program that optimizes the selection (purchase) of top-rated wine within a budget.**

In [None]:
# Create a dataframe from the wine csv; the first CSV column is used as the row index.
# pd.DataFrame.from_csv was removed in pandas 1.0 -- read_csv(index_col=0) is its replacement.
wine = pd.read_csv('../input/winemag-data-130k-v2.csv', index_col=0)
wine.shape

In [None]:
# Peek at the first rows of the raw data (head() defaults to 5 rows)
wine.head()

In [None]:
# Keep only the fields relevant to the linear program and the later filters
relevant_columns = [
    'variety', 'winery', 'title',
    'country', 'province', 'region_1', 'region_2',
    'points', 'price',
]
wine_reduced = wine.filter(items=relevant_columns, axis='columns')
wine_reduced.head()

In [None]:
# Distribution of price for each points value

fig, ax = plt.subplots(figsize=(10, 6))
box = sns.boxplot(data=wine_reduced, x='points', y='price', ax=ax)
# Restyle the box artists: semi-transparent fill with a bold black outline
plt.setp(box.artists, alpha=0.5, linewidth=2, edgecolor='k')
plt.xticks(rotation=45)
box

# Based on the boxplot, there does seem to be an overall trend that higher rated wines also cost more.

In [None]:
# A scatterplot will also help us understand the relationship between points and price

x = wine_reduced['points']
y = wine_reduced['price']
plt.scatter(x, y)
# Label the axes so the figure can be read on its own
plt.xlabel('points')
plt.ylabel('price')
plt.title('Price vs. points')
plt.show()

# Correlate only the two numeric fields of interest. Calling corr() on the whole
# frame raises on the text columns in pandas >= 2.0 (numeric_only now defaults to False).
wine_reduced[['points', 'price']].corr()

The scatter plot shows a weak relationship between the two. The correlation table above provides further evidence of a weak linear relationship
between price and points. At face value, the above visualizations show that a simple linear regression model to predict the price of a bottle of wine based on its points will provide minimal value.

Of course, you can always transform the data, but that got me thinking about what other methods can generate value.
One idea: use linear programming to build an optimization model that maximizes the ratings of purchased wine, BUT within a budget.
Buying the best wine is always the goal, but there is always a budget constraint.

In [None]:
# Let's clean up the dataset. We need price and points, so rows missing either will be dropped,
# but first determine whether there are duplicate rows.
# Any existing 'dupes' column is excluded from the comparison so that re-running this cell
# gives the same flags (otherwise the flag itself would make duplicated rows look distinct).
wine_reduced['dupes'] = wine_reduced.drop(columns='dupes', errors='ignore').duplicated()

In [None]:
# Count duplicated vs. unique rows. The Series method replaces the
# top-level pd.value_counts helper, which is deprecated in recent pandas.
wine_reduced['dupes'].value_counts(sort=False)

In [None]:
# Rows flagged as duplicates ('dupes' is already a boolean mask)
wine_reduced[wine_reduced['dupes']]

In [None]:
# Treat a wine as unique by its variety, winery and title; keep the first occurrence of each
identity_columns = ['variety', 'winery', 'title']
wine_dedup = wine_reduced.drop_duplicates(subset=identity_columns, keep='first')

In [None]:
# Sanity check: after de-duplication only False should remain in the flag column
wine_dedup['dupes'].unique()

In [None]:
# Remove rows with missing points or price.
# Empty strings are first converted to NaN; the original call discarded the result of
# replace(), so that step had no effect. Chaining it makes the conversion actually apply.
wine_clean = (
    wine_dedup
    .replace('', np.nan)
    .dropna(subset=['points', 'price'], how='any')
)
wine_clean.head(20)

In [None]:
# Drop the helper 'dupes' flag now that de-duplication is done.
# `columns=` replaces the positional axis argument (no longer accepted by pandas >= 2.0);
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
wine_clean = wine_clean.drop(columns='dupes', errors='ignore')
wine_clean.head()


In [None]:
# Trial run: take the first 50 rows as an independent frame with whole-number prices
model_test = wine_clean.iloc[:50].assign(
    price=lambda rows: rows['price'].astype(int)
)

In [None]:
# What is the dimension (rows, columns) of our test data frame?
# Each row becomes one decision variable in the linear program below.
model_test.shape

In [None]:
from scipy import optimize

In [None]:
# linprog only solves minimization problems, so the sign of the objective (total points) is inverted.
# Decision variables: one per wine, bounded to [0, 1] (the share of that bottle bought).
# Constraint: a single row capping the total spend at the budget.
# NOTE: the variables are continuous, so the solution may contain a partially bought bottle;
# the later `buy == 1` filters leave such a bottle out.
result = optimize.linprog(
    c=model_test['points'].values * -1,
    # A_ub must be 2-D: one constraint row with one column per wine
    A_ub=model_test['price'].values.reshape(1, -1),
    b_ub=[100],
    bounds=(0, 1),
    # the legacy 'simplex' solver was removed in SciPy 1.11; HiGHS dual simplex replaces it
    method='highs-ds',
)
result.message

In [None]:
# Attach the solver's decision for each wine (1 = buy the bottle)
model_test = model_test.assign(buy=result.x)

In [None]:
# Wines the solver chose to buy in full
model_test.loc[model_test['buy'].eq(1)]

In [None]:
# Total price of the selected bottles (print inserts the separating space itself)
print("Total Monies Spent: ", model_test.loc[model_test['buy'].eq(1), 'price'].sum())

In [None]:
# Count the selected bottles by summing the boolean selection mask
print("The number of bottles of wine purchased:", int(model_test['buy'].eq(1).sum()))

**California Wine Mixer**. Now that we know the model works, let's set up a scenario where we only want to purchase California wine. We also want to limit the wine selection to the more popular wine varieties: Chardonnay, Cabernet Sauvignon, Pinot Gris/Grigio, Red Blend, Merlot, Pinot Noir and Rosé (see http://www.wineinstitute.org/resources/pressroom/05012017). We will also limit the dataset to wines rated 95 points and above to make the model really work. The objective stays the same: maximize the points/ratings of the wines purchased while staying within a budget.

In [None]:
# California wines rated 95 points or higher
is_california = wine_clean['province'] == 'California'
is_top_rated = wine_clean['points'] >= 95
cali = wine_clean[is_california & is_top_rated]

In [None]:
# Preview the first ten candidate wines
cali.head(n=10)

In [None]:
# Select the popular wine varieties (both spellings of Pinot Gris and Red Blend are accepted)
popular_varieties = [
    'Chardonnay', 'Cabernet Sauvignon', 'Pinot Gris', 'Pinot Grigio',
    'Red Blend', 'Red Blends', 'Merlot', 'Pinot Noir', 'Rosé',
]
cali = cali.loc[cali['variety'].isin(popular_varieties)]

In [None]:
# Which of the requested varieties actually remain after filtering?
cali['variety'].unique()

Looks like we lost Pinot Gris/Grigio when we filtered to California wines rated 95 points and above.

In [None]:
# (rows, columns) of the filtered selection; each row is one decision variable in the model below
cali.shape

In [None]:
# Let's set up our model and set a budget of 1000. This is going to be an awesome party!
# Objective: maximize total points (negated, because linprog minimizes).
# Decision variables: one per wine, bounded to [0, 1]; a single constraint row caps the spend.
# NOTE: the variables are continuous, so the solution may contain a partially bought bottle;
# the later `buy == 1` filters leave such a bottle out.
result = optimize.linprog(
    c=cali['points'].values * -1,
    # A_ub must be 2-D: one constraint row with one column per wine
    A_ub=cali['price'].values.reshape(1, -1),
    b_ub=[1000],
    bounds=(0, 1),
    # the legacy 'simplex' solver was removed in SciPy 1.11; HiGHS dual simplex replaces it
    method='highs-ds',
)
result.message

In [None]:
# Attach the solver's decision for each wine (1 = buy the bottle).
# assign() builds a new frame instead of writing into a filtered slice of wine_clean,
# which avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
cali = cali.assign(buy=result.x)

In [None]:
# Total price of the selected bottles (print inserts the separating space itself)
print("Total Monies Spent :", cali.loc[cali['buy'].eq(1), 'price'].sum())

In [None]:
# Wines the solver chose to buy in full
cali.loc[cali['buy'].eq(1)]

In [None]:
# Count the selected bottles by summing the boolean selection mask
print("The number of bottles of wine purchased:", int(cali['buy'].eq(1).sum()))