In [None]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
# nltk.download()

import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon

from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Show every column when a DataFrame is displayed (disables column truncation).
pd.set_option('display.max_columns', None)

In [None]:
# Load the red-wine data set; the file is semicolon-separated.
wine = pd.read_csv('redwine.csv', delimiter=';')
chemColNames = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']

# Cell values that cannot be parsed as a number, listed per column.
# They are blanked out (NaN) so the column can be cast to float.
colErrorPairs = {
    'density'    : [' . '],
    'citric acid': [' - ',' -   '],
    'alcohol'    : ['100.333.333.333.333','11.066.666.666.666.600','956.666.666.666.667','923.333.333.333.333']}

# One replace() call per column handles all of its faulty strings at once;
# the cast to float follows immediately so each column is fixed in one step.
for colName, faultyStrings in colErrorPairs.items():
    wine[colName] = wine[colName].replace(faultyStrings, np.nan).astype(float)

# Take the chemistry-only snapshot AFTER cleaning: wine[chemColNames] is a
# separate frame, so a snapshot taken earlier would keep the faulty strings
# (and the object dtype) of the uncleaned columns.
chem = wine[chemColNames]

# Keep only the first name of each taster. The vectorised .str accessor
# leaves missing names as NaN instead of raising AttributeError on a float.
wine['taster_name'] = wine['taster_name'].str.split(" ").str[0]

In [None]:
# Name table used to look up a gender for each taster's first name.
# Only the four columns needed for that lookup are read.
# NOTE(review): presumably one row per (Name, Gender, Year) with its frequency
# in Count -- confirm against names.csv.
name = pd.read_csv(
    'names.csv',
    usecols=['Name', 'Gender', 'Year', 'Count'],
)

# Onderzoeksvraag 2: Als we wijnen categoriseren op basis van de chemische samenstelling, zijn er bepaalde categorieën die mannelijke proevers anders beoordelen dan vrouwelijke proevers?


We willen bij deze onderzoeksvraag mannelijke en vrouwelijke proevers vergelijken door te kijken of de ene groep een bepaalde voorkeur heeft voor een bepaalde categorie wijn.

In [None]:
# Disabled experiment, kept for reference: restrict the name table to rows
# with Year >= 2014 before picking a gender per name.
# name = name[name['Year'] >= 2014]
# Display the (unfiltered) name table.
name

In [None]:
# Reduce the name table to one row per name: order by Count (largest first),
# keep the first row seen for every Name, then order alphabetically.
# NOTE(review): this keeps the row with the highest single Count per name,
# not the gender with the highest total over all rows -- confirm that this
# is the intended rule for ambiguous names.
name = (
    name
    .sort_values(by='Count', ascending=False)
    .drop_duplicates(subset='Name', keep='first')
    .sort_values(by='Name')
)

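Nu gaan we onze twee datasets mergen. We doen een left join op de twee kolommen waar de namen in staan, zodat ook wijnen waarvan de proever niet in de namenlijst voorkomt behouden blijven (hun geslacht blijft dan leeg), en printen de eerste 5 records.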

In [None]:
# Attach the name-table columns (Name, Gender, Year, Count) to every wine row.
# how="left" keeps wines whose taster name has no match; their Gender is NaN.
# validate="many_to_one" makes pandas raise a MergeError if a name occurs more
# than once in `name`, which would otherwise silently duplicate wine rows.
merge = wine.merge(
    name,
    left_on='taster_name',
    right_on='Name',
    how="left",
    validate="many_to_one",
)
# Show the first five records only.
merge.head()

Nu printen we de tabel waarbij we de naam van de proever en het geslacht tonen. 

In [None]:
# Two-column view of the merged frame: taster first name and looked-up gender.
# There is one row per row of `merge`, so a taster appears once for every
# wine they reviewed.
tasters = merge.loc[:, ['taster_name', 'Gender']]
tasters.head(5)

In [None]:
# Number of rows in `tasters` per Gender value. Rows without a gender (NaN)
# are left out by value_counts(), and the result is ordered by frequency.
# Because `tasters` has one row per merged wine row, this counts reviews per
# gender rather than distinct people.
count = tasters['Gender'].value_counts()

In [None]:
# value_counts() orders the genders by frequency, so a positional colour list
# would hand 'lightblue' to whichever gender happens to be most common.
# Colours are therefore looked up by label.
# NOTE(review): assumes the Gender labels start with 'M' / 'F' (e.g. 'M',
# 'Male') -- confirm against names.csv. If any label is not recognised, the
# original positional colours are used so the chart is never worse than before.
genderColors = {'M': 'lightblue', 'F': 'pink'}
pieColors = [genderColors.get(str(label)[:1].upper()) for label in count.index]
if None in pieColors:
    pieColors = ['lightblue', 'pink']

# The trailing semicolon suppresses the Axes repr in the cell output.
count.plot.pie(y ='Gender', figsize = (5,5), colors = pieColors);

## Clusteren


In [None]:
# Chemical feature columns used for clustering. The order matters: the
# scatter plot further down draws columns 0 and 1 of the resulting array,
# i.e. 'pH' against 'fixed acidity'.
chemColNames = [
    'pH',
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'sulphates',
    'alcohol',
]



In [None]:
# Standardise every chemical column to a z-score: (value - mean) / std, with
# the population standard deviation (ddof=0). Mean and std are computed per
# column over all non-missing values of `wine`. assign() returns a new frame,
# so `wine` itself is left untouched and this cell can be re-run safely.
wineZscore = wine.assign(**{
    col: (wine[col] - wine[col].mean()) / wine[col].std(ddof=0)
    for col in chemColNames
})

# Chemistry-only view of the standardised data, in chemColNames order.
chemZscore = wineZscore[chemColNames]

# KMeans cannot handle missing values, so only complete rows are kept.
kMeansData = chemZscore.dropna()

# A column with zero variance (or only missing values) becomes all-NaN after
# standardising, and dropna() would then silently discard every row.
assert not kMeansData.empty, "no complete rows left after standardising; check for constant or empty columns"

kMeansData.head(10)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fit a 3-cluster k-means model on the standardised chemistry data.
# NOTE(review): the fitted estimator is only displayed, not stored; the
# plotting cell further down fits its own model with random_state=10.
KMeans(
    random_state=0,
    n_clusters=3,
).fit(X=kMeansData)

In [None]:
# Plain NumPy copy of the clustering input, with columns in chemColNames order.
# The name is bound once, directly to the array, instead of holding first a
# DataFrame and then an ndarray. copy=True keeps it independent of kMeansData.
kMeansPlotData = kMeansData.to_numpy(copy=True)

# Show the first rows only rather than the whole array.
kMeansPlotData[:5]

In [None]:
# Number of clusters k: used for the KMeans fit and for scaling the cluster
# labels to colours in the plotting cell below.
n_cluster = 3

In [None]:
fig, (kmeansClustorPlot) = plt.subplots(figsize=(18, 7))

# n_init is pinned because its default differs between scikit-learn releases
# (10 restarts in older versions, 'auto' in newer ones); with it fixed, the
# same seed gives the same clustering regardless of the installed version.
clusterer = KMeans(n_clusters=n_cluster, n_init=10, random_state=10)
cluster_labels = clusterer.fit_predict(kMeansPlotData)

# Map each label 0..n_cluster-1 to a distinct colour of the nipy_spectral map.
colors = mpl.cm.nipy_spectral(cluster_labels.astype(float) / n_cluster)

# The model is fitted on all feature columns, but only the first two can be
# drawn: column 0 on the x-axis and column 1 on the y-axis.
kmeansClustorPlot.scatter(kMeansPlotData[:, 0], kMeansPlotData[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors, edgecolor='k')

# Cluster centres, projected onto the same two columns.
centers = clusterer.cluster_centers_
kmeansClustorPlot.scatter(centers[:, 0], centers[:, 1], marker='o', c="white", alpha=1, s=200, edgecolor='k')

# kMeansPlotData keeps the column order of chemColNames, so its first two
# entries name the plotted axes.
kmeansClustorPlot.set_xlabel(chemColNames[0] + ' (z-score)')
kmeansClustorPlot.set_ylabel(chemColNames[1] + ' (z-score)')
kmeansClustorPlot.set_title('K-means clusters (k=' + str(n_cluster) + ') projected on the first two features')

plt.show()