In [None]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
# nltk.download()

import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon

from scipy import stats

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score

# Lift the column limit so wide DataFrames are displayed with every column visible.
pd.set_option('display.max_columns', None)

In [None]:
# Load the wine reviews (semicolon-separated) and name the chemical measurement columns.
wine = pd.read_csv('redwine.csv', delimiter=';')
chemColNames = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']

# Malformed cell values per column. They are matched exactly and turned into NaN,
# because they would otherwise make the float conversion below fail.
colErrorPairs = {
    'density'    : [' . '],
    'citric acid': [' - ',' -   '],
    'alcohol'    : ['100.333.333.333.333','11.066.666.666.666.600','956.666.666.666.667','923.333.333.333.333']}

for colName, faultyStrings in colErrorPairs.items():
    wine[colName] = wine[colName].replace(faultyStrings, np.nan).astype(float)

# Take the chemistry snapshot only after cleaning, so it holds floats rather than
# the raw malformed strings.
chem = wine[chemColNames]

# Keep only the first name of each taster. The .str accessor propagates missing
# names as NaN, whereas calling .split on a NaN (a float) raises AttributeError.
wine['taster_name'] = wine['taster_name'].str.split(" ").str[0]

In [None]:
# Read only the columns of the names dataset that the notebook works with.
nameColumns = ['Name', 'Gender', 'Year', 'Count']
name = pd.read_csv('names.csv', usecols=nameColumns)

# Onderzoeksvraag 2: Als we wijnen categoriseren op basis van de chemische samenstelling, zijn er bepaalde categorieën die mannelijke proevers anders beoordelen dan vrouwelijke proevers?


We willen bij deze onderzoeksvraag mannelijke en vrouwelijke proevers vergelijken door te kijken of de ene groep een bepaalde voorkeur heeft voor een bepaalde categorie wijn.

In [None]:
# Optional filter, currently disabled: keep only the rows with Year >= 2014.
# name = name[name['Year'] >= 2014]
# Display the names table as it currently stands.
name

In [None]:
# Reduce the table to one row per first name: rank rows by Count (highest first),
# keep the top-ranked row of every name, then order the result alphabetically.
name = (
    name
    .sort_values(by='Count', ascending=False)
    .drop_duplicates(subset='Name', keep='first')
    .sort_values(by='Name')
)

Nu gaan we onze twee datasets mergen. We doen een left join op de twee kolommen waar de namen in staan, zodat ook wijnen waarvan de proever niet in de namenlijst voorkomt behouden blijven, en tonen de eerste 5 records.

In [None]:
# Attach the gender looked up by first name to every wine row (left join keeps
# all wines), then discard the lookup columns that are no longer needed.
merge = (
    wine
    .merge(name, how="left", left_on='taster_name', right_on='Name')
    .drop(columns=['Count', 'Name', 'Year'])
)
merge.head()

Nu printen we de tabel waarbij we de naam van de proever en het geslacht tonen en het aantal wijnen dat ze hebben geproefd.

In [None]:
# Number of tasted wines per (taster, gender) pair, shown with the busiest tasters first.
tastingsPerTaster = merge.groupby(['taster_name', 'Gender']).size()
nameCount = tastingsPerTaster.reset_index(name='count')
nameCount.sort_values(by='count', ascending=False)

In [None]:
# Share of tastings per taster gender.
count = merge['Gender'].value_counts()

# value_counts orders the labels by frequency, so a positional colour list would
# swap colours whenever the majority gender changes. Look the colour up by label
# instead; unexpected labels fall back to grey.
genderColors = {'M': 'lightblue', 'F': 'pink'}
pieColors = [genderColors.get(gender, 'lightgrey') for gender in count.index]
count.plot.pie(figsize=(5, 5), colors=pieColors)

## Clusteren


In [None]:
# Same chemical columns as in the loading cell, but with 'pH' moved to the front.
# The order matters: the cluster scatter plots draw columns 0 and 1 of the
# feature matrix that is built from this list.
chemColNames = ['pH','fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','sulphates','alcohol']

In [None]:
# Work on a copy: the chemical columns of wineZscore are overwritten with
# z-scores later on, while `merge` keeps the original measurements.
wineZscore = merge.copy()

In [None]:
# Standardise every chemical column to zero mean and unit (population) variance.
# The raw values are read from `merge`, the frame wineZscore was copied from, so
# the values and their statistics are guaranteed to belong to the same rows
# instead of depending on `wine` happening to share merge's index.
for col in chemColNames:
    rawValues = merge[col]
    wineZscore[col] = (rawValues - rawValues.mean()) / rawValues.std(ddof=0)


In [None]:
# Discard wines that lack any chemical measurement, then slice out the feature
# columns and report both shapes as a sanity check.
wineZscore = wineZscore.dropna(axis=0, how='any', subset=chemColNames)
chemZscore = wineZscore.loc[:, chemColNames]
(chemZscore.shape, wineZscore.shape)

In [None]:
# Independent copy of the standardised features for clustering; preview the first ten rows.
kMeansData = chemZscore.copy(deep=True)
kMeansData.head(n=10)

In [None]:
# Plain NumPy feature matrix for scikit-learn. It is derived from chemZscore
# rather than from kMeansData itself, so the cell can be re-run safely: calling
# `.values` on the already-converted ndarray would raise AttributeError.
kMeansData = chemZscore.to_numpy()

In [None]:
# Number of clusters: used as n_clusters for KMeans, n_components for
# GaussianMixture, and as the range of the per-cluster summary loops.
n_cluster = 20

In [None]:
# Fit k-means on the standardised chemistry matrix; the labels are reused by later cells.
kmeans = KMeans(n_clusters=n_cluster, random_state=10)
kmeans_cluster_labels = kmeans.fit_predict(kMeansData)

fig, kmeansClustorPlot = plt.subplots()
fig.set_size_inches(18, 7)

# One colour per cluster, spread evenly over the colormap.
colors = mpl.cm.nipy_spectral(kmeans_cluster_labels.astype(float) / n_cluster)

# Only two of the features fit in a 2-D scatter: columns 0 and 1 of the matrix,
# i.e. the first two names in chemColNames.
kmeansClustorPlot.scatter(kMeansData[:, 0], kMeansData[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors, edgecolor='k')

# Cluster centres projected onto the same two features.
centers = kmeans.cluster_centers_
kmeansClustorPlot.scatter(centers[:, 0], centers[:, 1], marker='o', c="white", alpha=1, s=200, edgecolor='k')

# Label the figure so it is clear which two features are being shown.
kmeansClustorPlot.set_xlabel(f'{chemColNames[0]} (z-score)')
kmeansClustorPlot.set_ylabel(f'{chemColNames[1]} (z-score)')
kmeansClustorPlot.set_title(f'K-means clusters (k={n_cluster}), first two chemical features')

# Cell output: the k-means score of the fitted model on the training data.
kmeans.score(kMeansData)

In [None]:
# Store each wine's k-means label. The labels line up with wineZscore row by row
# because the clustered matrix was built from wineZscore's chemical columns.
wineZscore['kmeans_cluster'] = kmeans_cluster_labels

In [None]:
# Fit a Gaussian mixture with the same number of components on the same matrix.
gmm = GaussianMixture(n_components=n_cluster, random_state=10, covariance_type='full')
gmm_cluster_labels = gmm.fit_predict(kMeansData)

fig, gmmClustorPlot = plt.subplots()
fig.set_size_inches(18, 7)

# One colour per component, spread evenly over the colormap.
colors = mpl.cm.nipy_spectral(gmm_cluster_labels.astype(float) / n_cluster)

# Only two of the features fit in a 2-D scatter: columns 0 and 1 of the matrix,
# i.e. the first two names in chemColNames.
gmmClustorPlot.scatter(kMeansData[:, 0], kMeansData[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors, edgecolor='k')

# GaussianMixture has no `cluster_centers_`; the component means in `means_`
# play that role and are drawn the same way as the k-means centres.
gmmCenters = gmm.means_
gmmClustorPlot.scatter(gmmCenters[:, 0], gmmCenters[:, 1], marker='o', c="white", alpha=1, s=200, edgecolor='k')

# Label the figure so it is clear which two features are being shown.
gmmClustorPlot.set_xlabel(f'{chemColNames[0]} (z-score)')
gmmClustorPlot.set_ylabel(f'{chemColNames[1]} (z-score)')
gmmClustorPlot.set_title(f'Gaussian mixture clusters (n={n_cluster}), first two chemical features')

# Cell output: average per-sample log-likelihood of the data under the fitted mixture.
gmm.score(kMeansData)

In [None]:
# Store each wine's mixture-component label, then preview the columns that the
# gender comparison uses.
wineZscore['gmm_cluster'] = gmm_cluster_labels
relevant = ['id', 'points', 'price', 'taster_name', 'Gender', 'kmeans_cluster', 'gmm_cluster']
wineZscore.loc[:, relevant].head()

In [None]:
# Per k-means cluster: number of tastings by male and female tasters, the mean
# score of the whole cluster, and the mean score per gender.
totalAmount = wineZscore.shape[0]
for i in range(n_cluster):
    inCluster = wineZscore[wineZscore['kmeans_cluster'] == i]
    malePoints = inCluster.loc[inCluster['Gender'] == "M", 'points']
    femalePoints = inCluster.loc[inCluster['Gender'] == "F", 'points']

    m = len(malePoints)
    f = len(femalePoints)

    # Use .mean() rather than sum / count. The cluster average then covers every
    # wine in the cluster (the left join can leave rows without a Gender, which
    # used to be added to the numerator but not to the denominator), missing
    # points are ignored consistently, and an empty group yields NaN instead of
    # a division by zero.
    clusterMean = inCluster['points'].mean()
    maleMean = malePoints.mean()
    femaleMean = femalePoints.mean()

    print(f'Kmeans cluster: {i} Count Males: {m}/ Females: {f}')
    print(f'Average cluster score: {clusterMean}')
    print(f'Per gender score: Males: {maleMean},Females: {femaleMean}\n')

In [None]:
# Per Gaussian-mixture cluster: number of tastings by male and female tasters,
# the mean score of the whole cluster, and the mean score per gender.
for i in range(n_cluster):
    inCluster = wineZscore[wineZscore['gmm_cluster'] == i]
    malePoints = inCluster.loc[inCluster['Gender'] == "M", 'points']
    femalePoints = inCluster.loc[inCluster['Gender'] == "F", 'points']

    m = len(malePoints)
    f = len(femalePoints)

    # Use .mean() rather than sum / count. The cluster average then covers every
    # wine in the cluster (the left join can leave rows without a Gender, which
    # used to be added to the numerator but not to the denominator), missing
    # points are ignored consistently, and an empty group yields NaN instead of
    # a division by zero.
    clusterMean = inCluster['points'].mean()
    maleMean = malePoints.mean()
    femaleMean = femalePoints.mean()

    # These are mixture-model clusters; the label used to say "Kmeans" (copy-paste slip).
    print(f'GMM cluster: {i} Count Males: {m}/ Females: {f}')
    print(f'Average cluster score: {clusterMean}')
    print(f'Per gender score: Males: {maleMean},Females: {femaleMean}\n')