# Wine Descriptions Analysis
## Data Cleaning
### Created by: Elliot Pack 
#### November 2019

In [158]:
# Import packages to analyze text
import nltk 
import re
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)

In [159]:
# Open data and limit columns to Variety, Description
import pandas as pd
import numpy as np

df = pd.read_csv("wine_data.csv")
df = df[['variety','description']]

In [160]:
# Remove the description author's name (last two words of description)
def remove_author(text):
    text = text.rsplit(' ', 2)[0]
    return text

df.description = df.description.apply(remove_author)

In [161]:
# Limit wine varieties
include_varieties = ['Cabernet Sauvignon','Chardonnay','Pinot Gris','Sauvignon Blanc','Merlot']
df = df[df.variety.isin(include_varieties)]

In [162]:
# Condense DataFrame by variety
df = df.groupby(['variety'])['description'].apply(' '.join).reset_index()

In [163]:
# Data cleaning to make text lowercase, remove punctuation, etc. 

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', ' ', text) 
    text = re.sub('[‘’“”…]', ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('–', ' ', text)
    text = re.sub('—', ' ', text) 
    return text

round1 = lambda x: clean_text_round1(x)
df.description = df.description.apply(round1)

In [164]:
# Pull in list of stop words

stop_words = stopwords.words('english')

# Manually add some words that will not be meaningful in differentiating wines 

remove_words = pd.read_csv("remove_words.csv")
remove_words = remove_words['remove_words'].values
for words in remove_words:
    stop_words.append(words)

# Second round of data cleaning to remove stop words and perform stemming
stemming = PorterStemmer()

def clean_text_round_2(row):
    description = row['description']
    tokens = nltk.word_tokenize(description)
    token_words = [w for w in tokens if w.isalpha()]
    stemmed_words = [stemming.stem(w) for w in token_words]
    meaningful_words = [w for w in token_words if not w in stop_words]
    #joined_words = (" ".join(meaningful_words))
    return meaningful_words

df['meaningful_words'] = df.apply(clean_text_round_2, axis=1)

In [165]:
# Save dataframe for later use

df.to_csv('cleaned_wine_data.csv')

In [166]:
# Create a column with the most commonly used words 
from nltk import FreqDist

df = pd.read_csv('cleaned_wine_data.csv')

def max_freq(row):
    text = row['meaningful_words']
    fdist = FreqDist(text)
    return [(word, freq) for word, freq in fdist.most_common(10)]

df['common_descriptors'] = df.apply(max_freq, axis=1)

In [167]:
df = df[['variety','common_descriptors']]

In [168]:

final_table=pd.DataFrame()
for i in range (0,len(df.index)):
    data = df_test.iloc[i,1]
    variety = df_test.iloc[i,0]
    table = pd.DataFrame(data, columns = ['Word', 'Frequency'])
    table['Variety'] = variety
    table = table[['Variety','Word','Frequency']]
    final_table=final_table.append(table,ignore_index=True)

In [169]:
final_table

Unnamed: 0,Variety,Word,Frequency
0,Bordeaux Blanc,herbal,9
1,Bordeaux Blanc,crispness,9
2,Bordeaux Blanc,tight,9
3,Bordeaux Blanc,acidity,9
4,Bordeaux Blanc,zesty,9
5,Bordeaux Blanc,texture,9
6,Bordeaux Blanc,fruitiness,9
7,Bordeaux Blanc,ready,9
8,Cabernet Sauvignon,black,81
9,Cabernet Sauvignon,blackberry,63


In [170]:
import plotly
import plotly.express as px
fig = px.line_polar(final_table, r="Frequency", theta="Word", color="Variety", line_close=True,
                    color_discrete_sequence=px.colors.sequential.Plasma[-2::-1],
                    template="seaborn")


fig.show()