In [1]:
import warnings
# Suppress every warning for the rest of the notebook so cell output stays clean.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings from the libraries imported below -- consider narrowing it to specific
# categories once the analysis is stable.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import keila_wrangle as w

# Project Description
We will be analyzing factors that affect wine quality. Our objective is to build a predictive model that can estimate the quality of a given wine. The focus is on gaining insight into the factors that influence wine quality rather than on generating practical predictions.

# Project Goal

* We will be analyzing and exploring the different attributes affecting red and white wine quality.
* We will be creating a model, based on those attributes, to predict the quality of the wine.
* This model is not intended to be used on future wines or for real-life prediction.

# Initial Thoughts

The initial hypothesis for this project is that certain factors such as 'area', 'bathrooms', 'year', 'pool', 'bedroom', and 'fips' may be significant drivers of property value.


# Acquire
* Acquire the dataset from the Codeup database
* It contained 52,442 rows and 7 columns before cleaning
* Each row represents a property at Zillow
* Each column represents a feature of those properties

# Prepare
- Prepare Actions:
    * Filtered columns that did not contain useful information
    * Renamed columns to promote readability
    * Checked for nulls in the data. The following columns contained nulls:
        - 'pool' (41,346 nulls), nulls changed to 0
        - 'year' (116 nulls), these rows were dropped  
        - 'area' (82 nulls), these rows were dropped  
        - 'property_value' (1 null), these rows were dropped  
    * Checked that column data types were appropriate
    * Added a full_bath column, which splits homes into those with only full bathrooms and those with half bathrooms (including one home that has a three-quarter bath).
    * Encoded categorical variables
    * Split data into train, validate and test (approx. 60/20/20)
    * 2,641 outliers were removed for this iteration of the project because they fell outside 3 standard deviations

In [2]:
# Load the wine dataset through the project-local wrangle helper and preview
# the first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved CSV index carried along as a column -- confirm whether read_wine
# should drop it before modeling.
df = w.read_wine()
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red_wine,wine_type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1,red
