In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import keila_wrangle as w

In [3]:
red = pd.read_csv('winequality-red.csv')

white = pd.read_csv('winequality-white.csv')

In [4]:
train, validate, test = w.clean_wine()

46 outliers removed from fixed acidity.
26 outliers removed from volatile acidity.
11 outliers removed from citric acid.
5 outliers removed from residual sugar.
63 outliers removed from chlorides.
15 outliers removed from free sulfur dioxide.
2 outliers removed from total sulfur dioxide.
0 outliers removed from density.
4 outliers removed from pH.
21 outliers removed from sulphates.
0 outliers removed from alcohol.
0 outliers removed from quality.
0 outliers removed from red_wine.

Total of 193 outliers removed.


In [7]:
target = "quality"
train, validate, test, x_train, y_train, x_validate, y_validate, x_test, y_test = w.split_data_xy(train, validate, test, target)

train -> (3782, 14)
validate -> (1261, 14)
test -> (1261, 14)


In [8]:
x_train_scaled, x_validate_scaled, x_test_scaled = w.mm_scale(x_train, x_validate, x_test)

In [9]:
x_train_scaled

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,red_wine
0,0.662791,0.144444,0.546512,0.028169,0.155172,0.20,0.393939,0.557034,0.159292,0.488372,0.250000,0.0
1,0.337209,0.244444,0.337209,0.098592,0.074713,0.36,0.340067,0.283904,0.522124,0.476744,0.550000,0.0
2,0.139535,0.255556,0.000000,0.267606,0.195402,0.42,0.538721,0.436629,0.513274,0.476744,0.316667,0.0
3,0.372093,0.055556,0.430233,0.570423,0.172414,0.35,0.333333,0.551331,0.292035,0.383721,0.450000,0.0
4,0.558140,0.155556,0.418605,0.056338,0.298851,0.52,0.239057,0.565906,0.663717,0.755814,0.500000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3777,0.290698,0.655556,0.558140,0.558685,0.229885,0.56,0.727273,0.683777,0.398230,0.348837,0.216667,0.0
3778,0.569767,0.077778,0.348837,0.042254,0.195402,0.28,0.417508,0.449303,0.442478,0.186047,0.300000,0.0
3779,0.232558,0.200000,0.348837,0.089202,0.126437,0.74,0.414141,0.195817,0.424779,0.186047,0.583333,0.0
3780,0.255814,0.266667,0.534884,0.037559,0.218391,0.55,0.616162,0.378327,0.460177,0.313953,0.266667,0.0


In [None]:
red.shape

In [None]:
train, vali, t = w.clean_wine()

In [None]:
train

In [None]:
1599 + 4898



In [None]:
6497 - 193


In [None]:
def read_wine():
    """
    Load the red and white wine-quality CSV files.

    Both files are read with relative paths, so they must be present in the
    current working directory.

    Returns:
        tuple: (red, white) DataFrames read from 'winequality-red.csv' and
        'winequality-white.csv', in that order.
    """
    red_wines, white_wines = (
        pd.read_csv(csv_name)
        for csv_name in ('winequality-red.csv', 'winequality-white.csv')
    )
    return red_wines, white_wines

In [None]:
# NOTE(review): read_wine() returns a (red, white) tuple, so `df` is a tuple here, not a DataFrame
df = read_wine()

In [None]:
# load both datasets
red, white = read_wine()

# encode the wine type twice: a numeric flag and a readable label
red['red_wine'], white['red_wine'] = 1, 0
red['wine_type'], white['wine_type'] = 'red', 'white'

# stack red on top of white; reset_index(drop=False) keeps each row's
# original per-file position in a new 'index' column
df = pd.concat([red, white]).reset_index(drop=False)

In [None]:
df

In [None]:
def remove_outliers(df, exclude_column=None, sd=4):
    """
    Remove outliers from a pandas DataFrame using the Z-score method.

    Columns are processed one after another, so rows dropped because of one
    column are already gone when the next column's mean and standard
    deviation are computed. A per-column count and a grand total are printed.

    Args:
        df (pandas.DataFrame): The DataFrame containing the data. The caller's
            frame is not modified.
        exclude_column (str or list-like of str, optional): Column name(s) to
            leave out of the check, e.g. a string label column. Defaults to
            None (check every column).
        sd (float, optional): Rows whose absolute z-score exceeds this many
            standard deviations in a checked column are dropped. Defaults to 4.

    Returns:
        pandas.DataFrame: The DataFrame with outliers removed.
    """
    # Accept a single name as well as a collection of names.
    if exclude_column is None:
        excluded = set()
    elif pd.api.types.is_list_like(exclude_column):
        excluded = set(exclude_column)
    else:
        excluded = {exclude_column}

    num_outliers_total = 0
    for column in df.columns:
        if column in excluded:
            continue
        series = df[column]
        # Population z-score (ddof=0, the same convention as scipy.stats.zscore).
        # Computed with pandas so the function does not depend on a scipy
        # import made in a later cell, and so NaN values are skipped instead of
        # turning every score in the column into NaN.
        z_scores = ((series - series.mean()) / series.std(ddof=0)).abs()
        # Positional boolean mask: safe even when the index has duplicate labels.
        is_outlier = (z_scores > sd).to_numpy()
        num_outliers = int(is_outlier.sum())
        num_outliers_total += num_outliers
        # Keep everything that is not a confirmed outlier. Rows with a missing
        # value (or a zero-variance column) have a NaN score and are kept.
        df = df[~is_outlier]
        print(f"{num_outliers} outliers removed from {column}.")
    print(f"\nTotal of {num_outliers_total} outliers removed.")
    return df

In [None]:
def clean_wine():
    """
    Load, combine and clean the red and white wine datasets.

    Steps:
        1. Read both CSV files with read_wine().
        2. Tag every row with its wine type: 'red_wine' (1 for red, 0 for
           white) and 'wine_type' ('red' / 'white').
        3. Stack the two frames and drop outlier rows with remove_outliers(),
           leaving the string column 'wine_type' out of the check.
        4. Re-number the rows 0..n-1. The stacked frame otherwise carries the
           per-file row numbers, so red and white rows share index labels.
        5. Lower-case the column names and replace spaces with underscores.

    Returns:
        pandas.DataFrame: The combined, cleaned wine data.
    """
    # get datasets
    red, white = read_wine()

    # create columns to separate wine types -- encode
    red['red_wine'] = 1
    white['red_wine'] = 0

    red['wine_type'] = 'red'
    white['wine_type'] = 'white'

    # combine red & white wine dataset
    df = pd.concat([red, white])

    # remove outliers -- removed outliers outside of 4 standard deviation
    df = remove_outliers(df, 'wine_type')

    # reset index -- done after the outlier drop so labels are unique and gap-free
    df = df.reset_index(drop=True)

    # fix names for columns
    df.columns = [col.lower().replace(' ', '_') for col in df.columns]

    return df



In [None]:
df = clean_wine()

In [None]:
df.describe()

In [None]:
df = clean_wine()

In [None]:
df

In [None]:
df.reset_index(drop=True, inplace=True)


In [None]:
# NOTE(review): split_data is only defined in a later cell; this call also passes no
# stratify column and does not assign the returned train/validate/test frames
split_data(df)

In [None]:
# rows & columns
red.shape

In [None]:
# rows & columns
white.shape

In [None]:
df = pd.concat([red, white])

In [None]:
df.info()

In [None]:
# missing values per column (all zeros means there are no nulls)
# df[df.isnull()] would blank out every real value, so its sum says nothing about nulls
df.isnull().sum()

In [None]:
df.head()

In [None]:
def outlier(df, feature, m=1.5):
    '''
    Compute IQR-based outlier bounds for one column of a dataframe.

    The interquartile range (IQR) is the distance between the 25th and 75th
    percentiles of df[feature]. The bounds sit `m` IQRs above the 75th
    percentile and `m` IQRs below the 25th percentile.

    Returns a tuple in the order (upper_bound, lower_bound).
    '''
    first_quartile, third_quartile = (df[feature].quantile(q) for q in (.25, .75))

    # distance from each quartile to its bound
    margin = m * (third_quartile - first_quartile)

    return third_quartile + margin, first_quartile - margin

In [None]:
# print the IQR bounds of every column except the last one
for col in df.columns[:-1]:
    upper_bound, lower_bound = outlier(df, col)
    print(col, upper_bound, lower_bound, sep='\n')

# there aren't any crazy outliers -- leave in dataset

In [None]:
def get_object_cols(df):
    '''
    Return the names of the columns whose dtype is object or category,
    as a list in the dataframe's column order.
    '''
    # keep only the object / category columns, then read off their names
    non_numeric = df.select_dtypes(include=['object', 'category'])
    return non_numeric.columns.tolist()



def get_numeric_cols(df):
    '''
    Return the names of the columns whose dtype is neither object nor
    category, as a list in the dataframe's column order.

    Selection is by exclusion, so any other dtype present in the dataframe
    is returned as well.
    '''
    # drop the object / category columns, then read off the remaining names
    remaining = df.select_dtypes(exclude=['object', 'category'])
    return remaining.columns.tolist()


In [None]:
def summarize(df):
    '''
    Print a text summary of a dataframe, then plot a histogram of every column.

    Printed to the console, in order:
    - the first 3 rows (.head(3))
    - .info()
    - .describe(), transposed
    - .value_counts() for each column returned by get_object_cols(df)

    After the text report, one seaborn histogram per column is drawn and shown.

    Args:
        df (pandas.DataFrame): the dataframe to summarize.

    Returns:
        None
    '''
    print(f"""SUMMARY REPORT
=====================================================
          
          
Dataframe head: 
{df.head(3)}
          
=====================================================
          
          
Dataframe info: """)
    # info() is called on its own statement rather than interpolated into the report text
    df.info()

    print(f"""=====================================================
          
          
Dataframe Description: 
{df.describe().T}
          
=====================================================

    
    
DataFrame value counts: 
 """)         
    # value counts only for the columns get_object_cols() reports
    for col in (get_object_cols(df)): 
        print(f"""******** {col.upper()} - Value Counts:
{df[col].value_counts()}
    _______________________________________""")                   
        
    # one separate figure per column of the dataframe
    for col in df.columns:
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.histplot(df[col], ax=ax)
        ax.set_title(f'Histogram of {col}')
        plt.show()

In [None]:
w.summarize(df)

* sulphates, density, total sulfur dioxide, free sulfur dioxide, chlorides, residual sugar, citric acid, and volatile acidity have outliers

In [None]:
from scipy import stats
# removed outliers outside 4 standard deviation
def remove_outliers(df, exclude_column=None, sd=4):
    """
    Remove outliers from a pandas DataFrame using the Z-score method.

    Columns are processed one after another, so rows dropped because of one
    column are already gone when the next column's mean and standard
    deviation are computed. A per-column count and a grand total are printed.

    Args:
        df (pandas.DataFrame): The DataFrame containing the data. The caller's
            frame is not modified.
        exclude_column (str or list-like of str, optional): Column name(s) to
            leave out of the check, e.g. a string label column. Defaults to
            None (check every column).
        sd (float, optional): Rows whose absolute z-score exceeds this many
            standard deviations in a checked column are dropped. Defaults to 4.

    Returns:
        pandas.DataFrame: The DataFrame with outliers removed.
    """
    # Accept a single name as well as a collection of names.
    if exclude_column is None:
        excluded = set()
    elif pd.api.types.is_list_like(exclude_column):
        excluded = set(exclude_column)
    else:
        excluded = {exclude_column}

    num_outliers_total = 0
    for column in df.columns:
        if column in excluded:
            continue
        series = df[column]
        # Population z-score (ddof=0, the same convention as scipy.stats.zscore).
        # Computed with pandas so NaN values are skipped instead of turning
        # every score in the column into NaN.
        z_scores = ((series - series.mean()) / series.std(ddof=0)).abs()
        # Positional boolean mask: safe even when the index has duplicate labels.
        is_outlier = (z_scores > sd).to_numpy()
        num_outliers = int(is_outlier.sum())
        num_outliers_total += num_outliers
        # Keep everything that is not a confirmed outlier. Rows with a missing
        # value (or a zero-variance column) have a NaN score and are kept.
        df = df[~is_outlier]
        print(f"{num_outliers} outliers removed from {column}.")
    print(f"\nTotal of {num_outliers_total} outliers removed.")
    return df

In [None]:
# handle outliers
df = w.remove_outliers(df, 'wine_type')

In [None]:
def analyze_missing_values(df):
    """
    Summarize how many values are missing in each column of a dataframe.

    Args:
        df (pandas.DataFrame): The dataframe to inspect.

    Returns:
        pandas.DataFrame: One row per column of `df`, indexed by column name
            (index name 'Attribute'), with two columns:
            'Missing Count' - number of null entries in that column, and
            'Missing Percentage' - that count as a percentage of len(df).
    """
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df)) * 100

    report = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing Percentage': null_share,
    })
    report.index.name = 'Attribute'
    return report


In [None]:
summarize(df)

In [None]:
analyze_missing_values(df)

In [None]:
def handle_missing_values(df, prop_required_column = .5, prop_required_row = .75):
    """
    Drop sparse columns, then sparse rows, from a dataframe IN PLACE.

    Columns are filtered first; the row threshold is then computed from the
    number of columns that survived.

    Args:
        df (pandas.DataFrame): The dataframe to prune. It is modified in place.
        prop_required_column (float, optional): Minimum share of non-missing
            values a column needs in order to be kept. Defaults to 0.5.
        prop_required_row (float, optional): Minimum share of non-missing
            values a row needs in order to be kept. Defaults to 0.75.

    Returns:
        pandas.DataFrame: The same `df` object, after the drops.

    Example:
        modified_df = handle_missing_values(df, prop_required_column=0.6, prop_required_row=0.8)
    """
    # columns: need at least this many non-null cells to survive
    row_count = len(df.index)
    min_filled_per_column = int(round(prop_required_column * row_count, 0))
    df.dropna(axis='columns', thresh=min_filled_per_column, inplace=True)

    # rows: threshold is based on the columns that are left
    column_count = len(df.columns)
    min_filled_per_row = int(round(prop_required_row * column_count, 0))
    df.dropna(axis='index', thresh=min_filled_per_row, inplace=True)

    return df

In [None]:
handle_missing_values(df, prop_required_column = .5, prop_required_row = .75)

In [None]:
# normalize column names: lower-case, spaces replaced by underscores
new_col_name = [col.lower().replace(' ', '_') for col in df.columns]
df.columns = new_col_name

In [None]:
df = df.reset_index().drop(columns=['index'])

In [None]:
df

In [None]:
df.quality.value_counts()

In [None]:

def nulls_by_row(df, index_id = 'id'):
    """
    Report how many columns are missing in each row of a dataframe.

    Args:
        df (pandas.DataFrame): The dataframe to inspect.
        index_id (str, optional): Name of the column that identifies a row in
            the result. It is looked up after the index has been reset, so it
            may be a regular column of `df` or the column produced by
            reset_index(). Defaults to 'id'.

    Returns:
        pandas.DataFrame: Columns [index_id, 'num_cols_missing',
            'percent_cols_missing'], sorted by 'num_cols_missing' in
            descending order.
    """
    missing_per_row = df.isnull().sum(axis=1)
    row_stats = pd.DataFrame({
        'num_cols_missing': missing_per_row,
        'percent_cols_missing': (missing_per_row / df.shape[1]) * 100,
    })

    # attach the per-row stats by index, then turn the index into a column
    combined = df.merge(row_stats, left_index=True, right_index=True).reset_index()

    report_columns = [index_id, 'num_cols_missing', 'percent_cols_missing']
    return combined[report_columns].sort_values(by='num_cols_missing', ascending=False)

In [None]:
row_missing = nulls_by_row(df, 'index')

In [None]:
row_missing

In [None]:
def split_data(df, stratify_name=None):
    '''
    Split a dataframe into train, validate and test subsets.

    The first split holds out 20% of the rows as test; the second split holds
    out 25% of the remainder as validate, which gives roughly 60% / 20% / 20%.
    Both splits use random_state=123.

    Args:
        df (pandas.DataFrame): the dataframe to split.
        stratify_name (str, optional): name of the column to stratify on.
            When None (the default) the splits are not stratified.

    Returns:
        tuple: (train, validate, test) dataframes, in that order.
    '''
    def _strata(frame):
        # train_test_split expects the label values (or None), not a column name
        return frame[stratify_name] if stratify_name is not None else None

    train, test = train_test_split(df, #first split
                                   test_size=.2,
                                   random_state=123,
                                   stratify=_strata(df))
    train, validate = train_test_split(train, #second split
                                       test_size=.25,
                                       random_state=123,
                                       stratify=_strata(train))
    return train, validate, test

In [None]:
column_list = df.columns.tolist()
print(column_list)


In [None]:
# NOTE(review): no call parentheses -- this only displays the bound method, it does not reset the index
df.reset_index

In [None]:
train, validate, test = split_data(df, stratify_name='quality')

In [None]:
df.reset_index(drop=True, inplace=True)


In [None]:
type(df)

In [None]:
train.shape

In [None]:
validate.shape

In [None]:
test.shape

In [None]:
def clean_wine():
    """
    Load, combine, clean and split the red and white wine datasets.

    Steps:
        1. Read both CSV files with read_wine().
        2. Tag every row with its wine type: 'red_wine' (1 for red, 0 for
           white) and 'wine_type' ('red' / 'white').
        3. Stack the two frames and drop outlier rows with remove_outliers(),
           leaving the string column 'wine_type' out of the check.
        4. Re-number the rows 0..n-1. The stacked frame otherwise carries the
           per-file row numbers, so red and white rows share index labels.
        5. Lower-case the column names and replace spaces with underscores.
        6. Split with split_data(), stratified on 'quality'.

    Returns:
        tuple: (train, validate, test) dataframes, in that order.
    """
    # get datasets
    red, white = read_wine()

    # create columns to separate wine types -- encode
    red['red_wine'] = 1
    white['red_wine'] = 0

    red['wine_type'] = 'red'
    white['wine_type'] = 'white'

    # combine red & white wine dataset
    df = pd.concat([red, white])

    # remove outliers -- removed outliers outside of 4 standard deviation
    df = remove_outliers(df, 'wine_type')

    # give every remaining row a unique label before splitting
    df = df.reset_index(drop=True)

    # fix names for columns
    df.columns = [col.lower().replace(' ', '_') for col in df.columns]

    # split data
    train, validate, test = split_data(df, "quality")

    return train, validate, test

In [None]:
train, validate, test = clean_wine()

In [None]:
train

In [None]:
def split_data_xy(train, validate, test, target):
    '''
    Separate features from the target in already-split train / validate / test
    dataframes.

    For each of the three dataframes, x_* is the dataframe without the
    `target` column and y_* is the `target` column itself. The shape of each
    input dataframe is printed.

    Returns train, validate, test, x_train, y_train, x_validate, y_validate,
    x_test, y_test (the first three are the inputs, passed through unchanged).
    '''
    def _features_and_target(subset):
        # (everything except the target, the target column)
        return subset.drop(columns=[target]), subset[target]

    x_train, y_train = _features_and_target(train)
    x_validate, y_validate = _features_and_target(validate)
    x_test, y_test = _features_and_target(test)

    # report the size of each input subset
    print(f'train -> {train.shape}')
    print(f'validate -> {validate.shape}')
    print(f'test -> {test.shape}')

    return train, validate, test, x_train, y_train, x_validate, y_validate, x_test, y_test

In [None]:
target = "quality"
train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test = split_data_xy(train, validate, test, target)

In [None]:
target = "quality"
train, validate, test, x_train, y_train, x_validate, y_validate, x_test, y_test = split_data_xy(train, validate, test, target)

In [None]:
from sklearn.preprocessing import MinMaxScaler


In [None]:
def rename_col(df, list_of_columns=[]):
    '''
    Return a copy of `df` whose columns are renamed positionally.

    The i-th existing column name is mapped to the i-th entry of
    `list_of_columns`. Columns beyond the end of the list keep their name.
    '''
    old_to_new = {old: new for old, new in zip(df.columns, list_of_columns)}
    return df.rename(columns=old_to_new)

from sklearn.preprocessing import MinMaxScaler
def mm_scale(x_train, x_validate, x_test, keep_col=None):
    """
    Apply MinMax scaling to the input data.

    The scaler is fitted on the training features only and then applied to
    all three sets. Each returned frame keeps the column names and the row
    index of its input, so it stays aligned with the matching y_* series.

    Args:
        x_train (pd.DataFrame): Training data features.
        x_validate (pd.DataFrame): Validation data features.
        x_test (pd.DataFrame): Test data features.
        keep_col (list of str, optional): Columns to scale and return.
            Defaults to the wine feature columns listed below, which leaves
            out the string column wine_type.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: Scaled versions of the input data
            (x_train_scaled, x_validate_scaled, x_test_scaled).
    """
    if keep_col is None:
        # remove string column wine_type
        keep_col = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
                    'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
                    'ph', 'sulphates', 'alcohol', 'red_wine']
    keep_col = list(keep_col)

    x_train, x_validate, x_test = x_train[keep_col], x_validate[keep_col], x_test[keep_col]

    scaler = MinMaxScaler()
    scaler.fit(x_train)

    def _scaled_frame(features):
        # transform() returns a bare array; restore the labels it drops
        return pd.DataFrame(scaler.transform(features),
                            columns=keep_col,
                            index=features.index)

    return _scaled_frame(x_train), _scaled_frame(x_validate), _scaled_frame(x_test)


In [None]:
x_train_scaled, x_validate_scaled, x_test_scaled = mm_scale(x_train, x_validate, x_test)

In [None]:
x_train_scaled