# Wine quality prediction

## Import and methods

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.decomposition import PCA

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import scipy

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

In [4]:
def loadData(directory):
    df = pd.read_csv(directory,sep="\t")
    return df

def get_final_csv(ids, y, filename):
    pd.DataFrame(dict(Id = ids,Predicted = y)).to_csv(filename,sep=",",index=False)

def evaluateModels(models,targets,X,y):
    
    scores = pd.DataFrame()
    
    for model,target in zip(models,targets):
        cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=42)
        scores[target] = cross_val_score(model, X, y, scoring='r2', cv=cv, n_jobs=-1)
        
        print(f"{target} ended up")
        
    return scores

"""
    1) OneHot encoding
"""
def preprocessingV1(X_d,X_e):
    
    X_d = X_d.drop(columns=["region_2","description"])
    X_e = X_e.drop(columns=["region_2","description"])

    imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    X_d["country"] = imputer.fit_transform(np.array(X_d["country"]).reshape(-1,1))
    X_d["province"] = imputer.fit_transform(np.array(X_d["province"]).reshape(-1,1))
    
    X_d = X_d.fillna("other")
    X_e = X_e.fillna("other")
    
    y = X_d.quality
    X_d = X_d.drop(columns=["quality"])
    
    df = pd.concat([X_d,X_e])
    
    df_enc = pd.get_dummies(df)
    df_enc_scipy = scipy.sparse.csr_matrix(df_enc.values)
    
    return df_enc_scipy[:X_d.shape[0]], y, df_enc_scipy[X_d.shape[0]:]

"""
    2) OneHot encoding without outliers
"""
def preprocessingV3(X_d,X_e):
    
    X_d = X_d.drop(columns=["region_2","description"])
    X_e = X_e.drop(columns=["region_2","description"])
    
    X_d = X_d.fillna("other")
    X_e = X_e.fillna("other")
    
    # 1.5(IQR) Rule for detecting the outliers
    t = X_d["quality"].quantile(0.75) - X_d["quality"].quantile(0.25)
    min_t = X_d["quality"].quantile(0.25) - 1.5 * t
    max_t = X_d["quality"].quantile(0.75) + 1.5 * t
    
    X_d_filtered = X_d[X_d["quality"] >= min_t]
    X_d_filtered = X_d_filtered[X_d_filtered["quality"] <= max_t]
    
    y = X_d_filtered.quality
    
    df = pd.concat([X_d_filtered,X_e])
    
    df_enc = pd.get_dummies(df)
    df_enc_scipy = scipy.sparse.csr_matrix(df_enc.values)
    
    return df_enc_scipy[:X_d_filtered.shape[0]], y, df_enc_scipy[X_d_filtered.shape[0]:]

"""
    3) OneHot encoding without outliers
"""
def preprocessingV3(X_d,X_e):
    
    X_d = X_d.drop(columns=["region_2","description"])
    X_e = X_e.drop(columns=["region_2","description"])
    
    X_d = X_d.fillna("other")
    X_e = X_e.fillna("other")
    
    # 1.5(IQR) Rule for detecting the outliers
    t = X_d["quality"].quantile(0.75) - X_d["quality"].quantile(0.25)
    min_t = X_d["quality"].quantile(0.25) - 1.5 * t
    max_t = X_d["quality"].quantile(0.75) + 1.5 * t
    
    X_d_filtered = X_d[X_d["quality"] >= min_t]
    X_d_filtered = X_d_filtered[X_d_filtered["quality"] <= max_t]
    
    y = X_d_filtered.quality
    
    df = pd.concat([X_d_filtered,X_e])
    
    df_enc = pd.get_dummies(df)
    df_enc_scipy = scipy.sparse.csr_matrix(df_enc.values)
    
    return df_enc_scipy[:X_d_filtered.shape[0]], y, df_enc_scipy[X_d_filtered.shape[0]:]

"""
    4) OneHot encoding without duplicates
"""
def preprocessingV4(X_d,X_e):
    
    X_d = X_d.drop(columns=["region_2","description"])
    X_e = X_e.drop(columns=["region_2","description"])
    
    X_d = X_d.fillna("other")
    X_d = X_d.drop_duplicates()
    X_e = X_e.fillna("other")
    
    y = X_d.quality
    X_d = X_d.drop(columns=["quality"])
    
    df = pd.concat([X_d,X_e])
    
    df_enc = pd.get_dummies(df)
    df_enc_scipy = scipy.sparse.csr_matrix(df_enc.values)
    
    return df_enc_scipy[:X_d.shape[0]], y, df_enc_scipy[X_d.shape[0]:]

"""
    5) OneHot encoding of the top 80% elements per feature
"""
def preprocessingV5(X_d,X_e,labels):
    
    X_d = X_d.drop(columns=["region_2","description"])
    X_e = X_e.drop(columns=["region_2","description"])
    
    X_d = X_d.fillna("other")
    X_e = X_e.fillna("other")
    
    y = X_d.quality
    X_d = X_d.drop(columns=["quality"])
    
    df = pd.concat([X_d,X_e])
    
    for label in labels:
        
        top_labels_index = df[label].value_counts().index
        top_labels_length = len(df[label].value_counts().index)
        thresh = round(top_labels_length * 80 / 100)
        thresh = top_labels_length
        
        top = df[label].isin(top_labels_index[:thresh])
        df.loc[~top, label] = "other"
    
    df_enc = scipy.sparse.csr_matrix(pd.get_dummies(df).values)
    
    return df_enc[:X_d.shape[0]], y, df_enc[X_d.shape[0]:]

## extra
models = [LinearRegression(),RandomForestRegressor(),SGDRegressor()]
targets = ["Linear Regression","Random Forest","SGD Regressor"]

## Preprocessing testing - 1

In [None]:
########################################
## OneHot Encoding
########################################

%%time

X_dev_v1 = loadData('Dataset/dev.tsv')
X_eval_v1 = loadData('Dataset/eval.tsv')

X_dev_v1_prep, y1, X_eval_v1_prep = preprocessingV1(X_dev_v1,X_eval_v1)

scores_1 = evaluateModels(models,targets,X_dev_v1_prep,y1)
scores_1.to_csv("scores-v1.csv",sep=",",index=False)

X_dev_v1_prep.shape, y1.shape, X_eval_v1_prep.shape

In [None]:
rf_1 = RandomForestRegressor()
rf_1.fit(X_dev_v1_prep,y1)

y_pred_1 = rf_1.predict(X_eval_v1_prep)
get_csv(list(X_eval_v1.index),y_pred_1,"submission_prep_v1.csv")

## Preprocessing testing - 2

In [None]:
########################################
## OneHot Encoding + removing outliers
########################################

%%time

X_dev_v2 = loadData('Dataset/dev.tsv')
X_eval_v2 = loadData('Dataset/eval.tsv')

X_dev_v2_prep, y2, X_eval_v2_prep = preprocessingV2(X_dev_v2,X_eval_v2)

scores_2= evaluateModels(models,targets,X_dev_v2_prep,y2)
scores_2.to_csv("scores-v2.csv",sep=",",index=False)

X_dev_v2_prep.shape, y2.shape, X_eval_v2_prep.shape

In [None]:
rf_2 = RandomForestRegressor()
rf_2.fit(X_dev_v2_prep,y2)

y_pred_2 = rf_2.predict(X_eval_v2_prep)
get_csv(list(X_eval_v2.index),y_pred_2,"submission_prep_v2.csv")

## Preprocessing testing - 3

In [None]:
########################################
## OneHot Encoding + removing duplicates
########################################

%%time

X_dev_v3 = loadData('Dataset/dev.tsv')
X_eval_v3 = loadData('Dataset/eval.tsv')

X_dev_v3_prep, y3, X_eval_v3_prep = preprocessingV3(X_dev_v3,X_eval_v3)

scores_3 = evaluateModels(models,targets,X_dev_v3_prep,y3)
scores_3.to_csv("scores-v3.csv",sep=",",index=False)

X_dev_v3_prep.shape, y3.shape, X_eval_v3_prep.shape

In [None]:
rf_3 = RandomForestRegressor()
rf_3.fit(X_dev_v3_prep,y3)

y_pred_3 = rf_3.predict(X_eval_v3_prep)
get_csv(list(X_eval_v3.index),y_pred_3,"submission_prep_v3.csv")

## Preprocessing testing - 4

In [None]:
########################################
## OneHot Encoding + 
########################################

X_dev_v4 = loadData('Dataset/dev.tsv')
X_eval_v4 = loadData('Dataset/eval.tsv')

X_dev_v4_prep, y4, X_eval_v4_prep = preprocessingV4(X_dev_v4,X_eval_v4)

scores_4 = evaluateModels(models,targets,X_dev_v4_prep,y4)
scores_4.to_csv("scores-v4.csv",sep=",",index=False)

X_dev_v4_prep.shape, y4.shape, X_eval_v4_prep.shape

In [None]:
rf_4 = RandomForestRegressor()
rf_4.fit(X_dev_v4_prep,y4)

y_pred_4 = rf_4.predict(X_eval_v4_prep)
get_csv(list(X_eval_v4.index),y_pred_4,"submission_prep_v4.csv")

## Preprocessing testing - 5

In [None]:
########################################
## OneHot Encoding of top 80% elements
########################################

X_dev_v5 = loadData('Dataset/dev.tsv')
X_eval_v5 = loadData('Dataset/eval.tsv')

X_dev_v5_prep, y5, X_eval_v5_prep = preprocessingV5(X_dev_v5,X_eval_v5)

scores_5 = evaluateModels(models,targets,X_dev_v5_prep,y5)
scores_5.to_csv("scores-v5.csv",sep=",",index=False)

X_dev_v5_prep.shape, y5.shape, X_eval_v5_prep.shape

In [None]:
rf_5 = RandomForestRegressor()
rf_5.fit(X_dev_v5_prep,y5)

y_pred_5 = rf_5.predict(X_eval_v5_prep)
get_csv(list(X_eval_v5.index),y_pred_5,"submission_prep_v5.csv")

## Graphs

β

In [16]:
result_test_1 = pd.read_csv('Report tools/preprocessing-scores/scores-v1_Naive.csv')
result_test_1["version"] = "α"
result_test_1["model"]

In [14]:
result_test_3 = pd.read_csv('Report tools/preprocessing-scores/scores-v3_NoDupl.csv')
result_test_3["version"] = "𝛾"

In [15]:
result_test_3

Unnamed: 0,Linear Regression,Random Forest,SGD Regressor,version
0,0.602699,0.637474,0.62516,𝛾
1,0.607276,0.640737,0.634599,𝛾
2,0.593194,0.628418,0.614859,𝛾


In [17]:
df = pd.concat([result_test_1,result_test_3])

In [24]:
sns.barplot(x="Linear Regression", y=count, hue="version", data=df)

NameError: name 'count' is not defined