## Import and Loading

In [64]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

import pandas as pd
import numpy as np

from sklearn import datasets

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import LabelEncoder, StandardScaler

import category_encoders
import scipy
import time

In [65]:
def loadData(directory):
    df = pd.read_csv(directory,sep="\t")
    return df

def preprocessingV4(X_d,X_e,labels):
    
    X_d = X_d.fillna("other")
    X_e = X_e.fillna("other")
    
    df = pd.concat([X_d,X_e])
    df_enc = pd.get_dummies(df)
    df_enc_scipy = scipy.sparse.csr_matrix(df_enc.values)
    
    return df_enc_scipy[:X_d.shape[0]],df_enc_scipy[X_d.shape[0]:]

def evaluateModels(models, targets,X,y):
    
    scores = pd.DataFrame()
    
    for model,target in zip(models,targets):
        cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=42)
        scores[target] = cross_val_score(model, X, y, scoring='r2', cv=cv, n_jobs=-1)
        
        print(f"{target} has been concluded")
        
    return scores

In [66]:
X_dev = loadData('Dataset/dev.tsv')
X_eval = loadData('Dataset/eval.tsv')

In [67]:
min_t = X_dev["quality"].quantile(0.025)
max_t = X_dev["quality"].quantile(0.975)

min_t, max_t

(24.0, 69.0)

## Preprocessing selection


In [68]:
%%time

X_dev = loadData('Dataset/dev.tsv')
X_eval = loadData('Dataset/eval.tsv')

X_dev = X_dev.drop(columns=["description","region_2"])
X_eval = X_eval.drop(columns=["description","region_2"])

X_dev_filtered = X_dev[X_dev["quality"] >= min_t]
X_dev_filtered = X_dev_filtered[X_dev_filtered["quality"] <= max_t]

Wall time: 666 ms


In [69]:
X_dev_filtered.shape, X_dev.shape

((115772, 7), (120744, 7))

In [70]:
X_dev_filtered = X_dev_filtered.fillna("other")
X_eval = X_eval.fillna("other")
df = pd.concat([X_dev_filtered,X_eval])

In [71]:
%%time

df_enc = pd.get_dummies(df)

MemoryError: Unable to allocate 4.07 GiB for an array with shape (145958, 29956) and data type uint8

In [51]:
df_enc.shape

(145958, 46954)

In [55]:
df_enc_scipy = scipy.sparse.csr_matrix(df_enc.values)

MemoryError: Unable to allocate 6.38 GiB for an array with shape (46953, 145958) and data type uint8

In [None]:
df_enc_scipy[:X_d.shape[0]],df_enc_scipy[X_d.shape[0]:]

## Hyperparameter tuning

In [23]:
def doGridSearch(model,model_name,hyperparams,X,y):
    gs = GridSearchCV(estimator=model,  
                         param_grid=hyperparams,
                         scoring='r2',
                         cv=3,
                         n_jobs=4,
                         verbose=True)

    gs.fit(X, y)
    return gs

In [24]:
hyperparams_RF = {
    "n_estimators": [300],
    #"criterion": ["mse", "mae"],
    "max_features": ["auto"],
    "random_state": [42],# always use the samet random seed
    "n_jobs": [4],# for parallelization
}

gs = doGridSearch(RandomForestRegressor(verbose=True), "RandomForestRegressor",hyperparams_RF,X_dev_prep,y)
print(f"Best params:\t{gs.best_params_}")
print(f"Best score:\t{gs.best_score_}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed: 26.7min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  9.6min


Best params:	{'max_features': 'auto', 'n_estimators': 300, 'n_jobs': 4, 'random_state': 42}
Best score:	0.7134175554757253


[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed: 15.0min finished


In [6]:
rf = RandomForestRegressor(n_jobs=-1)
rf.fit(X_dev_prep,y)

RandomForestRegressor(n_jobs=-1)

## Prediction

In [7]:
def get_csv(ids, y):
    pd.DataFrame(dict(Id = ids,Predicted = y)).to_csv("submission20.csv",sep=",",index=False)

y_pred = rf.predict(X_eval_prep)
get_csv(list(X_eval.index),y_pred)

## Submissions

### Submission 16 : 0.774 
* fit the StandardScaler on the development set
* y = winsorize(X_dev["quality"], limits=[0.1, 0.1])

### Submission 17 : 0.753
* sparse matrix with getDummies 
* with duplicates
* 80% of each categorical elements
* RandomForestRegressor : {'max_features': 'auto', 'n_estimators': 300, 'n_jobs': 4, 'random_state': 42}
* quality from 0 to 100
* without description, designation and region_2

### Submission 18 : 0.820
* sparse matrix with getDummies 
* with duplicates
* 50% of each categorical elements
* RandomForestRegressor (naive)
* quality from 0 to 100
* without description, and region_2

### Submission 19 : 0.832
* sparse matrix with getDummies 
* with duplicates
* 85% of each categorical elements
* RandomForestRegressor (naive)
* quality from 0 to 100
* without description, and region_2

### Submission 20 : 0.839
* sparse matrix with getDummies 
* with duplicates
* 100% of each categorical elements
* RandomForestRegressor (naive)
* quality from 0 to 100
* without description, and region_2

In [5]:
X_d = loadData('Dataset/dev.tsv')
X_e = loadData('Dataset/eval.tsv')

X_d = X_d.fillna("other")
X_e = X_e.fillna("other")

X_d = X_d.drop(columns=['description','region_2','quality'])
X_e = X_e.drop(columns=['description','region_2'])

df = pd.concat([X_d,X_e])

In [6]:
df_enc = pd.get_dummies(df)

In [7]:
df_enc.shape

(150930, 47806)

In [8]:
df_enc_scipy = scipy.sparse.csr_matrix(pd.get_dummies(df).values)

In [10]:
df_enc_scipy.shape

(150930, 47806)

In [11]:
dev = df_enc_scipy[:X_d.shape[0]]
eva = df_enc_scipy[X_d.shape[0]:]

In [12]:
dev.shape

(120744, 47806)

In [13]:
eva.shape

(30186, 47806)

In [27]:
df_enc_scipy

<150930x47806 sparse matrix of type '<class 'numpy.uint8'>'
	with 905580 stored elements in Compressed Sparse Row format>