## Import

In [None]:
from scipy.sparse import hstack
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
import scipy

import re
import pandas as pd
from nltk.corpus import stopwords
import spacy

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import text

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")


import nltk
nltk.download('stopwords')

## Methods

In [None]:
def loadData(directory):
    """Read a tab-separated file into a DataFrame.

    Parameters
    ----------
    directory : str or file-like
        Forwarded unchanged to ``pandas.read_csv``. Despite the name, this
        is the file to read (callers in this notebook pass e.g.
        ``'dev.tsv'``).

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(directory, sep="\t")

def get_final_csv(ids, y, filename):
    """Write predictions to a comma-separated file with a header row.

    Parameters
    ----------
    ids : sequence
        Values for the ``Id`` column.
    y : sequence
        Values for the ``Predicted`` column, paired with ``ids`` by position.
    filename : str or file-like
        Destination forwarded to ``DataFrame.to_csv``.
    """
    submission = pd.DataFrame({"Id": ids, "Predicted": y})
    # index=False: only the two named columns are written.
    submission.to_csv(filename, sep=",", index=False)
    
def evaluateModels(models, targets, X, y):
    """Cross-validate several models and collect their per-fold R2 scores.

    Parameters
    ----------
    models : iterable
        Estimators, each passed to ``cross_val_score``.
    targets : iterable of str
        Column label for each model, paired with ``models`` by position.
    X, y
        Feature matrix and target forwarded to ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        One column per label in ``targets``; each column holds the scores
        of the 3 cross-validation folds.
    """
    # The notebook's import cell does not import cross_val_score, so the
    # original call raised NameError. Importing here keeps the function
    # self-contained.
    from sklearn.model_selection import cross_val_score

    return pd.DataFrame({
        target: cross_val_score(model, X, y, scoring='r2', cv=3, n_jobs=-1)
        for model, target in zip(models, targets)
    })

##################################  
## Encoding and Missing values
##################################
def preprocessing(X_d, X_e):
    """One-hot encode the development and evaluation frames together.

    Steps, in order:
      1. drop the ``region_2`` and ``description`` columns from both frames;
      2. in the development frame, replace missing ``country`` and
         ``province`` values with the column's most frequent value;
      3. replace every remaining missing value in both frames with the
         string ``"other"``;
      4. split the ``quality`` column off the development frame;
      5. one-hot encode the row-wise concatenation of both frames, so both
         parts share the same dummy columns, and convert it to CSR.

    NOTE(review): the evaluation frame is not imputed with the most
    frequent ``country``/``province``; its missing values become
    ``"other"``. Confirm that this asymmetry is intended.

    Parameters
    ----------
    X_d : pandas.DataFrame
        Development data; must contain ``quality``.
    X_e : pandas.DataFrame
        Evaluation data.

    Returns
    -------
    tuple
        ``(dev_matrix, y, eval_matrix)``: the two CSR row slices of the
        encoded matrix and the ``quality`` Series.
    """
    unused_columns = ["region_2", "description"]
    X_d = X_d.drop(columns=unused_columns)
    X_e = X_e.drop(columns=unused_columns)

    imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    # The imputer is re-fitted for each column on a single-column 2-D array.
    for column in ("country", "province"):
        X_d[column] = imputer.fit_transform(X_d[column].to_numpy().reshape(-1, 1))

    X_d, X_e = X_d.fillna("other"), X_e.fillna("other")

    y = X_d["quality"]
    X_d = X_d.drop(columns=["quality"])

    # Rows [0, n_dev) of the encoded matrix belong to the development set.
    n_dev = len(X_d)
    encoded = pd.get_dummies(pd.concat([X_d, X_e]))
    encoded_sparse = scipy.sparse.csr_matrix(encoded.values)

    return encoded_sparse[:n_dev], y, encoded_sparse[n_dev:]

In [None]:
##################################  
## Document preprocessing
##################################

# English Snowball stemmer used by cleanDescriptions.
stemmer = SnowballStemmer('english')

# Punctuation characters, one list element per character.
punc = list('.,"\'?!:;()[]{}%')

# Extra tokens added to the stop-word set.
# NOTE(review): these look like stemmed forms of entries in scikit-learn's
# English stop-word list - confirm where the list came from.
other_sw = [
    "anywh", "el", "elsewh", "everywh", "ind", "otherwi", "plea", "somewh",
    "abov", "afterward", "alon", "alreadi", "alway", "ani", "anoth", "anyon",
    "anyth", "anywher", "becam", "becaus", "becom", "befor", "besid", "cri",
    "describ", "dure", "els", "elsewher", "empti", "everi", "everyon",
    "everyth", "everywher", "fifti", "forti", "henc", "hereaft", "herebi",
    "howev", "hundr", "inde", "mani", "meanwhil", "moreov", "nobodi", "noon",
    "noth", "nowher", "onc", "onli", "otherwis", "ourselv", "perhap", "pleas",
    "sever", "sinc", "sincer", "sixti", "someon", "someth", "sometim",
    "somewher", "themselv", "thenc", "thereaft", "therebi", "therefor",
    "togeth", "twelv", "twenti", "veri", "whatev", "whenc", "whenev",
    "wherea", "whereaft", "wherebi", "wherev", "whi", "yourselv",
]

# Union of scikit-learn's English stop words, the punctuation and the extras.
stop_words = text.ENGLISH_STOP_WORDS.union(punc, other_sw)


def cleanDescriptions(text):
    """Normalise one description into a space-separated string of stems.

    Processing order:
      1. every run of characters other than ASCII letters and ``àèéìòù``
         is replaced by a single space;
      2. the result is lower-cased;
      3. tokens found in the module-level ``stop_words`` are dropped;
      4. the remaining tokens are stemmed with the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw description. The name shadows the imported
        ``sklearn.feature_extraction.text`` module inside this function
        only; it is kept so the signature does not change.

    Returns
    -------
    str
        The stems joined by single spaces; ``''`` when nothing is left or
        when ``text`` is not a string (e.g. a missing value).
    """
    # A missing description would otherwise make re.sub raise TypeError.
    if not isinstance(text, str):
        return ''

    letters_only = re.sub('[^A-Za-zàèéìòù]+', ' ', text)
    # After the substitution only ASCII letters, the listed lowercase
    # accented vowels and spaces remain, so str.lower() changes exactly the
    # A-Z characters.
    lowered = letters_only.lower()

    stems = [stemmer.stem(word)
             for word in lowered.split()
             if word not in stop_words]

    # The original joined and returned an undefined name (NameError).
    return ' '.join(stems)

def preprocessText(description_train, description_test,
                   min_df=1, max_df=0.4, ngram_range=(1, 4)):
    """Clean the descriptions and turn them into TF-IDF matrices.

    The count + TF-IDF pipeline is fitted on the training descriptions only
    and then applied to the test descriptions, so both outputs share the
    same vocabulary (columns).

    Parameters
    ----------
    description_train : pandas.DataFrame
        Frame with a ``description`` column; used to fit the pipeline.
    description_test : pandas.DataFrame
        Frame with a ``description`` column; only transformed.
    min_df, max_df, ngram_range
        Forwarded to ``CountVectorizer``. The defaults are the single
        configuration the original code hard-coded.

    Returns
    -------
    tuple
        ``(dev_vec, eval_vec)``: the pipeline output for the train and test
        descriptions respectively.
    """
    # Use the frames that were passed in. The original overwrote both
    # arguments with the globals X_dev / X_eval, so the outlier-filtered
    # call received rows that did not match its feature matrix.
    # Clean with cleanDescriptions (the original called preprocessText
    # recursively here) and leave the callers' frames unmodified.
    train_clean = description_train["description"].apply(cleanDescriptions)
    test_clean = description_test["description"].apply(cleanDescriptions)

    pipe = Pipeline([
        ('count', CountVectorizer(min_df=min_df,
                                  max_df=max_df,
                                  ngram_range=ngram_range,
                                  token_pattern=r'\b[^\d\W]+\b')),
        ('tfid', TfidfTransformer()),
    ])

    dev_vec = pipe.fit_transform(train_clean)
    eval_vec = pipe.transform(test_clean)

    return dev_vec, eval_vec


## Load data

In [None]:
# Development set: read it and drop fully duplicated rows
# (the surviving rows keep their original index labels).
X_dev = loadData('dev.tsv').drop_duplicates()

# Evaluation set: used as read.
X_eval = loadData('eval.tsv')

## Preprocessing α 

In [None]:
####################################################
## Preprocessing - removing the outliers : α
####################################################

# 1.5 * IQR rule: a quality value is an outlier when it lies more than
# 1.5 interquartile ranges below the first or above the third quartile.
t = X_dev["quality"].quantile(0.75) - X_dev["quality"].quantile(0.25)
min_t = X_dev["quality"].quantile(0.25) - 1.5 * t
max_t = X_dev["quality"].quantile(0.75) + 1.5 * t

# Keep only the rows whose quality lies inside [min_t, max_t]
# (Series.between is inclusive on both ends by default).
X_d_filtered = X_dev[X_dev["quality"].between(min_t, max_t)]

# Encode the categorical columns and handle missing values.
X_dev_prep_f, y_f, X_eval_prep_f = preprocessing(X_d_filtered, X_eval)

# Build the TF-IDF matrices from the descriptions.
dev_vec_f, eval_vec_f = preprocessText(X_d_filtered[["description"]].copy(),
                                       X_eval[["description"]].copy())

# Concatenate the encoded features and the TF-IDF features column-wise.
# This must use the *_f (filtered) matrices: the original stacked the
# unfiltered X_dev_prep / dev_vec, which are only defined in a later cell
# and whose row count does not match y_f.
X_conc_dev_f = hstack((X_dev_prep_f, dev_vec_f))
X_conc_eval_f = hstack((X_eval_prep_f, eval_vec_f))

## Preprocessing β

In [None]:
%%time

####################################################
## Preprocessing - without removing the outliers : β
####################################################

# One-hot encode the full (unfiltered) development set together with the
# evaluation set; missing values are handled inside preprocessing().
X_dev_prep, y, X_eval_prep = preprocessing(X_dev, X_eval)

# TF-IDF features from the description column of each set.
dev_vec, eval_vec = preprocessText(
    X_dev[["description"]].copy(),
    X_eval[["description"]].copy(),
)

# Stack encoded features and TF-IDF features column-wise.
X_conc_dev = hstack([X_dev_prep, dev_vec])
X_conc_eval = hstack([X_eval_prep, eval_vec])

## Model evaluation

In [None]:
%%time

####################################################
## Evaluate : α
####################################################

fig, ax = plt.subplots(figsize=(5, 5))

models = [LinearRegression(), SGDRegressor()]
targets = ["LinearRegression", "SGDRegressor"]

# Cross-validated R2 scores on the outlier-filtered (α) data.
scores_f = evaluateModels(models, targets, X_conc_dev_f, y_f)

# Plot the α scores. The original plotted `scores`, which belongs to the
# β evaluation and is only defined in a later cell.
sns.boxplot(x="variable", y="value", data=pd.melt(scores_f), ax=ax, palette="OrRd_r")
plt.xlabel("Model")
plt.ylabel("R2_Score")

In [None]:
####################################################
## Evaluate : β
####################################################

fig, ax = plt.subplots(figsize=(5, 5))

# The original list literal contained a doubled comma, which is a
# SyntaxError and prevented the cell from running at all.
models = [LinearRegression(), SGDRegressor()]
targets = ["LinearRegression", "SGDRegressor"]

# Cross-validated R2 scores on the unfiltered (β) data.
scores = evaluateModels(models, targets, X_conc_dev, y)
sns.boxplot(x="variable", y="value", data=pd.melt(scores), ax=ax, palette="OrRd_r")
plt.xlabel("Model")
plt.ylabel("R2_Score")

## Hyperparameter tuning

In [None]:
def doGridSearch(model, hyperparams, X, y):
    """Run a 3-fold, R2-scored grid search and return the fitted search.

    Parameters
    ----------
    model
        Estimator to tune.
    hyperparams : dict
        Parameter grid forwarded as ``param_grid``.
    X, y
        Training features and target passed to ``fit``.

    Returns
    -------
    GridSearchCV
        The search object after fitting.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=hyperparams,
        scoring='r2',
        cv=3,
        n_jobs=4,
        verbose=True,
    )
    # fit() returns the search object itself.
    return search.fit(X, y)

In [None]:
####################################################
## Grid search linear regression
####################################################

# NOTE(review): the `normalize` parameter of LinearRegression was
# deprecated in scikit-learn 1.0 and removed in 1.2 - drop it from the grid
# if this notebook is run on a recent version.
hyperparams_LR = {
    'fit_intercept': [True, False],
    'normalize': [True, False]
}

# doGridSearch takes (model, hyperparams, X, y); the original call passed an
# extra "LinearRegression" label, which raised TypeError.
# NOTE(review): the original used `train_supremo` / `test_supremo`, which
# are not defined in this notebook. X_conc_dev / X_conc_eval are used here
# because they are the matrices whose rows match `y` - confirm.
gs_lr = doGridSearch(LinearRegression(), hyperparams_LR, X_conc_dev, y)

print(f"Best params:\t{gs_lr.best_params_}")
print(f"Best score:\t{gs_lr.best_score_}")

# Predict on the evaluation set and write the submission file.
y_pred_lr = gs_lr.predict(X_conc_eval)
get_final_csv(list(X_eval.index), y_pred_lr, "submit-linear-regression.csv")

In [None]:
####################################################
## Grid search SGD Regressor
####################################################

# NOTE(review): 'squared_loss' was renamed to 'squared_error' in
# scikit-learn 1.0 and the old name was removed in 1.2 - update it if this
# notebook is run on a recent version.
hyperparams_SGD = {
    'loss': ['squared_loss', 'huber'],
    'penalty': ['l1', 'l2', None],
    'alpha': np.logspace(-5, 0, 6),
    'eta0': [0.01, 0.1]
}

# doGridSearch takes (model, hyperparams, X, y); the original call passed an
# extra "SGDRegressor" label, which raised TypeError.
# NOTE(review): the original used `train_supremo` / `test_supremo`, which
# are not defined in this notebook. X_conc_dev / X_conc_eval are used here
# because they are the matrices whose rows match `y` - confirm.
gs_sgd = doGridSearch(SGDRegressor(max_iter=10000), hyperparams_SGD, X_conc_dev, y)
print(f"Best params:\t{gs_sgd.best_params_}")
print(f"Best score:\t{gs_sgd.best_score_}")

# Predict on the evaluation set and write the submission file.
y_pred_sgd = gs_sgd.predict(X_conc_eval)
get_final_csv(list(X_eval.index), y_pred_sgd, "submit-sgd-regressor.csv")

## Final prediction 

In [None]:
## model used for the final score