In [None]:
from scipy.sparse import hstack
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression,SGDRegressor, Ridge
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, cross_val_score
import scipy

import re
import pandas as pd
from nltk.corpus import stopwords
import spacy

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import text

import matplotlib.pyplot as plt
import seaborn as sns
# Select seaborn's "darkgrid" plot style.
sns.set_style("darkgrid")

import nltk
# Fetch the NLTK resources this notebook relies on: WordNet (used by the
# lemmatizer), the stop-word corpus, and the Punkt tokenizer models.
nltk.download('wordnet')
# the resource identifier is 'stopwords'; a misspelt name is not found by the
# downloader, so the corpus would never be installed
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
def loadData(directory):
    """Read a tab-separated file into a DataFrame.

    Parameters
    ----------
    directory : str or file-like
        Despite the name, this is the path of the file itself (or any other
        source accepted by ``pd.read_csv``), not of a folder.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with the first line used as the header.
    """
    return pd.read_csv(directory, sep="\t")

def get_final_csv(ids, y, filename):
    """Write the predictions as a two-column, comma-separated file.

    Parameters
    ----------
    ids : sequence
        Values for the ``Id`` column.
    y : sequence
        Values for the ``Predicted`` column, aligned with ``ids``.
    filename : str or file-like
        Destination handed to ``DataFrame.to_csv``.

    The DataFrame index is not written.
    """
    submission = pd.DataFrame({"Id": ids, "Predicted": y})
    submission.to_csv(filename, sep=",", index=False)

    
# Candidate regressors, all built with their default hyper-parameters.
models = [
    LinearRegression(),
    Ridge(),
    SGDRegressor(),
]
# Display names, positionally aligned with `models`.
targets = [
    "LinearRegression",
    "Ridge",
    "SGDRegressor",
]
def evaluateModels(models, targets,X,y):
    """Cross-validate several estimators and tabulate their R^2 scores.

    Parameters
    ----------
    models : iterable
        Estimators to evaluate.
    targets : iterable of str
        Column label for each estimator; paired with ``models`` by position.
    X, y
        Features and target, forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        One column per label, one row per fold (3-fold CV, ``scoring='r2'``).
    """
    fold_scores = {}
    for label, estimator in zip(targets, models):
        # n_jobs=-1 is passed through so the folds may run in parallel
        fold_scores[label] = cross_val_score(
            estimator, X, y, scoring='r2', cv=3, n_jobs=-1
        )

    return pd.DataFrame(fold_scores)

def preprocessing(X_d,X_e):
    """One-hot encode the development and evaluation features as sparse matrices.

    Steps, in order:
      1. drop ``region_2`` and ``description`` from both frames;
      2. split the ``quality`` target off the development frame;
      3. replace missing ``country`` / ``province`` values with the most
         frequent value observed in the development frame, in BOTH frames,
         so the two sets are treated consistently;
      4. replace every remaining missing value with the sentinel ``"other"``;
      5. one-hot encode the two frames together so they share the same columns.

    Parameters
    ----------
    X_d : pandas.DataFrame
        Development set. Must contain ``region_2``, ``description``,
        ``country``, ``province`` and ``quality``.
    X_e : pandas.DataFrame
        Evaluation set. Must contain ``region_2``, ``description``,
        ``country`` and ``province``. Presumably it has no ``quality``
        column (verify against the caller): it is not removed here, so if
        present it would be encoded as a feature.

    Returns
    -------
    tuple
        ``(dev_matrix, y, eval_matrix)`` where the two matrices are
        ``scipy.sparse.csr_matrix`` slices of the joint encoding and ``y`` is
        the ``quality`` column of ``X_d``.

    Neither input frame is modified: ``drop`` returns new frames and every
    later assignment happens on those.
    """
    # region_2 has too many duplicates
    # description will be manipulated separately
    X_d = X_d.drop(columns=["region_2","description"])
    X_e = X_e.drop(columns=["region_2","description"])

    # separate the target first so the sentinel fill below can never touch it
    y = X_d["quality"]
    X_d = X_d.drop(columns=["quality"])

    # impute country and province with the most common development value.
    # The statistic is learnt on the development set only and then applied to
    # both sets, so a missing evaluation value maps to a category the model has
    # actually seen instead of falling through to "other".
    for column in ("country", "province"):
        modes = X_d[column].mode()  # sorted, so ties resolve to the smallest value
        if modes.empty:
            # no observed value at all: leave it to the sentinel fill below
            continue
        most_common = modes.iloc[0]
        X_d[column] = X_d[column].fillna(most_common)
        X_e[column] = X_e[column].fillna(most_common)

    # fill designation (and anything else still missing) with the sentinel "other"
    X_d = X_d.fillna("other")
    X_e = X_e.fillna("other")

    # concat dev and eval so both get exactly the same dummy columns
    df = pd.concat([X_d,X_e])

    # encode, then keep only the sparse representation.
    # NOTE(review): get_dummies still materialises a dense frame before the CSR
    # conversion, so peak memory is that of the dense encoding.
    df_enc = pd.get_dummies(df)
    df_enc_scipy = scipy.sparse.csr_matrix(df_enc.values)

    # split back into the development rows (first) and the evaluation rows
    n_dev = X_d.shape[0]
    return df_enc_scipy[:n_dev], y, df_enc_scipy[n_dev:]

from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords as sw

# general structure learnt from Lab10
class LemmaTokenizer(object):
    """Callable tokenizer: tokenize a document, clean each token, lemmatize it.

    An instance can be called with a single document string and returns the
    list of lemmas that survive the cleaning and the length filter.
    """

    def __init__(self):
        # one lemmatizer instance is reused for every document
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, document):
        """Return the cleaned lemmas of ``document`` that are longer than 2 characters."""
        kept = []

        for raw_token in word_tokenize(document):
            # trim surrounding whitespace and normalise the case
            cleaned = raw_token.strip().lower()
            # strip everything that is neither a word character nor whitespace
            cleaned = re.sub(r'[^\w\s]','',cleaned)
            # strip digits (the class also lists '+', which the previous
            # substitution has already removed)
            cleaned = re.sub(r'[\d+]','',cleaned)

            root = self.lemmatizer.lemmatize(cleaned)
            # discard empty and very short lemmas
            if len(root) <= 2:
                continue
            kept.append(root)

        return kept

def preprocessText(description_train,description_test):
    """Turn the ``description`` column of both frames into TF-IDF matrices.

    The vectorizer uses ``LemmaTokenizer`` for tokenization, the built-in
    "english" stop-word list, and n-grams of length 1 to 4. It is fitted on
    the training descriptions only and then applied to the test descriptions.

    Parameters
    ----------
    description_train, description_test : pandas.DataFrame
        Frames exposing a ``description`` column.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix)`` as produced by the vectorizer.
    """
    tfidf = TfidfVectorizer(
        tokenizer=LemmaTokenizer(),
        stop_words="english",
        ngram_range=(1,4),
    )

    # learn vocabulary and IDF weights from the development descriptions...
    train_matrix = tfidf.fit_transform(description_train.description)
    # ...and reuse them, unchanged, for the evaluation descriptions
    test_matrix = tfidf.transform(description_test.description)

    return train_matrix, test_matrix

In [None]:
# X_dev / X_eval are loaded here so that this cell runs on a fresh kernel
# (Restart & Run All) instead of depending on variables left over from an
# earlier session.
# NOTE(review): the two paths are placeholders — TODO confirm the real
# locations of the development and evaluation tab-separated files.
DEV_PATH = "dev.tsv"
EVAL_PATH = "eval.tsv"

# load both sets; duplicate rows are dropped from the development set only
X_dev = loadData(DEV_PATH).drop_duplicates()
X_eval = loadData(EVAL_PATH)

# encode the categorical features and handle np.nan
X_dev_prep, y, X_eval_prep = preprocessing(X_dev,X_eval)

# preprocess the descriptions
dev_vec, eval_vec = preprocessText(X_dev[["description"]].copy(),
                                   X_eval[["description"]].copy())

# concat the encoded df and the tf-idf (column-wise, rows stay aligned)
X_conc_dev = hstack((X_dev_prep, dev_vec))
X_conc_eval = hstack((X_eval_prep, eval_vec))

# fit the final model on the whole development set and predict the evaluation set
model = Ridge(alpha=0.01)
model.fit(X_conc_dev,y)
y_pred = model.predict(X_conc_eval)

# the evaluation frame's index is used as the submission Id
get_final_csv(list(X_eval.index),y_pred,"submit.csv")