In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn import svm

import scipy

import re
import pandas as pd
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
import spacy

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import ParameterGrid
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "whitegrid" style for the plots drawn in this notebook.
sns.set_style("whitegrid")

# Setup snippet kept as an inert string literal, not executed: it would
# download NLTK's 'stopwords' corpus. Being the last expression of the cell,
# the string itself is echoed as the cell output.
"""
import nltk
nltk.download('stopwords')
"""

"\nimport nltk\nnltk.download('stopwords')\n"

In [2]:
def loadData(directory):
    """Read a tab-separated file into a DataFrame.

    Parameters
    ----------
    directory : str or path-like
        Location of the TSV file. Despite the name, the value is handed
        straight to ``pd.read_csv`` as the file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(directory, delimiter="\t")

def get_final_csv(ids, y, filename):
    """Write predictions to a comma-separated file.

    The file gets a header row and two columns, ``Id`` and ``Predicted``;
    the DataFrame index is not written.

    Parameters
    ----------
    ids : sequence
        Identifiers, one per prediction.
    y : sequence
        Predicted values, in the same order as ``ids``.
    filename : str or path-like
        Destination of the CSV file.
    """
    submission = pd.DataFrame({"Id": ids, "Predicted": y})
    submission.to_csv(filename, sep=",", index=False)
    
def preprocessingV1(X_d,X_e):
    """One-hot encode the development and evaluation frames with shared columns.

    Both frames lose their ``region_2`` and ``description`` columns and have
    missing values replaced by the literal ``"other"``. They are then stacked
    so that ``pd.get_dummies`` derives a single set of dummy columns for both,
    and split back by row position.

    Parameters
    ----------
    X_d : pandas.DataFrame
        Development frame. Must contain ``region_2``, ``description`` and the
        target column ``quality``. The caller's frame is not modified.
    X_e : pandas.DataFrame
        Evaluation frame. Must contain ``region_2`` and ``description``.
        The caller's frame is not modified.

    Returns
    -------
    tuple
        ``(dev_matrix, y, eval_matrix)`` where the two matrices are
        ``scipy.sparse.csr_matrix`` objects with identical columns and ``y``
        is the ``quality`` Series of ``X_d`` (original index preserved).

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    """
    # Plain `import scipy` is not guaranteed to load the `sparse` submodule,
    # so bring it into scope explicitly.
    from scipy import sparse

    unused_columns = ["region_2", "description"]
    dev = X_d.drop(columns=unused_columns).fillna("other")
    evaluation = X_e.drop(columns=unused_columns).fillna("other")

    y = dev["quality"]
    dev = dev.drop(columns=["quality"])
    n_dev = dev.shape[0]

    # Row labels are irrelevant once the frames are stacked; resetting them
    # avoids duplicate labels when both inputs carry a default index.
    combined = pd.concat([dev, evaluation], ignore_index=True)

    # Pin the dummy dtype: when dummies come back as bool, mixing them with a
    # numeric pass-through column turns `.values` into an object array, which
    # csr_matrix cannot convert.
    encoded = pd.get_dummies(combined, dtype=np.uint8)
    encoded_sparse = sparse.csr_matrix(encoded.values)

    return encoded_sparse[:n_dev], y, encoded_sparse[n_dev:]

In [4]:
# Load the labelled development set.
X_d = loadData('Dataset/dev.tsv')

# Hold-out split by row position: the first 56000 rows are used for fitting,
# rows 56000-69999 are kept back for scoring.
X_train = X_d.iloc[:56000].copy()
y_train = X_train.quality

X_test = X_d.iloc[56000:70000].copy()
y_test = X_test.quality
# The target is removed from the held-out features only; X_train keeps it.
X_test = X_test.drop(columns=["quality"])

description_train = X_train[["description"]]
description_test = X_test[["description"]]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((56000, 9), (56000,), (14000, 8), (14000,))

In [5]:
# Encode the training and held-out frames together so both matrices share the
# same dummy columns; `y` is the training target returned alongside them.
X_d_prep, y, X_e_prep = preprocessingV1(X_train,X_test)

In [6]:
#######################
## naive
#######################

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [9]:
# Baseline: linear regression on the one-hot encoded metadata only,
# scored with R^2 on the held-out rows.
lr = LinearRegression().fit(X_d_prep, y_train)

y_pred = lr.predict(X_e_prep)
r2_score(y_true=y_test, y_pred=y_pred)

0.7181166491338944

## Sentiment

In [10]:
## SENTIMENT

# English Snowball stemmer; clean_text() uses it to stem every kept token.
stemmer = SnowballStemmer('english')

# Punctuation marks added to the stop-word set.
# NOTE(review): clean_text() replaces every non-letter character with a space
# before it filters stop words, so these entries cannot match anything there.
punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',"%"]
# Extra stop words; these look like stemmed/truncated forms of English stop
# words (e.g. 'abov', 'alreadi') -- TODO confirm where the list came from.
# NOTE(review): clean_text() filters stop words *before* stemming, so these
# stemmed forms are compared against unstemmed tokens.
other_sw = ['anywh', 'el', 'elsewh', 'everywh', 'ind', 'otherwi', 'plea', 'somewh','abov', 
            'afterward', 'alon', 'alreadi', 'alway', 'ani', 'anoth', 'anyon', 'anyth', 
            'anywher', 'becam', 'becaus', 'becom', 'befor', 'besid', 'cri', 'describ', 
            'dure', 'els', 'elsewher', 'empti', 'everi', 'everyon', 'everyth', 'everywher', 
            'fifti', 'forti', 'henc', 'hereaft', 'herebi', 'howev', 'hundr', 'inde', 'mani', 
            'meanwhil', 'moreov', 'nobodi', 'noon', 'noth', 'nowher', 'onc', 'onli', 'otherwis', 
            'ourselv', 'perhap', 'pleas', 'sever', 'sinc', 'sincer', 'sixti', 'someon', 'someth', 
            'sometim', 'somewher', 'themselv', 'thenc', 'thereaft', 'therebi', 'therefor', 'togeth', 
            'twelv', 'twenti', 'veri', 'whatev', 'whenc', 'whenev', 'wherea', 'whereaft', 'wherebi', 
            'wherev', 'whi', 'yourselv']

# Final stop-word set: scikit-learn's built-in English list plus the two
# lists above. Read by clean_text().
stop_words = text.ENGLISH_STOP_WORDS.union(punc).union(other_sw)



def clean_text(text):
    """Normalise a free-text description into a space-separated string of stems.

    Steps: lowercase the text, replace every run of characters other than
    ``a-z`` and ``àèéìòù`` with a single space, drop the tokens found in the
    module-level ``stop_words`` set, then stem the remaining tokens with the
    module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (``None``, or the float NaN
        pandas uses for a missing cell) is treated as an empty document.
        Inside this function the name shadows the ``text`` module imported
        from ``sklearn.feature_extraction``.

    Returns
    -------
    str
        Stemmed tokens joined by single spaces; ``''`` when nothing is left.
    """
    # Missing cells arrive as None/NaN when this is applied to a column;
    # re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''

    # Lowercase *before* filtering: the character class lists only lowercase
    # accented letters, so filtering first would throw away 'É', 'À', ...
    # (and split the word) even though their lowercase forms are allowed.
    letters_only = re.sub('[^a-zàèéìòù]+', ' ', text.lower())

    # Stop words are matched against the unstemmed tokens; only the tokens
    # that survive the filter are stemmed.
    return ' '.join(
        stemmer.stem(token)
        for token in letters_only.split()
        if token not in stop_words
    )

## get sentiment
import numpy as np
def getSentiment(X):
    """Add a binary ``sentiment`` column derived from ``quality``, in place.

    A row is labelled ``0`` when its quality is strictly below the mean
    quality of ``X`` and ``1`` otherwise.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a ``quality`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying ``sentiment``.
    """
    threshold = np.mean(X["quality"])

    def label(score):
        # Strict comparison: a score equal to the mean is labelled 1.
        if score < threshold:
            return 0
        return 1

    X["sentiment"] = X["quality"].apply(label)
    return X

In [16]:
# Build one-column frames holding the cleaned, stemmed descriptions; the row
# index of X_train / X_test is carried over unchanged.
description_train = X_train["description"].apply(clean_text).to_frame("description_cleaned")
description_test = X_test["description"].apply(clean_text).to_frame("description_cleaned")

In [18]:
# Candidate vectoriser settings.
params = {
  "min_df": [3, 4], 
  "max_df": [0.5],
  "ngram_range": [(1,3)]
}

# No candidate is scored, so fitting every grid point and keeping only the
# last one would waste all but one fit. Fit the last grid point directly
# (min_df=4 for the grid above): it is the one whose `pipe` / `vectors` the
# following cells consume.
config = list(ParameterGrid(params))[-1]

# Token counts over word n-grams (tokens made of letters/underscores only),
# re-weighted by TF-IDF.
pipe = Pipeline([('count', CountVectorizer(**config,  token_pattern=r'\b[^\d\W]+\b' )), ('tfid', TfidfTransformer())])
vectors = pipe.fit_transform(description_train.description_cleaned)

In [19]:
# (training documents, n-gram features kept by the vectoriser)
vectors.shape

(56000, 91247)

In [20]:
# transform only (no refit): the held-out descriptions are mapped onto the
# vocabulary and IDF weights learned from the training descriptions.
ev_x = pipe.transform(description_test.description_cleaned) 

In [21]:
# The column count should match the training text matrix (`vectors`).
ev_x.shape

(14000, 91247)

In [23]:
from scipy.sparse import hstack

# Append the text features to the right of the one-hot metadata columns.
# Rows keep their order, so they still line up with y_train / y_test.
train_supremo = hstack((X_d_prep, vectors))
test_supremo = hstack((X_e_prep, ev_x))

In [24]:
# Both combined matrices should report the same number of columns.
train_supremo.shape, test_supremo.shape

((56000, 126545), (14000, 126545))