In [1]:
# Enable IPython's autoreload extension; mode 2 reloads every imported module
# before each cell runs, so edits to local helper modules are picked up live.
%load_ext autoreload
%autoreload 2
# NOTE(review): reloading the extension right after loading it looks redundant — confirm before removing.
%reload_ext autoreload

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler,MaxAbsScaler
from sklearn.metrics import mean_squared_error


In [None]:

import p5_util
def tfidf_vectorizer(X_, vectorizer=None, scaler=None):
    """Vectorize a corpus with TF-IDF and scale the resulting matrix.

    Estimators that are already fitted are only applied (``transform``), so
    the vectorizer and scaler returned for the training set can be passed
    back in to project a test set onto the same feature space. Unfitted (or
    omitted) estimators are fitted on ``X_``.

    Parameters
    ----------
    X_ : iterable of str
        Documents to vectorize.
    vectorizer : object, optional
        Exposes ``fit_transform`` and ``transform``. When None, a unigram
        ``TfidfVectorizer`` (l2 norm, min_df=0.001, max_df=1.0) is created.
    scaler : object, optional
        Exposes ``fit_transform`` and ``transform``. When None, a
        ``MaxAbsScaler`` is created.

    Returns
    -------
    tuple
        ``(X_scaled, vectorizer, scaler)``: the scaled matrix and the two
        estimators that produced it.
    """
    def _is_fitted(estimator):
        # scikit-learn convention: attributes learned by fit() end with a
        # single trailing underscore (vocabulary_, scale_, ...).
        learned = getattr(estimator, "__dict__", {})
        return any(name.endswith("_") and not name.startswith("__") for name in learned)

    if vectorizer is None :
        # When building vocabulary, terms with document frequency < p_min_df are ignored.
        p_min_df = 0.001

        # When building vocabulary, terms with document frequency > p_max_df are ignored.
        p_max_df = 1.

        print("MIN DF= "+str(p_min_df)+"  MAX DF = "+str(p_max_df))
        ngram1=1
        ngram2=1
        vectorizer=TfidfVectorizer(norm="l2", use_idf=True, min_df=p_min_df, max_df=p_max_df, ngram_range=(ngram1, ngram2))
    if scaler is None :
        scaler = MaxAbsScaler()

    # Refitting an already-fitted vectorizer would rebuild the vocabulary from
    # X_ and yield columns that no longer match the model's training features.
    if _is_fitted(vectorizer):
        csr_matrix_tfidf_ngram1 = vectorizer.transform(X_)
    else:
        csr_matrix_tfidf_ngram1 = vectorizer.fit_transform(X_)

    print(csr_matrix_tfidf_ngram1.shape)

    # Same rule for the scaler: reuse the statistics learned earlier when present.
    if _is_fitted(scaler):
        X_scaled = scaler.transform(csr_matrix_tfidf_ngram1)
    else:
        X_scaled = scaler.fit_transform(csr_matrix_tfidf_ngram1)
    return X_scaled, vectorizer, scaler

#### Read data from files

In [None]:
import pandas as pd
# Both archives share one CSV dialect: zipped file, header on the first row,
# comma-separated fields, double-quote as the quoting character.
csv_options = dict(compression='zip', header=0, sep=',', quotechar='"')
df_test = pd.read_csv('./data/test.csv.zip', **csv_options)
df_train = pd.read_csv('./data/train.csv.zip', **csv_options)

#### Replace NaN values depending on column type

In [None]:
df_train.head()

In [None]:
df_train.columns

#### Replace all nan values with 0.

In [None]:
import numpy as np

# Visit every training column; columns without any null are left untouched.
# For the others, print the column name, map 'nan' strings to NaN and then
# replace every NaN with 0.
for col in df_train.columns:
    if not df_train[col].isnull().any():
        continue
    print(col)
    df_train[col] = df_train[col].replace('nan', np.nan).fillna(0)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['comment_text'],
    df_train['target'],
    test_size=0.33,
    random_state=42,
)


# <font color=blue> Standardization</font>

Standardization is applied as follows:
* Verbs from sentences are removed
* Stop words are removed
* Lemmatization is applied
* English stemming is applied

In [None]:
import p5_util
import p6_util
import p6_util_plot

import os

file_name='./data/ser_train_std.dump'

# Cache guard: reuse the dumped series when it exists; otherwise standardize
# X_train once and dump the result, instead of flipping a hard-coded
# `if False` by hand (which fails on a machine where the dump is missing).
if os.path.exists(file_name):
    ser_train_std = p5_util.object_load(file_name)
else:
    ser_train_std = p6_util.p6_df_standardization(X_train,
                                                  is_sentence_filter=False,
                                                  is_stemming=True,
                                                  is_lem=True,
                                                  list_to_keep=list(),
                                                  is_lxml=False)
    p5_util.object_dump(ser_train_std, file_name)

In [None]:
# Print the first standardized entry, then end the cell on head(): only the
# last expression of a cell is echoed, so head() placed first was never shown.
print(ser_train_std.iloc[0])
ser_train_std.head()

In [3]:
import p5_util
# Suffix inserted before '.dump' in every file name:
# '_full' selects the *_full.dump files, '' selects the default ones.
extension = ''

# Load the four dumps in a fixed order: X_train, X_test, y_train, y_test.
token_dumps = {}
for dump_name in ('X_train_token', 'X_test_token', 'y_train_token', 'y_test_token'):
    filename = './data/' + dump_name + extension + '.dump'
    token_dumps[dump_name] = p5_util.object_load(filename)

X_train_token = token_dumps['X_train_token']
X_test_token = token_dumps['X_test_token']
y_train_token = token_dumps['y_train_token']
y_test_token = token_dumps['y_test_token']

# Report how many entries each loaded object holds.
print("\nX_train_encoded shape = {}".format(len(X_train_token)))
print("X_test_encoded shape  = {}".format(len(X_test_token)))
print("Y train shape= {}".format(len(y_train_token)))
print("Y test shape= {}".format(len(y_test_token)))


p5_util.object_load : fileName= ./data/X_train_token.dump
p5_util.object_load : fileName= ./data/X_test_token.dump
p5_util.object_load : fileName= ./data/y_train_token.dump
p5_util.object_load : fileName= ./data/y_test_token.dump

X_train_encoded shape = 362779
X_test_encoded shape  = 178682
Y train shape= 362779
Y test shape= 178682


In [4]:
# Rebuild one space-separated string per document from its token sequence.
list_X_train = list(map(" ".join, X_train_token))
list_X_test = list(map(" ".join, X_test_token))

# <font color=blue>Linear Regression </font>

## <font color=blue>Building an estimator pipeline </font>

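* Estimator pipeline is built on a bag-of-words count matrix (`CountVectorizer`); the TF-IDF step is disabled in the pipeline below.
* Standard scaling is applied with `with_mean=False`: features are scaled to unit variance but not centered, so their mean is not shifted to 0.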

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler

# Linear-regression baseline: token counts -> variance scaling without
# centering (with_mean=False) -> ordinary least squares.
lr_steps = [
    ('vect', CountVectorizer()),
    # TF-IDF re-weighting is left disabled for this baseline:
    #('tfidf',  TfidfTransformer()),
    ('scale', StandardScaler(with_mean=False)),
    ('lr', LinearRegression()),
]
pipeline_lr = Pipeline(lr_steps)


In [7]:
import numpy as np

# Fit the linear-regression pipeline on the joined token strings and their targets.
model_lr = pipeline_lr.fit(np.array(list_X_train), np.array(y_train_token))



In [8]:
# Predict targets for the held-out joined token strings (list_X_test).
y_predict = model_lr.predict(list_X_test)



In [21]:
from sklearn.metrics import r2_score
# R² of the linear-regression predictions; both targets and predictions are
# rounded to one decimal before scoring.
r2_score(np.round(y_test_token,1), np.round(y_predict,1))

0.007290387463792958

In [24]:
import sklearn

# MSE of the linear-regression predictions, computed on values rounded to one
# decimal; uses the mean_squared_error name imported at the top of the notebook.
mean_squared_error(
    np.round(y_test_token, 1),
    np.round(y_predict, 1),
)

0.038980647183264126

In [28]:
dir(pipeline_lr)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_cache',
 '_abc_negative_cache',
 '_abc_negative_cache_version',
 '_abc_registry',
 '_estimator_type',
 '_final_estimator',
 '_fit',
 '_get_param_names',
 '_get_params',
 '_inverse_transform',
 '_pairwise',
 '_replace_estimator',
 '_set_params',
 '_transform',
 '_validate_names',
 '_validate_steps',
 'classes_',
 'decision_function',
 'fit',
 'fit_predict',
 'fit_transform',
 'get_params',
 'inverse_transform',
 'memory',
 'named_steps',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'score',
 'set_params',
 'steps',
 'transform']

In [33]:
pipeline_lr.named_steps['lr'].coef_.shape

(128879,)

In [20]:
print(np.round(y_predict[100:110],1))
print(np.round(y_test_token[100:110],1))

[ 0.1  0.   0.  -0.   0.1  0.2  0.1  0.5  0.   0.1]
[0.  0.  0.  0.  0.  0.  0.  0.2 0.2 0. ]


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# y_predict was produced from list_X_test (the token dumps), so it has to be
# scored against the matching y_test_token targets; y_test belongs to the
# separate raw train_test_split and is not aligned with these predictions.
rmse = np.sqrt(mean_squared_error(y_test_token, y_predict))
print("RMSE for Linear Regression: %.4f" % rmse)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosting pipeline: token counts -> TF-IDF weighting -> variance
# scaling without centering -> GradientBoostingRegressor with default settings.
gbr_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('scale', preprocessing.StandardScaler(with_mean=False)),
    ('gbr', GradientBoostingRegressor()),
]
pipeline_gbr = Pipeline(steps=gbr_steps)


In [None]:
# Fit the gradient-boosting pipeline on the raw comment split (X_train / y_train).
model = pipeline_gbr.fit(X_train, y_train)

In [None]:
from sklearn.metrics import r2_score

# Predictions are made for X_test (raw train_test_split), so they are scored
# against y_test from that same split rather than the token-dump targets.
y_predict = model.predict(X_test)
r2_score(y_test, y_predict)

In [None]:
from sklearn.metrics import mean_squared_error

# Print the MSE of the current predictions against y_test; the cell's last
# expression echoes the corresponding RMSE.
mse = mean_squared_error(y_test, y_predict)
print("MSE: %.4f" % mse)
np.sqrt(mse)

In [None]:
# Value range of the scaled TF-IDF matrix.
# NOTE(review): X_scaled is only assigned by the tfidf_vectorizer call in the
# following cell — on a fresh kernel this cell has to run after it.
X_scaled.min(),X_scaled.max()

In [None]:
gbr = GradientBoostingRegressor()
X_scaled, scaler = tfidf_vectorizer(X_train)
model = gbr.fit(X_scaled,y_train)

In [None]:
from sklearn.metrics import mean_squared_error
X_test_scaled = tfidf_vectorizer(X_test)

In [None]:
# Predict from X_test_scaled, print the MSE against y_test and echo the RMSE
# as the cell's last expression.
y_predict = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_predict)
print("MSE: %.4f" % mse)
np.sqrt(mse)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb

# Hyper-parameters handed to the XGBoost regressor.
xgbr_params = dict(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=100,
)

# XGBoost pipeline: token counts -> TF-IDF weighting -> variance scaling
# without centering -> XGBRegressor.
pipeline_xgbr = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('scale', preprocessing.StandardScaler(with_mean=False)),
    ('xgbr', xgb.XGBRegressor(**xgbr_params)),
])


In [None]:
# Train the XGBoost pipeline on the raw comment split.
model_xgbr = pipeline_xgbr.fit(X_train, y_train)

# Score the held-out comments and report the root mean squared error.
y_preds = model_xgbr.predict(X_test)
mse_xgbr = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse_xgbr)
print("RMSE: %f" % rmse)


## XGBOOST with TFIDF NGRAM=(2,2)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import preprocessing
import xgboost as xgb


# Bigram-only n-gram range.
ngram1 = 2
ngram2 = 2
# When building vocabulary, terms with document frequency < p_min_df are ignored.
p_min_df = 0.01
# When building vocabulary, terms with document frequency > p_max_df are ignored.
p_max_df = 1.

vectorizer = TfidfVectorizer(
    norm="l2",
    use_idf=True,
    min_df=p_min_df,
    max_df=p_max_df,
    ngram_range=(ngram1, ngram2),
)

# Hyper-parameters handed to the XGBoost regressor.
xgbr_params = dict(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=100,
)

# TF-IDF bigrams -> variance scaling without centering -> XGBRegressor.
pipeline_xgbr = Pipeline(steps=[
    ('tfidf', vectorizer),
    ('scale', preprocessing.StandardScaler(with_mean=False)),
    ('xgbr', xgb.XGBRegressor(**xgbr_params)),
])


In [None]:
# Train the bigram XGBoost pipeline on the raw comment split.
model_xgbr = pipeline_xgbr.fit(X_train, y_train)

# Score the held-out comments and report the root mean squared error.
y_preds = model_xgbr.predict(X_test)
mse_xgbr = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse_xgbr)
print("RMSE: %f" % rmse)

## XGBOOST with Standardized Xtrain

In [None]:
# Reload the series stored in ser_train_std.dump (the path the standardization
# cell earlier in the notebook dumps to and loads from).
file_name='./data/ser_train_std.dump'
ser_train = p5_util.object_load(file_name)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import preprocessing
import xgboost as xgb


# Unigram-only n-gram range.
ngram1 = 1
ngram2 = 1
# When building vocabulary, terms with document frequency < p_min_df are ignored.
p_min_df = 0.
# When building vocabulary, terms with document frequency > p_max_df are ignored.
p_max_df = 1.

vectorizer = TfidfVectorizer(
    norm="l2",
    use_idf=True,
    min_df=p_min_df,
    max_df=p_max_df,
    ngram_range=(ngram1, ngram2),
)

# Hyper-parameters handed to the XGBoost regressor.
xgbr_params = dict(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.01,
    max_depth=5,
    alpha=10,
    n_estimators=100,
    nthread=-1,
)

# TF-IDF unigrams -> variance scaling without centering -> XGBRegressor.
pipeline_xgbr = Pipeline(steps=[
    ('tfidf', vectorizer),
    ('scale', preprocessing.StandardScaler(with_mean=False)),
    ('xgbr', xgb.XGBRegressor(**xgbr_params)),
])


In [None]:
# Fit the pipeline on the values of the reloaded standardized series.
# NOTE(review): assumes ser_train rows are aligned one-to-one with y_train — confirm.
model_xgbr=pipeline_xgbr.fit(ser_train.values, y_train)

In [None]:
# Predict on X_test and print the RMSE against y_test.
# NOTE(review): model_xgbr was fitted on ser_train (loaded from the
# standardized dump) while X_test is the raw split — confirm whether X_test
# should go through the same standardization before predicting.
y_preds = model_xgbr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_preds))
print("RMSE: %f" % (rmse))

In [None]:
help(vectorizer)