In [4]:
%load_ext autoreload
%autoreload 2

%reload_ext autoreload

In [5]:
import gensim
import pandas as pd
import numpy as np
is_w2vec_saved = False

In [6]:
import os
from gensim.models import Word2Vec

def get_trained_w2vec(model_filename, X, epoch):
    #-------------------------------------------------------------
    # Check if a model with same name still exists
    #-------------------------------------------------------------
    if model_filename.split('/')[-1] in os.listdir('./data'):
        is_w2vec_saved = True
    else :
        is_w2vec_saved = False

    if is_w2vec_saved is False :        
        #-------------------------------------------------------------------------------------
        # Some words are obfuscated. Then they may appear once in all corpus.
        # In order to avoid this, then min_count is fixed to 1.
        # Window is fixed to 4 : 2 words before and 2 words after central word.
        #-------------------------------------------------------------------------------------
        print("Training W2VEC model...")
        model_w2vec = Word2Vec(min_count=2, workers=6)
        model_w2vec.build_vocab(X)  # prepare the model vocabulary
        model_w2vec.train(X, total_examples=model_w2vec.corpus_count, epochs=epochs) 
        print("Done!\n")
        model_w2vec.save(model_filename)
        print("Model saved!\n")
    else :
        print("Loading W2VEC model...")
        model_w2vec = Word2Vec.load(model_filename)
        print("Done!\n")
    return model_w2vec
    

In [7]:
import pandas as pd
def clean_X_label(X, label) :
    
    ser_ = pd.Series(X)

    list_index = [i for i in ser_.index if len(ser_[i])==0]
    
    ser_.drop(list_index, inplace=True)
    list_to_clean_1 = ser_.tolist()
    
    print("Cleaned empty text = {}".format(len(list_index)))
    ser_ = pd.Series(label)
    ser_.drop(list_index, inplace=True)

    list_to_clean_2 = ser_.tolist()
    
    return list_to_clean_1, list_to_clean_2


In [8]:
import numpy as np
def build_embedding_matrix(w2vec, list_X):
    dim = w2vec.wv.vectors.view().shape[1]
    matrix = np.zeros(dim)
    zero_vec = np.zeros(dim)

    # Words from that are not in vocabulary are replaced with a zero vector.
    # Empty text, mean `list_corpus[i]` is empty is replaced with a zero vector.

    for i in range(0, len(list_X),1) :
        #---------------------------------------------------------------------------------------
        # Text vectorization : mean of words components for text components
        # In case word does not belongs to vacabulatory, then zero vector replace it.
        #---------------------------------------------------------------------------------------
        arr1 = np.mean( [w2vec.wv[word] for word in list_X[i]  if word in w2vec.wv.vocab], axis=0 )

        #---------------------------------------------------------------------------------------
        # Check empty text (empty list of words) and replace it with ezro vector, when required
        #---------------------------------------------------------------------------------------
        if 0 == len(arr1.shape) :
            print(i)
            arr1 = zero_vec
        else :
            pass

        #---------------------------------------------------------------------------------------
        # Matrix of corpus is computed.
        # Each raw is a text from corpus while each column is a feature of the corpus.
        #---------------------------------------------------------------------------------------
        matrix = np.vstack((matrix,arr1))
    #--------------------------------------------------------
    # Remove first raw from matrix
    #--------------------------------------------------------
    matrix = matrix[1:]
    return matrix

In [32]:
def get_sample(ser_X, ser_y, ratio=-1) :
    sample_length = int(len(ser_X)*ratio)
    return ser_X.sample(sample_length).values,ser_y.sample(sample_length).values

In [10]:
class MeanW2VecEmbedding():
    def __init__(self, w2vec) :
        self._w2vec = w2vec
        
    def fit(self, X, y):
        
        return self
    
    def transform(self,X,y):
        range_index = range(0, len(X),1)
        list_X_y = [ (gensim.utils.simple_preprocess(X[index], deacc=False, min_len=2), y[index]) for index in range_index]
        list_X = [list_X_y[i][0] for i in range(0, len(list_X_y),1)]
        list_y = [list_X_y[i][1] for i in range(0, len(list_X_y),1)]
        list_X, list_y = clean_X_label(list_X, list_y)
        X = build_embedding_matrix(self._w2vec, list_X)
        return X, np.array(list_y)
    

# <font color=blus>1. Loading data</font>

All text is lowered.

In [11]:
df_test = pd.read_csv('./data/test.csv.zip', compression='zip', header=0, sep=',', quotechar='"')
df_train = pd.read_csv('./data/train.csv.zip', compression='zip', header=0, sep=',', quotechar='"')

df_train['comment_text'] = df_train['comment_text'].apply(lambda x : x.lower())

## <font color=blus>1.1 Data is splitted into Train and Test datasets</font>

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_train['comment_text'],df_train['target'],test_size=0.33, random_state=42)

print("Train dataset: X = {} Label= {}".format(X_train.shape, y_train.shape))
print("Test dataset: X = {} Label= {}".format(X_test.shape, y_test.shape))

#X_train = X_train[:10000]
#y_train = y_train[:10000]

Train dataset: X = (1209265,) Label= (1209265,)
Test dataset: X = (595609,) Label= (595609,)


#### For testing models, a ratio of train and test datasets are used.

When value is -1, then the whole dataset is used.

In [33]:
ratio=0.3

X_train, y_train  = get_sample(X_train, y_train, ratio=ratio)
X_train.shape, y_train.shape

((362779,), (362779,))

#### Same ratio is used for test dataset.

In [60]:
X_test, y_test  = get_sample(X_test, y_test, ratio=ratio)
X_test.shape, y_test.shape

((178682,), (178682,))

#### Build a corpus sample from train dataset

All texts from corpus are tokenized.

Too short tokens (< 2 characters) and too long tokens (> 15 characters) are igonred.

In [None]:
list_train_corpus_label = [ (gensim.utils.simple_preprocess(X_train[index], deacc=False, min_len=2), y_train[index]) for index in X_train.index]

X_train = [list_train_corpus_label[i][0] for i in range(0, len(list_train_corpus_label),1)]
y_train = [list_train_corpus_label[i][1] for i in range(0, len(list_train_corpus_label),1)]

#### Build a corpus sample from test dataset

In [61]:
list_test_corpus_label = [ (gensim.utils.simple_preprocess(X_test[index], deacc=False, min_len=2), y_test[index]) for index in X_test.index]

X_test = [list_test_corpus_label[i][0] for i in range(0, len(list_test_corpus_label),1)]
y_test = [list_test_corpus_label[i][1] for i in range(0, len(list_test_corpus_label),1)]

#### Corpus is cleaned.

Some text are empty, due to previous pre-processing. They are then removed.
Corresponding y value are also removed, based on indexes values.


In [33]:
X_train, y_train = clean_X_label(X_train, y_train)

Cleaned empty text = 313


In [62]:
X_test, y_test = clean_X_label(X_test, X_test)

Cleaned empty text = 161


# <font color=blue>2.Words embeddings with W2VEC</font>

W2VEC model is trained then saved.


In [41]:
epochs=10
model_filename = './data/model_w2vec_'+str(epochs)+'_epochs'

w2vec = get_trained_w2vec(model_filename, X_train, epochs)

Training W2VEC model...
Done!

Model saved!



#### X_train is transformed into a matrix thanks to W2VEC.

In W2VEC, each word is represented as a vector. 

In this context, a text composed from words may be represented 
as a linear combination of vectors of words.

By default, w2vec model is build as Countinuous Bag of Word (CBOW). This means that by default, the model will be able to predict a word from a given context.


The result is a matrix with : 
 * N raws : number of texts into the corpus.
 * M columns : dimension of W2VEC vectorial space.

Number of columns is provided with attribute `w2vec.wv.vector_size`

In [51]:
X_train = build_embedding_matrix(w2vec, X_train)

In [53]:
y_train =  np.array(y_train)
print("Vectorized X_train and y_train shapes : {} / {}".format(X_train.shape,y_train.shape))

Vectorized X_train and y_train shapes : (362466, 100) / (362466,)


In [None]:
X_test = build_embedding_matrix(w2vec, X_test)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


5350
6600
7627
7861
11350
15967
17606
20185
23796
26204
33351
39953
42592
46615
47373
48015
48238
51659
56403
56526
57062
57858
63469
63594
64064
64477
64571
64797
67950
71508
74502
75261
76608
76776
77289
77625
79163
89305
91294
91468
91665
93132
93159
93962
95026
96100


# <font color=blue> 3. Applying estimators</font>

## 3.1 Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
y_predict = lr.predict(X_test)

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, y_predict)

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

rmse = np.sqrt(mean_squared_error(y_test, y_predict))
print("RMSE for Linear Regression: %.4f" % rmse)

In [None]:
#EPOCHS=10
#RMSE for Linear Regression: 0.1870

#EPOCHS=100
#RMSE for Linear Regression: 0.1833


## 3.2 XgBoost

In [1]:
import xgboost as xgb

In [None]:
xgbr = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.01,
                max_depth = 10, reg_lambda = 5, n_estimators = 500)

In [None]:
xgbr.fit(X_train, y_train)

In [None]:
y_preds = xgbr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_preds))
print("RMSE: %f" % (rmse))

#### Word2Vec is trained over `list_corpus` in order to retrieve words vectors from corpus.

In Word2Vec, 3 matrices occupy memory.
Then wize is : 

vocabulary * matrices * NN_layers * 8 bytes

In [None]:
import os
print(os.listdir('./data'))

In [None]:
model_w2vec.epochs

In [None]:
matrix.shape

In [None]:
import p5_util
file_name = './data/matrix.dump'
p5_util.object_dump(matrix, file_name)

In [None]:
from sklearn.linear_model import LinearRegression
pipeline_lr = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('scale', preprocessing.StandardScaler()),
    ('lr', LinearRegression()),
])


In [None]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

In [None]:
lr.fit(matrix, list_label_cleaned)

In [None]:
np.where(matrix == zero_vec)[0].shape

In [None]:
list_corpus_label = [ (gensim.utils.simple_preprocess(X_test[index], deacc=False, min_len=2), y_test[index]) for index in X_test.index[:3000]]

list_corpus = [list_corpus_label[i][0] for i in range(0, len(list_corpus_label),1)]
list_label  = [list_corpus_label[i][1] for i in range(0, len(list_corpus_label),1)]

list_corpus_cleaned,list_label_cleaned = clean_empty_text(list_corpus, list_label)


In [None]:
test_matrix = build_embedding_matrix(model_w2vec, list_corpus_cleaned)

In [None]:
y_predict = lr.predict(test_matrix)

In [None]:
from sklearn.metrics import r2_score
r2_score(list_label_cleaned, y_predict)

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

rmse = np.sqrt(mean_squared_error(list_label_cleaned, y_predict))
print("RMSE for Linear Regression: %.4f" % rmse)

## Using MeanW2VecEmbedding class

In [None]:
X_train = X_train.tolist()
y_train = y_train.tolist()

In [None]:
epochs=50
model_filename = './data/model_w2vec_'+str(epochs)+'_epochs'

w2vec = get_trained_w2vec(model_filename, X_train, epochs)

In [None]:
w2vec.wv.vocab

In [None]:
meanW2VecEmbedding =  MeanW2VecEmbedding(w2vec)

In [None]:
X = X_train
y = y_train
range_index = range(0, len(X),1)

list_X_y = [ (gensim.utils.simple_preprocess(X[index], deacc=False, min_len=2), y[index]) \
            for index in range_index]
X = [list_X_y[i][0] for i in range(0, len(list_X_y),1)]
y = [list_X_y[i][1] for i in range(0, len(list_X_y),1)]
list_X, list_y = clean_X_label(X,y)


In [None]:
w2vec.wv.vocab

In [None]:
#list_X[0]
i=0
np.mean( [w2vec.wv[word] for word in list_X[i]  if word in w2vec.wv.vocab], axis=0 )
word = list_X[i][1]
w2vec.wv[word]

In [None]:
X = build_embedding_matrix(w2vec, list_X)

In [None]:
X_, y_ = meanW2VecEmbedding.transform(X_train, y_train)

X_.shape, y_.shape

In [None]:
X_test, y_test = meanW2VecEmbedding.transform(X_test, y_test)

In [None]:
y_test.shape

In [None]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [None]:
y_predict = lr_model.predict(X_test[:3000])

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

rmse = np.sqrt(mean_squared_error(y_test[:3000], y_predict))
print("RMSE for Linear Regression: %.4f" % rmse)

In [None]:
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression

pipeline_lr = Pipeline([
    ('meanW2VecEmbedding', MeanW2VecEmbedding(w2vec)),
    ('lr', LinearRegression()),
])


In [None]:
pipeline_lr = pipeline_lr.fit(X_train, y_train)

#### Get dictionary where each word is a key from corpus and each of the value is the word vector.

In [None]:
dict_word_vector = dict(zip(model_w2vec.wv.index2word, model_w2vec.wv.vectors))

In [None]:
for key, value in dict_word_vector.items() :
    print("Word2Vec sampling: word= '{}' / Vector length= {}".format(key,len(value)))
    break

#### Also, a vector may be retrieved from any word with operation : `word_vector = model_w2vec[word]`

In [None]:
vocab_size = len(model_w2vec.wv.vocab)
print("Vocabulary size= {}".format(vocab_size))

In [None]:
from keras.preprocessing.sequence import pad_sequences
sequences = [
[1, 2, 3, 4],
   [1, 2, 3],
     [1]]
# pad sequence
padded = pad_sequences(sequences, padding='post')
print(padded)

####  Building embedding matrix for each text

In [None]:
keras_embedding = model_w2vec.wv.get_keras_embedding(train_embeddings=False)

In [None]:
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Flatten, Dense, Dropout, Embedding

vocab_size = len(model_w2vec.wv.vocab)
embedding_dim = model_w2vec.wv.vectors.view().shape[1]

max_len = 0
for i, j in ser_corpus.items():
    max_len = max(len(j),max_len)
max_len    

model = Sequential()
if False :
    model.add(Embedding(vocab_size, 
                        embedding_dim, 
                        input_length=max_len, 
                        weights = keras_embedding.get_weights(), 
                        trainable = False))
else :
    model.add(keras_embedding)

model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(3))

model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(3))

model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(3))

if False :
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(20, activation='softmax'))
model.summary()

In [None]:
model.add(Flatten())

#### Make memory more efficient; this implies no more training.

In [None]:
model_w2vec.init_sims(replace=True)

In [None]:
model_w2vec.most_similar(positive=['remember'], topn=5)

In [None]:
model_w2vec.predict_output_word(['gaf'], topn=10)

In [None]:
list_corpus[1]

In [None]:
import numpy as np

list_array_cnn_input = [[model_w2vec[word] for word in list_word] for list_word in list_corpus]
array_array_cnn_input = np.array(list_array_cnn_input)

In [None]:
type(array_array_cnn_input)

In [None]:
print(len(array_array_cnn_input))
array_array_cnn_input[10][100]

In [None]:
help(model_w2vec.wv)

In [None]:
help(model_w2vec)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier, LinearRegression
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor

pipeline_gbr = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('scale', preprocessing.StandardScaler(with_mean=False)),
    ('gbr', GradientBoostingRegressor()),
])


In [None]:
model = pipeline_gbr.fit(X_train, y_train)

In [None]:
help(TfidfTransformer)

In [None]:
import p6_util


In [None]:
help(p6_util.get_list_tag_stat_tfidf)

In [None]:
import lightgbm as lgb

In [None]:
import xgboost as xgb

In [None]:
help(xgb)