In [58]:
import pyspark as ps
import numpy as np
import pandas as pd
import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec

from functools import reduce

from __future__ import print_function

# Copy data from Cloud Storage to the cluster's HDFS

In [None]:
# Copy every ukwiki_*.csv file from the gs:// source to the hdfs:// target with `hadoop distcp`.
# NOTE(review): the bucket name in `gs:///` and the path after `hdfs://` are empty here —
# presumably redacted; confirm the real locations before running.
!hadoop distcp gs:///ukwiki_*.csv hdfs://

# Initialize Spark

In [59]:
# Obtain the notebook-wide session: reuse an existing one if present,
# otherwise create it with master "local" and application name "WikiParse".
spark = (
    SparkSession.builder
    .master("local")
    .appName("WikiParse")
    .getOrCreate()
)

# Download stop words and service words

In [None]:
# Read both word-list CSVs through Spark and pull each one to the driver as a
# pandas DataFrame. No header option is set, so the column used by the next
# cell is `_c0`.
sw_df = (
    spark.read
    .csv("gs:///stop_words.csv")
    .toPandas()
)
sr_df = (
    spark.read
    .csv("gs:///service_words.csv")
    .toPandas()
)

In [None]:
# Flatten the `_c0` column of each pandas frame into a plain Python list;
# these lists are what the preprocessing pipeline receives.
stop_words = sw_df.loc[:, '_c0'].tolist()
service_words = sr_df.loc[:, '_c0'].tolist()

# Read all of the training data

In [None]:
def read_batch(offset, limit):
    """Load the CSV shards numbered ``offset`` .. ``limit - 1`` from HDFS.

    Shard ``i`` is read from ``hdfs:///ukwiki_<i>.csv`` with the ``header``
    option set to ``"true"`` and the ``encoding`` option set to ``"UTF-8"``.
    A progress line is printed before each shard is loaded.

    :param offset: index of the first shard to read (inclusive).
    :param limit: index at which to stop (exclusive).
    :return: list of the loaded DataFrames in index order; empty when
        ``offset >= limit``.
    """
    def load_shard(index):
        # Announce the shard first, then build a fresh reader for it.
        print("Downloading fraction number {}...".format(index))
        shard_path = "hdfs:///ukwiki_" + str(index) + ".csv"
        reader = spark.read.format("csv")
        reader = reader.option("header", "true")
        reader = reader.option("encoding", "UTF-8")
        return reader.load(shard_path)

    return [load_shard(index) for index in range(offset, limit)]

In [None]:
# Combine every shard (files 0 through 90) into a single DataFrame by
# chaining DataFrame.union over the list, left to right.
shard_frames = read_batch(0, 91)
fractions = shard_frames[0]
for shard_frame in shard_frames[1:]:
    fractions = fractions.union(shard_frame)

# Create the training pipeline for the word2vec model

In [None]:
class Pipeline(object):
    """Clean and tokenize article text, then train Word2Vec models on it.

    Call order: construct an instance (directly or via ``from_batch``), run
    ``preprocess()`` once to build ``vector_df``, then call ``fit()`` for
    every model that is needed.
    """

    def __init__(self, df, stop_words, service_words):
        """Store the inputs; no Spark work is triggered here.

        :param df: Spark DataFrame; ``preprocess`` reads its ``Title`` and
            ``Text`` columns.
        :param stop_words: list of tokens dropped by the first removal pass.
        :param service_words: list of tokens dropped by the second removal pass.
        """
        self.stop_words = stop_words
        self.service_words = service_words
        self.df = df
        # Filled in by preprocess(); stays None until then.
        self.vector_df = None

    def fit(self, sample=1):
        """Train a Word2Vec model (vectorSize=100, seed=42) on ``vector_df``.

        :param sample: fraction of rows to train on, in the range (0, 1].
            When it equals 1 the whole DataFrame is used; otherwise the rows
            are sampled without replacement using seed 0.
        :return: the fitted model returned by ``Word2Vec.fit``.
        :raises RuntimeError: if ``preprocess()`` has not been run yet.
        :raises ValueError: if ``sample`` is outside (0, 1].
        """
        if self.vector_df is None:
            raise RuntimeError("preprocess() must be called before fit()")
        # A fraction of 0 (e.g. from integer division in the caller) would
        # otherwise be passed straight through to DataFrame.sample.
        if not 0 < sample <= 1:
            raise ValueError(
                "sample must be in the range (0, 1], got {!r}".format(sample))

        word2Vec = Word2Vec(vectorSize=100, seed=42, inputCol='text', outputCol='model')

        if sample == 1:
            return word2Vec.fit(self.vector_df)

        print("Start sempling")
        data = self.vector_df.sample(False, sample, seed=0)
        print("End sempling")

        return word2Vec.fit(data)

    def preprocess(self):
        """Clean, tokenize and filter the text, storing the result in ``vector_df``.

        Steps, in order: five regex replacements on ``Text``, lower-casing and
        trimming of ``Title`` and ``Text``, tokenization of ``Text``, removal
        of ``stop_words`` and then of ``service_words``. After this call
        ``vector_df`` holds the single column ``text`` consumed by ``fit``.

        Side effect: ``show(5)`` is called after tokenization and after each
        removal pass.
        """
        # Raw strings keep the backslashes exactly as the regex engine must
        # see them, without relying on Python passing unknown escapes through.
        latin_digits_punct = r'[§»«·&\~.a-zA-Z^=\-"<>!?:;{}()\[\]/|%0-9\\+\*#_]+'
        triple_apostrophe = "'{3}"
        dash_chars = '[—−]'
        not_cyrillic_or_space = r'[^а-яА-ЯіІіІєЄҐґїЇ\s]'
        whitespace_run = r'\s+'

        # clean data
        df_trip = (
            self.df.select(['Title', 'Text'])
            # runs of Latin letters, digits and the listed symbols -> one space
            .withColumn('Text', regexp_replace('Text', latin_digits_punct, ' '))
            # three consecutive apostrophes -> one space
            .withColumn('Text', regexp_replace('Text', triple_apostrophe, ' '))
            # dash characters -> one space
            .withColumn('Text', regexp_replace('Text', dash_chars, ' '))
            # drop everything that is not a listed Cyrillic letter or whitespace
            .withColumn('Text', regexp_replace('Text', not_cyrillic_or_space, ''))
            # collapse whitespace runs to a single space
            .withColumn('Text', regexp_replace('Text', whitespace_run, ' '))
            .select([trim(lower(col('Title'))).alias('Title'),
                     trim(lower(col('Text'))).alias('Text')])
        )

        # tokenize data; the output column is spelled exactly as it is
        # selected below, so this does not depend on case-insensitive
        # column resolution
        tokenizer = Tokenizer(inputCol="Text", outputCol="vector")
        self.vector_df = tokenizer.transform(df_trip).select("vector")
        self.vector_df.show(5)

        # remove stop words
        self.vector_df = self.__remove_stop_words("vector", "vector_no_stopwords", self.stop_words)

        # remove service words
        self.vector_df = self.__remove_stop_words("vector_no_stopwords", "text", self.service_words)

    def __remove_stop_words(self, in_col, out_col, words_list):
        """Filter ``words_list`` out of ``in_col`` of the current ``vector_df``.

        :param in_col: name of the token column to filter.
        :param out_col: name of the column that receives the filtered tokens.
        :param words_list: tokens to remove.
        :return: DataFrame containing only ``out_col``; ``show(5)`` is called
            on it before returning.
        """
        remover = StopWordsRemover(inputCol=in_col, outputCol=out_col, stopWords=words_list)

        vector_no_stopw_df = remover.transform(self.vector_df).select(out_col)
        vector_no_stopw_df.show(5)

        return vector_no_stopw_df

    @classmethod
    def from_batch(cls, batch, stop_words, service_words):
        """Build a pipeline for ``batch``; same arguments as the constructor."""
        return cls(batch, stop_words, service_words)

In [None]:
# Wrap the combined DataFrame in a pipeline and run the cleaning/tokenizing
# step right away so that p.vector_df is populated for the training cell.
p = Pipeline.from_batch(
    batch=fractions,
    stop_words=stop_words,
    service_words=service_words,
)
p.preprocess()

# Train word2vec models on increasing fractions of the dataset

In [None]:
for i in range(1, 101, 5):
    fraction = i/100
    print("Training {} fraction".format(fraction))
    model = p.fit(sample=i/100)
    print("Saving word2vec of {} fraction".format(fraction))
    model.write().save("hdfs:///w2v/word2vec_{}".format(i))
    print("Copying word2vec model to gcloud storage")
    !eval {"hadoop distcp hdfs:///w2v/word2vec_{i} gs:///w2v/word2vec_{i}".format(i=i)}