## From interactive programming to production-ready code

### Imports

In [1]:
from luigi.contrib.external_program import ExternalProgramTask
from luigi.contrib.spark import PySparkTask
from luigi.parameter import IntParameter, Parameter
from luigi import LocalTarget, Task
import luigi

In [2]:
# Make the standard library's default HTTPS context factory return an
# *unverified* context, i.e. TLS certificates are no longer checked for
# code that relies on that default.
# NOTE(review): this is a process-wide security downgrade; presumably it is
# only here so that downloads (e.g. nltk.download) work on machines with a
# broken certificate store -- confirm before shipping this to production.
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build has no ssl._create_unverified_context; leave the
    # default context factory untouched.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

## Task No.1: Download the dataset

We want to download the dataset using "curl".
Luigi provides a base class named **ExternalProgramTask** for running external programs. 
It simply calls the external program with the provided command-line arguments. The output target can be referenced through *self.output()*.

*Input*: Nothing required <br>
*Output*: Downloaded dataset

In [3]:
class DownloadDataset(ExternalProgramTask):
    """Download the dataset archive with ``curl``.

    Input: nothing.
    Output: the archive at ``../dataset/<dataset_name>.<file_fomat>``.
    """

    # Name of the dataset; used for the remote file name and the local one.
    dataset_name = Parameter(default="reddit_ds_got")

    # Remote directory the archives are served from.
    base_url = "http://plainpixels.work/resources/datasets"
    # File extension of the archive.
    # NOTE: the attribute name is misspelled ("fomat"); it is kept unchanged
    # so that existing references and overrides keep working.
    file_fomat = "zip"

    def output(self):
        """Return the local target the archive is written to."""
        return LocalTarget("../dataset/%s.%s" % (self.dataset_name, self.file_fomat))

    def program_args(self):
        """Build the ``curl`` command line that fetches the archive.

        Returns:
            list of str: the argument vector executed by
            ``ExternalProgramTask``.
        """
        target = self.output()
        # Create the parent directory of the target up front, the same way
        # ExtractDataset does, so curl can open the output file.
        target.makedirs()
        url = "%s/%s.%s" % (self.base_url,
                            self.dataset_name,
                            self.file_fomat)
        return ["curl",
                # Exit non-zero on HTTP errors (4xx/5xx). Without this curl
                # saves the server's error page as the "dataset" and exits 0,
                # so Luigi would consider the task successfully completed.
                "--fail",
                "-L",
                "-o", target.path,
                url]

In [4]:
# Run DownloadDataset from within this process, using Luigi's local
# (in-process) scheduler instead of a central scheduler daemon.
luigi.build([DownloadDataset()], local_scheduler=True, no_lock=True)

DEBUG: Checking if DownloadDataset(dataset_name=reddit_ds_got) is complete
INFO: Informed scheduler that task   DownloadDataset_reddit_ds_got_aed65dfc23   has status   DONE
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There are no more tasks to run at this time
INFO: Worker Worker(salt=730457980, workers=1, host=Marks-MacBook-Pro-2.local, username=markkeinhorster, pid=5129) was stopped. Shutting down Keep-Alive thread
INFO: 
===== Luigi Execution Summary =====

Scheduled 1 tasks of which:
* 1 present dependencies were encountered:
    - 1 DownloadDataset(dataset_name=reddit_ds_got)

Did not run any tasks
This progress looks :) because there were no failed tasks or missing external dependencies

===== Luigi Execution Summary =====



True

## Task No.2: Extract the dataset

Just as before, we use **ExternalProgramTask** to unzip the archive. The major difference is that **ExtractDataset** now implements *requires(...)* and links to **DownloadDataset** as a dependency. The required target can be referenced through *self.input()*.

*Input*: DownloadDataset <br>
*Output*: A folder containing the raw data

In [5]:
class ExtractDataset(ExternalProgramTask):
    """Unpack the downloaded dataset archive with ``unzip``.

    Input: the archive produced by ``DownloadDataset``.
    Output: the directory ``datasets/<dataset_name>/raw``.
    """

    # Name of the dataset; forwarded to the download task.
    dataset_name = Parameter(default="reddit_ds_got")

    def requires(self):
        """Depend on the download of the same dataset."""
        return DownloadDataset(self.dataset_name)

    def output(self):
        """Return the directory the archive is extracted into."""
        return LocalTarget("datasets/%s/raw" % self.dataset_name)

    def program_args(self):
        """Build the ``unzip`` command line.

        Returns:
            list of str: the argument vector executed by
            ``ExternalProgramTask``.
        """
        destination = self.output()
        # Make sure the parent directories of the destination exist.
        destination.makedirs()
        archive = self.input()

        command = ["unzip", "-u", "-q"]
        command.extend(["-d", destination.path])
        command.append(archive.path)
        return command

In [6]:
# Run ExtractDataset (and, if still missing, the download it requires) from
# within this process, using Luigi's local (in-process) scheduler.
luigi.build([ExtractDataset()], local_scheduler=True, no_lock=True)

DEBUG: Checking if ExtractDataset(dataset_name=reddit_ds_got) is complete
INFO: Informed scheduler that task   ExtractDataset_reddit_ds_got_aed65dfc23   has status   DONE
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There are no more tasks to run at this time
INFO: Worker Worker(salt=557753216, workers=1, host=Marks-MacBook-Pro-2.local, username=markkeinhorster, pid=5129) was stopped. Shutting down Keep-Alive thread
INFO: 
===== Luigi Execution Summary =====

Scheduled 1 tasks of which:
* 1 present dependencies were encountered:
    - 1 ExtractDataset(dataset_name=reddit_ds_got)

Did not run any tasks
This progress looks :) because there were no failed tasks or missing external dependencies

===== Luigi Execution Summary =====



True

## Task No.3: Clean the dataset

The cleaning task tokenizes the posts, removes stop words, and stems the remaining words.

*Input*: ExtractDataset <br>
*Output*: A cleaned version of reddit posts

In [8]:
class Clean(Task):
    """Tokenize, stop-word-filter and stem the raw reddit posts.

    Input: the directory produced by ``ExtractDataset``.
    Output: ``datasets/<dataset_name>/cleaned/cleaned.csv`` with the columns
    ``cleaned_words`` and ``subreddit``.
    """
    # NOTE: the import and the downloads below run as soon as the class body
    # is executed, i.e. when the defining module/cell is loaded.
    import nltk
    # NOTE(review): nothing in this class appears to use 'punkt' (the
    # tokenizer below is regex based) -- confirm before removing it.
    nltk.download('punkt')
    nltk.download('stopwords')

    # Name of the dataset; forwarded to the extraction task.
    dataset_name = Parameter(default="reddit_ds_got")
    # Location of the raw CSV, relative to the extracted dataset directory.
    training_data = "training/data.csv"

    # The tokenizer in use: splits the text into runs of word characters.
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

    # The list of English stop words that get filtered out.
    stopwords = nltk.corpus.stopwords.words('english')

    # The stemmer for English words.
    stemmer = nltk.SnowballStemmer("english")

    def requires(self):
        """Depend on the extraction of the same dataset."""
        return ExtractDataset(self.dataset_name)

    def output(self):
        """Return the local target that holds the cleaned CSV."""
        return LocalTarget("datasets/%s/cleaned/cleaned.csv" % self.dataset_name)

    def run(self):
        """Read the raw posts, clean every row and write the result as CSV."""
        import pandas
        data = "%s/%s" % (self.input().path, self.training_data)
        # Missing cells become '' so title/selftext can be concatenated.
        dataset = pandas.read_csv(data, encoding='utf-8', sep=';').fillna('')
        # Build the stop-word set once per run: membership tests against the
        # plain list would rescan it for every token of every post.
        stopword_set = frozenset(self.stopwords)
        dataset["cleaned_words"] = dataset.apply(
            self.clean_words, axis=1, stopwords=stopword_set)
        with self.output().open("w") as out:
            dataset[["cleaned_words", "subreddit"]].to_csv(out,  encoding='utf-8', index=False, sep=';')

    def clean_words(self, post, stopwords=None):
        """Return the cleaned text of a single post.

        The title and the self text are joined, tokenized, lower-cased,
        filtered against the stop words and stemmed.

        Args:
            post: a row (mapping) providing the string fields ``title`` and
                ``selftext``.
            stopwords: optional container of words to drop; defaults to
                ``self.stopwords``. ``run`` passes a frozenset so each
                lookup is a hash lookup.

        Returns:
            str: the stemmed words, separated by single spaces.
        """
        if stopwords is None:
            stopwords = self.stopwords
        text = post["title"] + " " + post["selftext"]
        lowercase = (word.lower() for word in self.tokenizer.tokenize(text))
        return " ".join(self.stemmer.stem(word)
                        for word in lowercase
                        if word not in stopwords)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/markkeinhorster/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/markkeinhorster/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Run Clean (and any of its still-missing upstream tasks) from within this
# process, using Luigi's local (in-process) scheduler.
luigi.build([Clean()], local_scheduler=True, no_lock=True)

DEBUG: Checking if Clean(dataset_name=reddit_ds_got) is complete
DEBUG: Checking if ExtractDataset(dataset_name=reddit_ds_got) is complete
INFO: Informed scheduler that task   Clean_reddit_ds_got_aed65dfc23   has status   PENDING
INFO: Informed scheduler that task   ExtractDataset_reddit_ds_got_aed65dfc23   has status   DONE
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
DEBUG: Asking scheduler for work...
DEBUG: Pending tasks: 1
INFO: [pid 5129] Worker Worker(salt=352907745, workers=1, host=Marks-MacBook-Pro-2.local, username=markkeinhorster, pid=5129) running   Clean(dataset_name=reddit_ds_got)
INFO: [pid 5129] Worker Worker(salt=352907745, workers=1, host=Marks-MacBook-Pro-2.local, username=markkeinhorster, pid=5129) done      Clean(dataset_name=reddit_ds_got)
DEBUG: 1 running tasks, waiting for next task to finish
INFO: Informed scheduler that task   Clean_reddit_ds_got_aed65dfc23   has status   DONE
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There ar

True

## Task No.4: Train the model

Task No.4 trains the model and persists it to the filesystem.

*Input*: Clean <br>
*Output*: A file representing the model

Execute from the command line:
```bash
PYTHONPATH='.' luigi --module 00_training_pipeline TrainModel --version 1              --local-scheduler
```

In [73]:
class TrainModel(PySparkTask):
    """Train a decision-tree classifier on the cleaned posts and save it.

    Input: the CSV produced by ``Clean``.
    Output: the fitted Spark ML pipeline, saved under ``model/<version>``.
    """

    # Name of the dataset; forwarded to the cleaning task.
    dataset_name = Parameter(default="reddit_ds_got")
    # Version number of the model; determines the output directory.
    version = IntParameter(default=1)

    # PySpark settings
    driver_memory = '1g'
    executor_memory = '2g'
    executor_cores = '2'
    num_executors = '4'
    master = 'local'

    def requires(self):
        """Depend on the cleaned data of the same dataset."""
        return Clean(self.dataset_name)

    def output(self):
        """Return the local target the model is saved to."""
        return LocalTarget("model/%d" % self.version)

    def main(self, sc, *args):
        """Fit the pipeline tokenizer -> hashing TF -> decision tree and save it.

        Args:
            sc: the Spark context handed in by ``PySparkTask``; not used
                directly, the session below is obtained via ``getOrCreate``.
            *args: further positional arguments; ignored.
        """
        from pyspark.ml import Pipeline
        from pyspark.ml.classification import DecisionTreeClassifier
        from pyspark.ml.feature import HashingTF, Tokenizer
        from pyspark.sql.session import SparkSession

        # Obtain the Spark session (with Hive support enabled).
        spark = (
            SparkSession.builder
            .enableHiveSupport()
            .config("hive.exec.dynamic.partition", "true")
            .config("hive.exec.dynamic.partition.mode", "nonstrict")
            .config("hive.exec.max.dynamic.partitions", "4096")
            .getOrCreate()
        )

        # Load the cleaned data (';'-separated CSV with a header row).
        reader = spark.read.format("com.databricks.spark.csv")
        reader = reader.option("header", "true").option("delimiter", ";")
        posts = reader.load(self.input().path)

        # Label column: the boolean result of the LIKE match, cast to double.
        is_datascience = posts.subreddit.like("datascience").cast("double")
        labeled = posts.withColumn("label", is_datascience)

        # Only the 80 % part is used for fitting; the 20 % part is not used.
        train_set, _ = labeled.randomSplit([0.8, 0.2])

        # Train the classifier and persist the fitted pipeline.
        stages = [
            Tokenizer(inputCol="cleaned_words", outputCol="tokenized"),
            HashingTF(numFeatures=1000, inputCol="tokenized", outputCol="features"),
            DecisionTreeClassifier(),
        ]
        fitted = Pipeline(stages=stages).fit(train_set)
        fitted.save(self.output().path)