## Exploring the data

In [3]:
import pandas as pd

# Load the labelled URLs from disk; the column names are passed explicitly via `names`.
df = pd.read_csv('urls.csv', names=['url', 'type'])
df.head()

FileNotFoundError: File b'urls.csv' does not exist

Our data set contains urls from [kino.to](http://kino.de) and [imdb](https://www.imdb.com).
Each instance is assigned to a type of content.

Let's inspect the types in more detail...

In [None]:
# Summary statistics for the loaded frame.
df.describe()

The data set contains **11 unique urls**. The urls are assigned to **2 different types** of content.

In [None]:
# List the distinct values found in the 'type' column.
df['type'].unique()

## Feature Extraction

Actually our urls are just strings.
We want to give this string some semantic meaning.

A url contains many different parts:
* protocol
* hostname
* port
* path
* query
* ....

Let's try to design a transformer to strip down these strings....

In [None]:
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from urllib.parse import urlparse

class UrlDecomposer(BaseEstimator, TransformerMixin):
    """Stateless transformer that splits URL strings into their components.

    Each input URL is parsed with ``urllib.parse.urlparse`` and mapped to one
    record with the fields ``domain``, ``path``, ``port`` and ``params``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, urls):
        """Decompose every URL in ``urls``.

        Parameters
        ----------
        urls : sized iterable of str
            The URLs to decompose (must support ``len()`` and iteration).

        Returns
        -------
        numpy.recarray
            One record per URL with the object-typed fields ``domain``,
            ``path``, ``port`` and ``params``. ``domain`` is always a string
            (empty when the URL has no host part); ``port`` is an ``int`` or
            ``None`` when absent or malformed.
        """
        features = np.recarray(
            shape=(len(urls),),
            dtype=[('domain', object), ('path', object),
                   ('port', object), ('params', object)])

        for i, value in enumerate(urls):
            parsed = urlparse(value)

            # `hostname` is None when the URL has no network location
            # (e.g. a scheme-less "example.com/foo"). Text vectorizers further
            # down the pipeline expect strings, so fall back to ''.
            features['domain'][i] = parsed.hostname or ''
            features['path'][i] = parsed.path

            # Reading `port` raises ValueError for a malformed port; one bad
            # URL should not abort the whole batch.
            try:
                features['port'][i] = parsed.port
            except ValueError:
                features['port'][i] = None

            features['params'][i] = parsed.query

        return features
    
    
# Run the decomposer on the 'url' column and display the resulting record array.
decomposer = UrlDecomposer()
decomposed = decomposer.fit_transform(df['url'])
decomposed

For a better presentation, wrap the result in a pandas DataFrame.

In [None]:
# One DataFrame column per field of the record array.
composed_df = pd.DataFrame(data=decomposed, columns=decomposed.dtype.names)
composed_df

Create count vector for features

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# analyse words and build count vectors for 1- and 2-grams
domain_vector_transformer = CountVectorizer(analyzer='word', ngram_range=(1, 2))

# build vector from 'domain'-Series
# (the vectorizer is named `domain_vector_transformer`; the fit and the
# column lookup below must both go through that same object)
sparse_matrix = domain_vector_transformer.fit_transform(composed_df['domain'])

# NOTE(review): scikit-learn 1.0 deprecated get_feature_names() in favour of
# get_feature_names_out(); switch if this notebook is run on a newer release.
domain_vector_df = pd.DataFrame(data=sparse_matrix.toarray(),
                                columns=domain_vector_transformer.get_feature_names())
domain_vector_df

Building vector for the path

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# build path fragments
def path_tokenizer(text):
    """Split ``text`` on "/" and return the list of resulting fragments."""
    return text.split("/")

# strip slashes left and right
# lower case the path
def path_preprocess(path):
    """Normalise a URL path before tokenisation.

    Surrounding whitespace is removed first, then leading/trailing slashes,
    and finally the result is lower-cased.
    """
    trimmed = path.strip()
    without_outer_slashes = trimmed.strip('/')
    return without_outer_slashes.lower()

# Vectorizer for paths: the custom preprocessor/tokenizer turn a path into
# "/"-separated fragments, counted as single fragments and adjacent pairs.
path_vector_transformer = CountVectorizer(
    analyzer='word',
    tokenizer=path_tokenizer,
    preprocessor=path_preprocess,
    ngram_range=(1, 2),
)

# Fit on the 'path' column and obtain the sparse count matrix.
sparse_matrix = path_vector_transformer.fit_transform(composed_df['path'])

# Densify for display, labelling each column with its n-gram.
path_vector_df = pd.DataFrame(
    data=sparse_matrix.toarray(),
    columns=path_vector_transformer.get_feature_names(),
)
path_vector_df.head()

Build a pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks a single entry out of its input by key.

    The input only needs to support ``data[key]`` lookup.
    """
    
    def __init__(self, key):
        # key used for the lookup performed in transform()
        self.key = key
    
    def fit(self, X, y=None):
        """No-op; returns the transformer itself."""
        return self
    
    def transform(self, data_dict):
        """Return ``data_dict[self.key]``."""
        return data_dict[self.key]

In [None]:
from sklearn.pipeline import Pipeline

# Word-level vectorizer producing counts for unigrams and bigrams.
domain_vector_transformer = CountVectorizer(ngram_range=(1, 2), analyzer='word')

domain_pipeline = Pipeline([
    # decompose the urls
    ('urldecomposing', UrlDecomposer()),
    ('domain_pipeline', Pipeline([
        # select the domain feature
        ('select_domain', FeatureSelector('domain')),
        # build the vector from this feature
        ('vect_domain', domain_vector_transformer),
    ])),
])

# Run the raw urls through the whole pipeline.
sparse_matrix = domain_pipeline.fit_transform(df['url'])

# Densify for display, labelling each column with its n-gram.
domain_vector_df = pd.DataFrame(
    data=sparse_matrix.toarray(),
    columns=domain_vector_transformer.get_feature_names(),
)
domain_vector_df.head()

## Putting all together

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

def path_tokenizer(text):
    """Split ``text`` on "/" and return the list of resulting fragments."""
    return text.split("/")

def path_preprocess(path):
    """Normalise a URL path before tokenisation.

    Surrounding whitespace is removed first, then leading/trailing slashes,
    and finally the result is lower-cased.
    """
    trimmed = path.strip()
    without_outer_slashes = trimmed.strip('/')
    return without_outer_slashes.lower()

# define vectorizers
# domain: default word tokenization, counts of unigrams and bigrams
domain_vector_transformer = CountVectorizer(analyzer='word',
                                            ngram_range=(1, 2))

# path: custom preprocessor/tokenizer so that tokens are "/"-separated fragments
path_vector_transformer = CountVectorizer(analyzer='word',
                                tokenizer=path_tokenizer,
                                preprocessor=path_preprocess,
                                ngram_range=(1, 2))

# define pipeline:
#   1. decompose each url into its parts
#   2. vectorize 'domain' and 'path' separately and combine them via FeatureUnion
#   3. feed the combined features to a k-nearest-neighbours classifier (k=2)
main_pipeline = Pipeline([
    ('url_decomposer', UrlDecomposer()),
    ('feature_union', FeatureUnion([
        ('domain', Pipeline([
            ('select_domain', FeatureSelector(key='domain')),
            ('domain_vector', domain_vector_transformer)
        ])),
        ('path', Pipeline([
            ('select_path', FeatureSelector(key='path')),
            ('path_vector', path_vector_transformer)
        ]))
    ])),
    ('classifier', KNeighborsClassifier(n_neighbors=2))
])

# read data
df = pd.read_csv('urls.csv', names=['url', 'type'])

# split data into train & test data (random_state is fixed)
train, test= train_test_split(df, random_state=0)

# train the model on the urls with their types as labels
main_pipeline.fit(train['url'], train['type'])

# predict unknown values from test data set
result = main_pipeline.predict(test['url'])


# show the predictions together with the actual types and the urls
# (one row per quantity: "predicted", "actual", "url")
pd.DataFrame(data=[result, test['type'], test['url']], index=["predicted", "actual", "url"])