# From Raw Data to Predictions

This notebook is designed as a follow-up to the [XXXXXXX]() course on Datacamp. We won't explain all the tools and techniques we use here, so if have completed this course, everything should make sense.

Next steps

In [1]:
%matplotlib inline
from __future__ import division
from __future__ import print_function

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import os
import sys

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

from data.multilabel import multilabel_sample_dataframe, multilabel_train_test_split
from features.SparseInteractions import SparseInteractions
from models.metrics import multi_multi_log_loss

# Load Data



In [4]:
path_to_training_data = os.path.join(os.pardir,
                                     'data',
                                     'TrainingSet.csv')

df = pd.read_csv(PATH_TO_TRAINING_DATA, index_col=0)

In [5]:
# Sample Data

In [6]:
LABELS = ['Function',
          'Use',
          'Sharing',
          'Reporting',
          'Student_Type',
          'Position_Type',
          'Object_Type', 
          'Pre_K',
          'Operating_Status']

NON_LABELS = [c for c in df.columns if c not in LABELS]

SAMPLE_SIZE = 10000

sampling = multilabel_sample_dataframe(df,
                                       pd.get_dummies(df[LABELS]),
                                       size=SAMPLE_SIZE,
                                       min_count=25,
                                       seed=43)

dummy_labels = pd.get_dummies(sampling[LABELS])
X_train, X_test, y_train, y_test = multilabel_train_test_split(sampling[NON_LABELS],
                                                               dummy_labels,
                                                               0.2,
                                                               min_count=3,
                                                               seed=43)

In [7]:
# Visualization

In [8]:
# Model

In [9]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_selection import chi2, SelectKBest

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import Imputer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MaxAbsScaler

from sklearn.model_selection import GridSearchCV

In [10]:
NUMERIC_COLUMNS = ['FTE', "Total"]

TOKENS_BASIC = '\\S+(?=\\s+)'  # tokens can have any non-whitespace character
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'  # tokens are only alpha-numeric

def combine_text_columns(data_frame, to_drop=NUMERIC_COLUMNS + LABELS):
    """ Takes the dataset as read in, drops the non-feature, non-text columns and
        then combines all of the text columns into a single vector that has all of
        the text for a row.
        
        :param data_frame: The data as read in with read_csv (no preprocessing necessary)
        :param to_drop (optional): Removes the numeric and label columns by default.
    """
    # drop non-text columns that are in the df
    to_drop = set(to_drop) & set(data_frame.columns.tolist())
    text_data = data_frame.drop(to_drop, axis=1)
    
    # replace nans with blanks
    text_data.fillna("", inplace=True)
    
    # joins all of the text items in a row (axis=1)
    # with a space in between
    return text_data.apply(lambda x: " ".join(x), axis=1)


In [11]:
from sklearn.preprocessing import FunctionTransformer

dummy_labels = pd.get_dummies(sampling[LABELS])
X_train, X_test, y_train, y_test = multilabel_train_test_split(sampling[NON_LABELS],
                                                               dummy_labels,
                                                               0.2,
                                                               min_count=3,
                                                               seed=43)
# changed for actual text pre-processing
get_text_data = FunctionTransformer(combine_text_columns, validate=False)

get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)

In [12]:
from sklearn.metrics.scorer import make_scorer


log_loss_scorer = make_scorer(multi_multi_log_loss)


In [13]:
chi_k = 300

pl3 = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                     non_negative=True, norm=None, binary=False,
                                                     ngram_range=(1, 2))),
                    ('dim_red', SelectKBest(chi2, chi_k))
                ]))
             ]
        )),
        ('int', SparseInteractions(degree=2)),
        ('scale', MaxAbsScaler()),
        ('clf', GridSearchCV(
                    OneVsRestClassifier(
                        LogisticRegression()
                    ),
                    scoring=log_loss_scorer,
                    param_grid=dict(estimator__C=[0.1, 1])
                 )
        )
    ])


pl3.fit(X_train, y_train.values)
pl3.score(X_test, y_test.values)

1.9849517886609855

In [119]:
sample_df.label.value_counts()

b    500
a    500
Name: label, dtype: int64

In [120]:
import numpy as np
import pandas as pd

rng = np.random.RandomState(123)

SIZE = 1000

sample_data = {
    'numeric': rng.normal(0, 10, size=SIZE),
    'text': rng.choice(['', 'foo', 'bar', 'foo bar', 'bar foo'], size=SIZE),
    'with_missing': rng.normal(loc=3, size=SIZE)
}

sample_df = pd.DataFrame(sample_data)

sample_df.loc[rng.choice(sample_df.index, size=np.floor_divide(sample_df.shape[0], 5)), 'with_missing'] = np.nan

foo_values = sample_df.text.str.contains('foo') * 10
bar_values = sample_df.text.str.contains('bar') * -25
no_text = ((foo_values + bar_values) == 0) * 1

val = 2 * sample_df.numeric + -2 * (foo_values + bar_values + no_text) + 4 * sample_df.with_missing.fillna(3)
val += rng.normal(0, 8, size=SIZE)

sample_df['label'] = np.where(val > np.median(val), 'a', 'b')

# Import splitting and pipeline objects from sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Import elements for simple pipeline from sklearn 
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

SPLIT_SEED = 22

# Split and select numeric data only, no nans 
X_train, X_test, y_train, y_test = train_test_split(sample_df[['numeric']],
                                                    pd.get_dummies(sample_df['label']), 
                                                    random_state=SPLIT_SEED)

# Instantiate Pipeline object as pl
pl = Pipeline([
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

# Fit pipeline on sample data training splits
pl.fit(X_train, y_train)

# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
accuracy

0.62

In [121]:

from sklearn.preprocessing import Imputer

# just numeric data
X_train, X_test, y_train, y_test = train_test_split(sample_df[['numeric', 'with_missing']],
                                                    pd.get_dummies(sample_df['label']),
                                                    random_state=SPLIT_SEED)


pl = Pipeline([
        ('imp', Imputer()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

pl.fit(X_train, y_train)
pl.score(X_test, y_test)

0.62

In [122]:
from sklearn.feature_extraction.text import CountVectorizer

# just text data
X_train, X_test, y_train, y_test = train_test_split(sample_df['text'],
                                                    pd.get_dummies(sample_df['label']),
                                                    random_state=SPLIT_SEED)


pl = Pipeline([
        ('vec', CountVectorizer()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

pl.fit(X_train, y_train)
pl.score(X_test, y_test)

0.81599999999999995

In [123]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion

# now, with all data
X_train, X_test, y_train, y_test = train_test_split(sample_df[['numeric', 'with_missing', 'text']],
                                                    pd.get_dummies(sample_df['label']),
                                                    random_state=SPLIT_SEED)

# simple selector transforms
get_text_data = FunctionTransformer(lambda x: x['text'], validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[['numeric', 'with_missing']], validate=False)

pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer())
                ]))
             ]
        )),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

pl.fit(X_train, y_train)
pl.score(X_test, y_test)

0.92800000000000005