In [None]:
#Build a machine learning algorithm that can automate the process (Supervised Learning)
#Use correctly labelled data to build an algorithm that can suggest labels for unlabelled lines
#Classification problem
#Predictions will be probabilities for each label (between 0 and 1)

#Budget data: line-item: 'Algebra books for 8th grade student'; labels: 'textbooks', 'math', 'middle school'

In [None]:
#Exploring the data
# pandas must be bound to `pd` (it was aliased to `np`, leaving `pd` undefined);
# numpy is imported here as `np` because the log-loss helper further down uses it.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the labelled training data
df = pd.read_csv('TrainingData.csv')

# Column overview (info() prints by itself) and summary statistics.
# describe() is printed explicitly: a bare expression in the middle of a
# cell is evaluated but never shown.
df.info()
print(df.describe())

# Histogram of the FTE column, ignoring missing values
plt.hist(df['FTE'].dropna())

# Title and axis labels are set on the current axes in one call
plt.gca().set(
    title='Distribution of %full-time \n employee works',
    xlabel='% of full-time',
    ylabel='num employees',
)

# Render the figure
plt.show()

#Datatypes
# `dtypes` is a property, not a method; calling it raises TypeError.
# Printed explicitly so it shows up even though it is not the last line of the cell.
print(df.dtypes)

#Encode strings as categories
# The original used the placeholder name 'column', which fails unless the
# data really has a column with that name. 'Function' is one of the label
# columns this notebook indexes later, so it is used as the worked example.
example_column = 'Function'
df[example_column] = df[example_column].astype('category')

#Dummy variable encoding
# Double brackets keep a DataFrame so the dummy columns get the
# '<column>_<value>' naming.
dummies = pd.get_dummies(df[[example_column]], prefix_sep = '_')

# Names of the target (label) columns
LABELS = [
    'Function', 'Use', 'Sharing', 'Reporting', 'Student_Type',
    'Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status',
]


def categorise_labels(x):
    """Return ``x`` cast to the pandas 'category' dtype."""
    return x.astype('category')


# Convert every label column (axis=0 -> one call per column)
df[LABELS] = df[LABELS].apply(categorise_labels, axis = 0)

#Counting unique labels
# Number of distinct values in each label column: num_unique_labels
num_unique_labels = df[LABELS].nunique()

# Bar chart, one bar per label column
num_unique_labels.plot.bar()

# Axis labels on the current axes
plt.gca().set(xlabel='Labels', ylabel='Number of unique values')

# Render the figure
plt.show()

In [None]:
#How do we measure if the algorithm works? (Measuring success)

# Computing log loss
def compute_log_loss(predicted, actual, eps = 1e-14):
    """ Computes the logarithmic loss between predicted and
        actual when these are 1D arrays.

        loss = -mean(actual * log(p) + (1 - actual) * log(1 - p))

        :param predicted: The predicted probabilities, floats between 0 and 1
        :param actual: The actual binary labels. Either 0 or 1
        :param eps (optional): log(0) is -inf, so predicted values are clipped
            to the interval [eps, 1 - eps] before taking logarithms.
        :return: The mean log loss as a scalar.
    """
    # Local import: at this point in the notebook `np` is not bound to numpy
    # (numpy is only imported in a later cell).
    import numpy as np

    # Accept plain lists as well as arrays
    predicted = np.clip(np.asarray(predicted, dtype=float), eps, 1 - eps)
    actual = np.asarray(actual)

    # Both terms must sit inside the mean. The original closed np.mean() after
    # the first term and continued on new lines without parentheses, which is
    # a syntax error and not the intended formula.
    loss = -1 * np.mean(actual * np.log(predicted)
                        + (1 - actual) * np.log(1 - predicted))

    return loss


# Log loss for each prediction scenario against the same true labels.
# NOTE(review): the prediction arrays and actual_labels are not defined in the
# visible part of this notebook; they must exist before this runs.
correct_confident_loss = compute_log_loss(correct_confident, actual_labels)          # 1st case
correct_not_confident_loss = compute_log_loss(correct_not_confident, actual_labels)  # 2nd case
wrong_not_confident_loss = compute_log_loss(wrong_not_confident, actual_labels)      # 3rd case
wrong_confident_loss = compute_log_loss(wrong_confident, actual_labels)              # 4th case
actual_labels_loss = compute_log_loss(actual_labels, actual_labels)                  # labels vs. themselves

# Report all results, one per line, in the order computed
print(
    "Log loss, correct and confident: {}".format(correct_confident_loss),
    "Log loss, correct and not confident: {}".format(correct_not_confident_loss),
    "Log loss, wrong and not confident: {}".format(wrong_not_confident_loss),
    "Log loss, wrong and confident: {}".format(wrong_confident_loss),
    "Log loss, actual labels: {}".format(actual_labels_loss),
    sep="\n",
)

In [None]:
#Building a model

##First multi-class logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier #treat each column of y independently, fitting separate classifier for each of the columns

# Split data to just contain numeric columns
# NOTE(review): NUMERIC_COLUMNS and multilabel_train_test_split are not defined
# in the visible part of this notebook; they must be provided before this runs.
# The original wrote `NUMERIC COLUMNS` (with a space), which is a syntax error.
data_to_train = df[NUMERIC_COLUMNS].fillna(-1000)  # -1000 stands in for missing values
labels_to_use = pd.get_dummies(df[LABELS])         # one indicator column per label value
X_train, X_test, y_train, y_test = multilabel_train_test_split(data_to_train, labels_to_use, size = 0.2, seed = 123)

# Training the model: one logistic regression per indicator column
clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, y_train)

# Print the accuracy
print("Accuracy: {}".format(clf.score(X_test, y_test)))

##Predicting on the holdout data
holdout = pd.read_csv('HoldoutData.csv', index_col = 0)
# Same preprocessing as the training features: numeric columns only, missing
# values replaced by -1000. (`NUMERIC COLUMNS` with a space was a syntax error.)
# NOTE(review): NUMERIC_COLUMNS is not defined in the visible part of this notebook.
holdout = holdout[NUMERIC_COLUMNS].fillna(-1000)
predictions = clf.predict_proba(holdout)

##Submitting predictions to CSV
# Column names come from the dummy-encoded labels (with '__' as separator);
# rows keep the holdout index.
prediction_df = pd.DataFrame(columns = pd.get_dummies(df[LABELS], prefix_sep = '__').columns,
                            index = holdout.index, data = predictions)
prediction_df.to_csv('predictions.csv')
# NOTE(review): score_submission is not defined in the visible part of this notebook.
score = score_submission(pred_path = 'predictions.csv')

In [None]:
#Natural Language Processing (NLP)
#Tokenisation: splitting strings into segments, store segments as lists
from sklearn.feature_extraction.text import CountVectorizer

# A token is a run of non-whitespace characters followed by whitespace.
# The original pattern was escaped twice, so it matched literal backslashes
# instead of the \S and \s character classes.
TOKENS_BASIC = '\\S+(?=\\s+)'

# Assign the filled column back instead of calling fillna(inplace=True) on the
# attribute-accessed Series, which is not guaranteed to update df.
df['Program_Description'] = df['Program_Description'].fillna('')

# The original referenced the misspelt name TOKENS_BASC (NameError)
vec_basic = CountVectorizer(token_pattern = TOKENS_BASIC)

vec_basic.fit(df.Program_Description)
msg = 'There are {} tokens in Program Description if tokens are any non-whitesspace'
print(msg.format(len(vec_basic.get_feature_names())))


# Define combine_text_columns()
def combine_text_columns(data_frame, to_drop=NUMERIC_COLUMNS + LABELS):
    """ Join the remaining columns of each row into one space-separated string.

        :param data_frame: DataFrame to process; it is not modified.
        :param to_drop: Column names to exclude. Names that are not present
            in data_frame are ignored.
        :return: Result of a row-wise apply that joins each row's values
            with a single space (missing values become empty strings).
    """
    # Only drop the columns that actually exist in this frame
    present = set(data_frame.columns.tolist())
    droppable = set(to_drop).intersection(present)

    # Remove the excluded columns and blank out missing values
    text_only = data_frame.drop(droppable, axis = 1).fillna('')

    # One string per row
    return text_only.apply(lambda row: " ".join(row), axis=1)

# Token patterns: any non-whitespace run vs. alphanumeric-only run,
# each required to be followed by whitespace
TOKENS_BASIC = '\\S+(?=\\s+)'
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# One string per row, built from the text columns of df
text_vector = combine_text_columns(df)

# Basic vectorizer: fit on the combined text and report vocabulary size
vec_basic = CountVectorizer(token_pattern = TOKENS_BASIC)
vec_basic.fit_transform(text_vector)
print("There are {} tokens in the dataset".format(len(vec_basic.get_feature_names())))

# Alphanumeric vectorizer: same procedure with the stricter pattern
vec_alphanumeric = CountVectorizer(token_pattern = TOKENS_ALPHANUMERIC)
vec_alphanumeric.fit_transform(text_vector)
print("There are {} alpha-numeric tokens in the dataset".format(len(vec_alphanumeric.get_feature_names())))

In [None]:
#Pipelines, feature & text preprocessing
#Pipeline is a repeatable way to go from raw data to trained model
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

# Numeric-only pipeline: impute missing values, then classify.
# (`Logistic Regression()` with a space was a syntax error.)
pl = Pipeline([('imp', Imputer()),
               ('clf', OneVsRestClassifier(LogisticRegression()))
              ])
# Select the feature with a list (df[['numeric']]) so X is 2-D; the imputer
# and classifier reject a 1-D Series.
X_train, X_test, y_train, y_test = train_test_split(df[['numeric']],
                                                    pd.get_dummies(df['label']),
                                                    random_state = 2)
pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)
print("Accuracy on sample data: ", accuracy)

##Preprocessing text features
from sklearn.feature_extraction.text import CountVectorizer

# Text-only example: the feature is a single column of strings, which is the
# 1-D input CountVectorizer expects.
X_train, X_test, y_train, y_test = train_test_split(df['text'],
                                                    pd.get_dummies(df['label']),
                                                    random_state = 2)

# Vectorise the text, then classify.
# (`Logistic Regression()` with a space was a syntax error.)
pl = Pipeline([('vec', CountVectorizer()),
               ('clf', OneVsRestClassifier(LogisticRegression()))
              ])
pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)
print("Accuracy on sample data: ", accuracy)

##Using both numeric and text data in single pipeline
#Write two functions for pipeline preprocessing: 1. Take entire DataFrame, return numeric columns
#2. Take entire DataFrame, return text columns
#Then, can preprocess numeric and text data in separate pipelines
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion

X_train, X_test, y_train, y_test = train_test_split(df[['numeric', 'text']],
                                                    pd.get_dummies(df['label']),
                                                    random_state = 2)

# Column selectors. The numeric selector returns a 2-D frame (list of columns)
# because the imputer needs 2-D input; the text selector returns the 1-D
# column of strings that CountVectorizer consumes.
get_numeric_data = FunctionTransformer(lambda x: x[['numeric']], validate = False)
get_text_data = FunctionTransformer(lambda x: x['text'], validate = False)

# The sub-pipelines must exist before the FeatureUnion that references them
# (the original built the union first, which raises NameError).
numeric_pipeline = Pipeline([('selector', get_numeric_data),
                            ('imputer', Imputer())])
text_pipeline = Pipeline([('selector', get_text_data),
                            ('vectoriser', CountVectorizer())])

# Run both sub-pipelines and join their outputs side by side
union = FeatureUnion([('numeric', numeric_pipeline),
                     ('text', text_pipeline)])

# Full pipeline: joined features -> one-vs-rest logistic regression.
# (The original had an unbalanced parenthesis on the 'clf' step.)
pl = Pipeline([('union', union),
              ('clf', OneVsRestClassifier(LogisticRegression()))])



# Example Practice
# NOTE(review): sample_df is not defined in the visible part of this notebook;
# it must exist before this section runs.

# Split using ALL data in sample_df
X_train, X_test, y_train, y_test = train_test_split(sample_df[['numeric', 'with_missing', 'text']],
                                                    pd.get_dummies(sample_df['label']),
                                                    random_state=22)

# Numeric selector for this example. It covers both numeric columns used in
# the split and returns a 2-D frame, which the imputer requires. The selector
# defined in the previous section returned a single 1-D column and left
# 'with_missing' unused.
get_numeric_data = FunctionTransformer(lambda x: x[['numeric', 'with_missing']], validate = False)

# Create a FeatureUnion with nested pipeline: process_and_join_features
process_and_join_features = FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer())
                ]))
             ]
        )

# Instantiate nested pipeline: pl
pl = Pipeline([
        ('union', process_and_join_features),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])


# Fit pl to the training data
pl.fit(X_train, y_train)

# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on sample data - all data: ", accuracy)

In [None]:
#Using pipeline on School Budget dataset
import numpy as np
import pandas as pd

# Indicator (dummy) columns for every label value
dummy_labels = pd.get_dummies(df[LABELS])

# Hold out 20% of the rows. The seed is fixed, as in the earlier split in this
# notebook, so that re-running the cell reproduces the same train/test sets.
# NOTE(review): NON_LABELS and multilabel_train_test_split are not defined in
# the visible part of this notebook; they must be provided before this runs.
X_train, X_test, y_train, y_test = multilabel_train_test_split(
    df[NON_LABELS], dummy_labels, 0.2, seed = 123)


# Selectors used as the first step of each sub-pipeline: one returns the
# numeric columns, the other the combined text of each row.
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate = False)
get_text_data = FunctionTransformer(combine_text_columns, validate = False)

# Budget pipeline: impute the numeric columns and vectorise the combined text
# in parallel, join both feature sets, then fit one logistic regression per label.
pl = Pipeline(steps=[
    ('union', FeatureUnion(transformer_list=[
        ('numeric_features', Pipeline(steps=[
            ('selector', get_numeric_data),
            ('imputer', Imputer()),
        ])),
        ('text_features', Pipeline(steps=[
            ('selector', get_text_data),
            ('vectoriser', CountVectorizer()),
        ])),
    ])),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])

# Pipeline code from example (replaces the pipeline defined just above)
pl = Pipeline(steps=[
    ('union', FeatureUnion([
        # numeric branch: select numeric columns, fill missing values
        ('numeric_features', Pipeline(steps=[
            ('selector', get_numeric_data),
            ('imputer', Imputer()),
        ])),
        # text branch: combine text columns, count tokens
        ('text_features', Pipeline(steps=[
            ('selector', get_text_data),
            ('vectorizer', CountVectorizer()),
        ])),
    ])),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])

# Train on the training split, then score on the held-out split
pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on budget dataset: ", accuracy)

#Edit model step in pipeline
from sklearn.ensemble import RandomForestClassifier
# Same preprocessing as before; only the final model step is swapped
# for a random forest.
pl = Pipeline(steps=[
    ('union', FeatureUnion(transformer_list=[
        ('numeric_features', Pipeline(steps=[
            ('selector', get_numeric_data),
            ('imputer', Imputer()),
        ])),
        ('text_features', Pipeline(steps=[
            ('selector', get_text_data),
            ('vectoriser', CountVectorizer()),
        ])),
    ])),
    ('clf', RandomForestClassifier()),
])

In [None]:
#The Full Process

##Processing
from sklearn.feature_extraction.text import CountVectorizer

# Token pattern: alphanumeric run followed by whitespace
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# One combined text string per training row
text_vector = combine_text_columns(X_train)

# Count single tokens and adjacent token pairs (1-grams and 2-grams)
vec = CountVectorizer(ngram_range = (1, 2), token_pattern = TOKENS_ALPHANUMERIC)
vec.fit(text_vector)

##Pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import chi2, SelectKBest

# Select 300 best features
chi_k = 300

# Import functional utilities
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.pipeline import FeatureUnion

# Perform preprocessing: selectors for the text and numeric sub-pipelines
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)

# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Instantiate pipeline: pl
# numeric branch: select -> impute
# text branch:    select -> count 1-/2-grams -> keep the chi_k best features (chi2)
# joined features are scaled by their maximum absolute value, then classified.
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                   ngram_range=(1,2))),
                    # k is passed by keyword: newer scikit-learn releases
                    # reject it as a positional argument.
                    ('dim_red', SelectKBest(chi2, k=chi_k))
                ]))
             ]
        )),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

##Interaction terms (statistical tools): interaction terms mathematically describe when tokens appear together
from sklearn.preprocessing import PolynomialFeatures

# Same pipeline as above with an extra step that adds degree-2 interaction
# terms between the joined features before scaling.
# NOTE(review): SparseInteractions is not defined or imported in the visible
# part of this notebook; it must be provided before this runs.
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                   ngram_range=(1, 2))),
                    # k is passed by keyword: newer scikit-learn releases
                    # reject it as a positional argument.
                    ('dim_red', SelectKBest(chi2, k=chi_k))
                ]))
             ]
        )),
        ('int', SparseInteractions(degree = 2)),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

##Hashing
from sklearn.feature_extraction.text import HashingVectorizer

# Token pattern: alphanumeric run followed by whitespace
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Combined text of every training row: text_data
text_data = combine_text_columns(X_train)

# Hashing vectorizer using that token pattern: hashing_vec
hashing_vec = HashingVectorizer(token_pattern = TOKENS_ALPHANUMERIC)

# Hash the text into a sparse matrix
hashed_text = hashing_vec.fit_transform(text_data)

# Put the stored (non-zero) entries of the sparse matrix in a DataFrame
# and show the first rows
hashed_df = pd.DataFrame(hashed_text.data)
print(hashed_df.head())

In [None]:
#The Winning Pipeline

# Instantiate the winning model pipeline: pl
# numeric branch: select -> impute
# text branch:    select -> hash 1-/2-grams -> keep the chi_k best features (chi2)
# then interaction terms, max-abs scaling and one-vs-rest logistic regression.
# NOTE(review): SparseInteractions is not defined or imported in the visible
# part of this notebook; it must be provided before this runs.
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    # alternate_sign=False keeps every hashed count
                    # non-negative, which chi2 requires. It replaces the
                    # non_negative=True flag, which scikit-learn removed.
                    ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                     alternate_sign=False, norm=None, binary=False,
                                                     ngram_range=(1,2))),
                    # k is passed by keyword: newer scikit-learn releases
                    # reject it as a positional argument.
                    ('dim_red', SelectKBest(chi2, k=chi_k))
                ]))
             ]
        )),
        ('int', SparseInteractions(degree=2)),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])