# Data Pipeline Workflow


In [4]:
import nltk
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [5]:
import re
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [6]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def load_data():
    df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values    
    print("X Values",X[:5])
    y = df.category.values    
    print("Y Values",y[:5])
    return X, y

def tokenize(text):
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

### Step 1: Load data and perform a train test split

In [7]:
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder

# load data
X, y = load_data()

print("Label Categories",np.unique(y))

cat = LabelEncoder()

y_cat = cat.fit_transform(y)

print("Encoded Labels", np.unique(y_cat))

# perform train test split

X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=.25)

#print(y_train)

#print(y_test)


X Values [ 'Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  http://t.co/Ge9Lp7hpyG'
 'Barclays announces result of Rights Issue http://t.co/LbIqqh3wwG'
 'Barclays publishes its prospectus for its å£5.8bn Rights Issue: http://t.co/YZk24iE8G6'
 'Barclays Group Finance Director Chris Lucas is to step down at the end of the week due to ill health http://t.co/nkuHoAfnSD'
 'Barclays announces that Irene McDermott Brown has been appointed as Group Human Resources Director http://t.co/c3fNGY6NMT']
Y Values ['Information' 'Information' 'Information' 'Information' 'Information']
Label Categories ['Action' 'Dialogue' 'Information']
Encoded Labels [0 1 2]


### Step 2: Train classifier
* Fit and transform the training data with `CountVectorizer`. Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with `TfidfTransformer`.
* Fit a classifier to these tfidf values.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
import numpy as np

# Instantiate transformers and classifier
# Bag of words approach ....word count 
vect = CountVectorizer(tokenizer=tokenize)


# the transformer applies weighting to frequency counts so that unique words more heavily characterize the document 
tfidf = TfidfTransformer()

clf = LogisticRegression()

# Fit and/or transform each to the data

X_count = vect.fit_transform(X_train)

#full vocabulary 
#print(vect.get_feature_names())

X_trans = tfidf.fit_transform(X_count)

print(X_count.shape)
print(X_count.toarray())

print(X_trans.shape)
print(X_trans.toarray())

print(np.unique(y_train))

model = clf.fit(X_trans, y_train)

# Number of columns equal to number of words in vocabulary, each column represents an individual word in vocab
# Number of rows ~ number of documents. In this case messages  


(1802, 5548)
[[0 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 ..., 
 [0 2 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
(1802, 5548)
[[ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.07676674  0.         ...,  0.          0.          0.        ]
 [ 0.          0.08673402  0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.          0.10012939  0.         ...,  0.          0.          0.        ]
 [ 0.          0.05302439  0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]
[0 1 2]


### Step 3: Predict on test data


In [9]:
# Transform test data

X_test_count = vect.transform(X_test)

X_test_tfidf = tfidf.transform(X_test_count)

# Predict test labels
score = model.score(X_test_tfidf, y_test)

print(score)

y_pred = model.predict(X_test_tfidf)

0.890183028286


### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [10]:
print(y_test)

[2 2 2 2 0 2 2 2 2 2 2 0 2 0 2 1 2 0 0 2 2 0 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2
 2 2 2 2 2 2 0 2 2 2 1 2 2 2 2 2 0 2 2 0 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2
 2 0 0 2 2 2 0 2 2 2 2 2 2 2 2 0 0 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 0
 2 2 2 0 2 2 1 2 0 0 2 2 0 2 2 1 2 2 0 2 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 0 2
 2 0 2 2 2 1 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 0 2 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 2 2 0 0 0 2 2 2 2 2 2 2 0 0 2 2 2 0 2 2 0 2 0 2 2 0 2 1 2 2 2 2 2 2 2
 2 1 2 2 2 2 2 2 2 0 2 0 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 2 0
 2 2 2 2 2 0 2 2 2 0 2 1 0 0 0 2 2 0 2 2 2 2 2 2 2 2 1 2 2 2 2 2 0 2 0 2 2
 2 2 2 2 2 1 2 2 0 2 2 2 0 2 0 2 2 2 0 0 1 1 0 0 2 2 2 0 2 2 0 2 2 2 2 1 0
 2 0 2 2 2 0 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 0 2
 2 2 2 2 2 0 2 2 2 1 2 2 2 2 2 1 2 0 0 2 2 2 0 2 2 2 2 2 2 2 2 0 2 2 2 2 2
 2 0 2 1 2 1 0 0 2 1 0 0 2 0 2 2 2 2 2 1 0 2 2 2 0 1 2 2 2 0 2 2 2 2 2 0 2
 2 2 2 2 2 2 2 2 2 2 2 1 0 0 2 2 0 0 0 2 2 2 0 0 2 0 1 2 2 2 2 2 2 2 2 2 2
 2 2 0 0 1 2 2 0 0 2 2 2 

In [11]:
print(y_pred)

[2 2 2 2 0 2 2 2 2 2 2 0 2 2 2 1 2 2 2 2 2 0 2 2 2 2 2 2 1 0 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 0 2 2 2 2 0 2 2 2 2 2 2 2 2 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0
 2 2 2 0 2 2 1 2 2 0 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 0 2
 2 0 2 2 2 1 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 0 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 0 2 2 2 2 1 2 2 2 2 2 2 2
 2 1 2 2 2 2 2 2 2 2 2 0 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 0
 2 2 2 2 2 0 2 2 2 2 2 1 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 2 2
 2 2 2 2 2 1 2 2 2 2 2 2 0 2 0 2 2 2 0 0 1 2 2 0 2 2 2 2 2 2 0 2 2 2 2 1 0
 2 0 2 2 2 0 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 0 2
 2 2 2 2 2 0 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2
 2 0 2 2 2 2 0 2 2 2 2 0 2 0 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 0 2 2 0 2 0 1 2 2 2 2 2 2 2 2 2 2
 2 2 2 0 1 2 2 2 0 2 2 2 

In [12]:
print(np.unique(y_pred.round()))


[0 1 2]


In [13]:
from sklearn.metrics import confusion_matrix 

confusion_mat = confusion_matrix(y_test, y_pred)
accuracy = score 


print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

Confusion Matrix:
 [[ 63   0  55]
 [  1  23   8]
 [  2   0 449]]
Accuracy: 0.890183028286


# Final Step: Refactor



In [14]:
from sklearn.metrics import confusion_matrix 

def display_results():
    
    confusion_mat = confusion_matrix(y_test, y_pred)
    accuracy = model.score(X_test_tfidf, y_test)

    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)


def main():
    
    # Load corporate messages from csv, select, lemmatize, tokenize, split data (encode labels)
    
    X, y = load_data()
    
    # process label categories into integer categories 
    
    cat = LabelEncoder()

    y_cat = cat.fit_transform(y)

    print("Encoded Labels", np.unique(y_cat))

    # perform train test split

    X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=.25)
    
    # Process tokens,establish voabalulary and table of term freq, weight freq with transformer, implement logistic model
    
    vect = CountVectorizer(tokenizer=tokenize)
    
    tfidf = TfidfTransformer()

    clf = LogisticRegression()

    X_count = vect.fit_transform(X_train)

    X_trans = tfidf.fit_transform(X_count)

    # logistic regression model 
    
    model = clf.fit(X_trans, y_train)
    
    # perfrom the same transfromation on test data
    
    X_test_count = vect.transform(X_test)

    X_test_tfidf = tfidf.transform(X_test_count)

   # display results     
    
    display_results()
    
    

In [15]:
# run program
main()

X Values [ 'Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  http://t.co/Ge9Lp7hpyG'
 'Barclays announces result of Rights Issue http://t.co/LbIqqh3wwG'
 'Barclays publishes its prospectus for its å£5.8bn Rights Issue: http://t.co/YZk24iE8G6'
 'Barclays Group Finance Director Chris Lucas is to step down at the end of the week due to ill health http://t.co/nkuHoAfnSD'
 'Barclays announces that Irene McDermott Brown has been appointed as Group Human Resources Director http://t.co/c3fNGY6NMT']
Y Values ['Information' 'Information' 'Information' 'Information' 'Information']
Encoded Labels [0 1 2]
Confusion Matrix:
 [[ 63   0  55]
 [  1  23   8]
 [  2   0 449]]
Accuracy: 0.890183028286
