## Bonus Question 2: Machine Learning Pipeline

In a new Jupyter Notebook, combine all your code into a function (or a class). Your new function will execute the complete machine learning pipeline: it receives the dataset location and outputs the trained classifier. This will allow you to use your function to predict the sentiment of any tweet in real time. 

In [1]:
# your code here
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
import nltk
import re
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet

In [2]:
# your code here

In [3]:
# URL pattern found here https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
# Raw strings are used so that the regex escapes (\/, \., \s ...) are not
# treated as (invalid) Python string escapes.
_URL_PATTERN = re.compile(
    r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
    r'|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
    r'|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}'
    r'|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
)
_NON_WORD_PATTERN = re.compile(r'\W')
_DIGIT_PATTERN = re.compile(r'\d')


def clean_up(s):
    """Normalize a raw tweet string before tokenization.

    Steps, in order: lowercase the text, delete URLs, delete '@' characters,
    replace every non-word character with a space, delete digits, and strip
    trailing whitespace.

    Parameters
    ----------
    s : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text. Leading whitespace and runs of inner spaces are
        left in place; only trailing whitespace is removed.
    """
    s = s.lower()
    s = _URL_PATTERN.sub('', s)
    # '@' is dropped (not replaced by a space) so a mention keeps its name
    # attached to whatever preceded it, as in the original implementation.
    s = s.replace('@', '')
    s = _NON_WORD_PATTERN.sub(' ', s)
    s = _DIGIT_PATTERN.sub('', s)
    return s.rstrip()

def tokenize(s):
    """Split the string *s* into a list of tokens using NLTK's word_tokenize."""
    tokens = word_tokenize(s)
    return tokens

def stem_and_lemmatize(l):
    """Stem, then lemmatize, every token in the list *l*.

    Each token is first reduced with the Porter stemmer and the stemmed form
    is then passed through the WordNet lemmatizer.

    Returns a new list of the same length and order as *l*.
    """
    stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()
    # Apply both normalizations token by token: lemmatize(stem(token)).
    return [wordnet_lemmatizer.lemmatize(stemmer.stem(token)) for token in l]

# Lazily-built cache of the stopword list, so the corpus is read only once
# instead of once per token.
_STOPWORD_CACHE = None


def _stopword_set():
    """Return the NLTK stopwords as a frozenset, loading them on first use."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        # stopwords.words() is called without a language argument, exactly as
        # the original lookup did, so the same set of words is filtered out.
        _STOPWORD_CACHE = frozenset(stopwords.words())
    return _STOPWORD_CACHE


def remove_stopwords(l):
    """Return the tokens of *l* that are not stopwords.

    Parameters
    ----------
    l : list of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens of *l*, in their original order, with every token that
        appears in ``stopwords.words()`` removed.
    """
    stop_set = _stopword_set()
    # Set membership is O(1); the original re-read the stopword corpus and
    # scanned it linearly for every single token.
    return [word for word in l if word not in stop_set]

# Let's combine them all in one function

def full_clean_up(s):
    """Run the whole text-preparation chain on the raw string *s*.

    The string is cleaned, tokenized, stemmed/lemmatized and finally stripped
    of stopwords; the resulting list of tokens is returned.
    """
    cleaned_text = clean_up(s)
    tokens = tokenize(cleaned_text)
    normalized_tokens = stem_and_lemmatize(tokens)
    return remove_stopwords(normalized_tokens)



In [43]:
def tweet_machine_learning(data, n_top_words=500):
    """Train a Naive Bayes sentiment classifier on a DataFrame of tweets.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'text' column (raw tweet text) and a 'target' column.
        A target equal to 4 is labelled 'positive'; any other value is
        labelled 'negative'. A 'text_processed' column holding the cleaned
        token lists is added to *data* in place.
    n_top_words : int, optional
        Number of most frequent tokens used as bag-of-words features
        (default 500).

    Returns
    -------
    tuple
        ``(classifier, top_words)``: the trained
        ``nltk.NaiveBayesClassifier`` and the list of vocabulary words, which
        is needed to build the feature dictionary of any new tweet.
    """
    # Process the tweet text
    data['text_processed'] = data['text'].apply(full_clean_up)

    # Bag of words: count every token across all tweets, keep the most common
    word_counts = nltk.FreqDist(
        token for tokens in data['text_processed'] for token in tokens
    )
    top_words = [word for word, _ in word_counts.most_common(n_top_words)]

    # Create features: one {word: present?} dictionary per tweet.
    # Iterating the two columns in lockstep avoids rebuilding a full list of
    # the column for every single lookup.
    feature_set = []
    for tokens, target in zip(data['text_processed'], data['target']):
        sentiment = 'positive' if target == 4 else 'negative'
        present = set(tokens)
        features = {w: (w in present) for w in top_words}
        feature_set.append((features, sentiment))

    # Only the first half of the rows is used for training; the second half
    # is left out of training.
    split_at = round(len(feature_set) / 2)
    feature_set_train = feature_set[:split_at]

    # Create and train the classifier
    classifier = nltk.NaiveBayesClassifier.train(feature_set_train)

    # Return the classifier to be able to use it, together with the top_words
    # list needed to create the features of any new tweet
    return (classifier, top_words)

Let's test the function on the actual data

In [54]:
# Load the sentiment140 CSV and drop its 'flag' column.
# NOTE(review): the path is specific to one machine; adjust it before running elsewhere.
tweets = pd.read_csv('C:/Users/ebour/Documents/()_Ironhack_DA_Bootcamp/()_Labs/sentiment140.csv').drop('flag', axis=1)

In [71]:
# I'll only take a small sample (5000 rows) to test if it works.
# No random_state is passed, so a different sample is drawn on every run.
tweets_sample = tweets.sample(5000)

In [72]:
# Train on the sample; the pipeline returns the classifier together with the
# vocabulary list needed to build features for new tweets.
classifier_output = tweet_machine_learning(tweets_sample)
classifier, top_words = classifier_output

In [73]:
# I'll write a function to create the features of any new tweet that we would want to classify

def tweet_features(tweet):
    """Build the feature dictionary for a single raw tweet.

    The tweet is run through ``full_clean_up`` and each word of the
    module-level ``top_words`` list is mapped to ``True`` when it occurs
    among the resulting tokens and to ``False`` otherwise.
    """
    # Clean list of words extracted from the tweet
    tokens = full_clean_up(tweet)

    # One boolean entry per vocabulary word: is it present in this tweet?
    return {vocab_word: (vocab_word in tokens) for vocab_word in top_words}

In [74]:
# Test of the individual steps: build the features, then classify them.

tweet_test = "This is not a real tweet"

# Feature dictionary for the new tweet
test_features = tweet_features(tweet_test)

# Classification of the new tweet based on its features
# (last expression of the cell, so its value is displayed)
classifier.classify(test_features)

'positive'

In [75]:
# I'll write a function that combines both operations done above: cleaning/tokenizing and classification

def classify_tweet(tweet):
    """Return the label the module-level ``classifier`` assigns to *tweet*.

    The raw tweet is first converted to a feature dictionary with
    ``tweet_features`` and that dictionary is then classified.
    """
    return classifier.classify(tweet_features(tweet))

In [76]:
# Test of the function on a new tweet (expected label: 'positive')
pos_tweet = "this is awesome"
classify_tweet(pos_tweet)

'positive'

In [77]:
# Test of the function on a new tweet (expected label: 'negative')
neg_tweet = "this is sad"
classify_tweet(neg_tweet)

'negative'

## IT WORKS !!!