# Preprocessing and Text Classification

## Preprocessing

Load the `twitter_samples` corpus from NLTK and compute the average length (in characters) of the tweets in the corpus.

In [3]:
import nltk
# Fetch the corpus data. The recorded output of this cell shows the download
# failing without network access, after which the corpus lookup raises LookupError.
nltk.download('twitter_samples')
from nltk.corpus import twitter_samples
# Tweet texts used by the rest of the notebook
# (presumably a list of plain strings -- the helpers below call len() on it and on each item).
corpus=twitter_samples.strings()

def tweet_avg(corpus):
    """Return the mean length of the items in ``corpus`` as a float.

    The lengths are summed with a generator expression rather than an
    explicit loop. Raises ZeroDivisionError when ``corpus`` is empty.
    """
    total_chars = sum(len(tweet) for tweet in corpus)
    return float(total_chars) / float(len(corpus))

def tweet_avg_iterate(corpus):
    """Return the mean length of the items in ``corpus`` as a float.

    Same result as ``tweet_avg`` but accumulated with an explicit loop.
    Raises ZeroDivisionError when ``corpus`` is empty.
    """
    total_chars = 0
    for text in corpus:
        # running total of characters seen so far
        total_chars = total_chars + len(text)

    return float(total_chars) / float(len(corpus))
        

# Show the average tweet length (print converts the float to text itself).
print(tweet_avg_iterate(corpus))



[nltk_data] Error loading twitter_samples: <urlopen error [Errno 8]
[nltk_data]     nodename nor servname provided, or not known>


LookupError: 
**********************************************************************
  Resource twitter_samples not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('twitter_samples')
  Searched in:
    - '/Users/danielgil/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/Users/danielgil/anaconda/nltk_data'
    - '/Users/danielgil/anaconda/share/nltk_data'
    - '/Users/danielgil/anaconda/lib/nltk_data'
**********************************************************************


Extract hashtags using regular expressions. Let's consider all hashtags of length 8 or longer which consist only of lower-case letters. A hashtag must be delimited by whitespace, except that it may also occur at the very beginning or the very end of the tweet.

In [None]:
import re

# A hashtag qualifies when it is 8+ lower-case letters and is delimited on each
# side by whitespace or by the start/end of the tweet.
# Lookarounds are used instead of matching the delimiters themselves: a consumed
# "\s" cannot also serve as the leading delimiter of the next hashtag, so in a run
# such as "#aaaaaaaa #bbbbbbbb #cccccccc #dddddddd" every other tag would be lost.
# The lookarounds also accept a tweet that consists of nothing but one hashtag.
reg_hashtag=re.compile(r'(?<!\S)#([a-z]{8,})(?!\S)')


topics=[]
for tweet in corpus:
    # findall returns the captured tag text (without '#') in tweet order;
    # extend avoids re-copying the whole list for every tweet
    topics.extend(reg_hashtag.findall(tweet))

print(len(topics))

Tokenise the hashtags. For this, instead of using an NLTK tokenizer, I will implement a reversed version of the MaxMatch algorithm, where matching begins at the end of the hashtag and progresses backwards (using the NLTK list of words for matching).

In [None]:
from nltk.corpus import wordnet as wn
import sys
# Dictionary used by the MaxMatch functions.
# words is a Python list, so every `lemma in words` test is a linear scan.
words = nltk.corpus.words.words() # words is a Python list
# Shared lemmatizer instance used by lemmatize_word below.
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# One entry per hashtag in `topics`; each entry is the list of tokens found by maxmatch.
tokenised_hashtags=[] # list of list with tokenised hashtags

## code below based on the lemmatizer WSTA_N1B_preprocessing.ipynb modified to include adjectives and adverbs
def lemmatize_word(word):
    """Return a lemma for ``word``, trying several parts of speech in turn.

    The word is lemmatized as a verb, then noun, adjective and adverb; the
    first result that differs from ``word`` is returned. If every attempt
    leaves the word unchanged, the (unchanged) adverb result is returned.
    """
    for pos in (wn.VERB, wn.NOUN, wn.ADJ, wn.ADV):
        lemma = lemmatizer.lemmatize(word, pos)
        if lemma != word:
            # this part of speech produced a real reduction -- stop here
            break

    return lemma
## End of referenced code

## Code below is based on the MaxMatch algorithm from the book Speech and Language Processing, p. 15
def maxmatch(topic_word,tokenlist):
    """Reversed MaxMatch: segment ``topic_word`` starting from its end.

    At each step the longest suffix whose lemma is in ``words`` is taken as a
    token; if no suffix matches, the last character becomes a token on its
    own. Tokens are appended to ``tokenlist`` in the order they are found, so
    the right-most word of the hashtag comes first in the result.

    ``tokenlist`` is modified in place and also returned.
    """
    pending = topic_word

    # keep peeling tokens off the right-hand side until nothing is left
    while pending:
        for start in range(len(pending)):
            suffix = pending[start:]  # longest candidate first, shrinking from the left
            if lemmatize_word(suffix) in words:
                break
        else:
            # no suffix is a dictionary word: emit the last character by itself
            start = len(pending) - 1
            suffix = pending[start]

        tokenlist.append(suffix)
        pending = pending[:start]  # whatever precedes the token is still to be segmented

    return tokenlist
## End of algorithm reference
    
# Run the reversed MaxMatch on every hashtag, giving each one a fresh token list.
for topic in topics:
    token_list = maxmatch(topic, [])
    tokenised_hashtags.append(token_list)

# display the tokenisation of the last 20 hashtags
tokenised_hashtags[-20:]

Implement the forward version of the MaxMatch algorithm and print out all the hashtags which give different results for the two versions of MaxMatch. 

In [None]:
from nltk.corpus import wordnet as wn
import sys
# Dictionary for the forward MaxMatch (re-loaded here; same call as in the previous cell).
words = nltk.corpus.words.words() # words is a Python list

# One entry per hashtag in `topics`; each entry is the token list from maxmatch_forward.
tokenised_hashtags_forward=[] # list of list with tokenised hashtags

## Code below is based on the MaxMatch algorithm from the book Speech and Language Processing, p. 15
def maxmatch_forward(topic_word,tokenlist):
    """Forward MaxMatch: segment ``topic_word`` greedily from left to right.

    At each step the longest prefix whose lemma is in ``words`` is taken as a
    token and the function recurses on what follows it. If no prefix matches
    (not even the first character), the first character becomes a token on
    its own and segmentation continues with the rest of the string.

    ``tokenlist`` is modified in place and also returned; tokens appear in
    left-to-right order.
    """
    # nothing left to segment: the accumulated tokens are the result
    if not topic_word:
        return tokenlist

    # try the longest prefix first, shortening it one character at a time
    for end in range(len(topic_word), 0, -1):
        current_word=topic_word[:end]
        lemma=lemmatize_word(current_word) # get the lemma to search in word dictionary

        if lemma in words:
            tokenlist.append(current_word)
            return maxmatch_forward(topic_word[end:],tokenlist) # continue with the remainder

    # No prefix is a dictionary word: emit the FIRST character and keep going with
    # everything after it. (The previous code reused the leaked loop index here,
    # which emitted the last character and threw the rest of the hashtag away.)
    tokenlist.append(topic_word[0])

    return maxmatch_forward(topic_word[1:],tokenlist)
## End of algorithm reference
  
# Run the forward MaxMatch on every hashtag, giving each one a fresh token list.
for topic in topics:
    token_list_forward = maxmatch_forward(topic, [])
    tokenised_hashtags_forward.append(token_list_forward)

# display the tokenisation of the last 20 hashtags
tokenised_hashtags_forward[-20:]

I would use a mix of the two algorithms, choosing whichever lemma best fits the use case. For example: start with the forward MaxMatch and look up the lemma; if it satisfies the requirement (say, it is a verb), continue forward; if not, try the backward version to see whether the resulting word is a better fit.

## Text classification

The twitter_sample corpus has two subcorpora corresponding to positive and negative tweets. Let's iterate through these two corpora and build training, development, and test sets for use with Scikit-learn. I will exclude stopwords (from the built-in NLTK list) and tokens with non-alphabetic characters (emoji). I will also use 80% of the tweets for training, 10% for development, and 10% for testing in a <i>stratified</i> way.

In [None]:
from nltk.corpus import stopwords
import random

positive_tweets = nltk.corpus.twitter_samples.tokenized("positive_tweets.json")
negative_tweets = nltk.corpus.twitter_samples.tokenized("negative_tweets.json")


# stop words to drop, and a pattern that matches any single non-alphabetic character
stop_words = set(stopwords.words('english'))
reg=re.compile('[^a-zA-Z]')

# Dedicated, seeded generator so the split is the same on every run and the
# global `random` state is left untouched.
split_rng = random.Random(777)


def _filter_tokens(tweet):
    """Return the tokens of ``tweet`` that are purely alphabetic and not stop words.

    * ``reg.search`` rejects a token with a non-letter anywhere in it
      (``re.match`` only looked at the first character, so "don't" or "abc123" got through).
    * The stop-word test is done on the lower-cased token because the features are
      lower-cased later on; otherwise "The" would survive and become the feature "the".
    """
    return [word for word in tweet
            if word.lower() not in stop_words and not reg.search(word)]


def _split_offsets(n_tweets):
    """Return (end of train slice, end of dev slice) for an 80% / 10% / 10% split."""
    train_end=int(round(n_tweets*0.80))
    dev_end=int(round(train_end+(n_tweets*0.10)))
    return train_end, dev_end


# preprocess positive and negative tweets
filtered_positive_tweets=[_filter_tokens(tweet) for tweet in positive_tweets]
filtered_negative_tweets=[_filter_tokens(tweet) for tweet in negative_tweets]

# Stratified split: each class is shuffled and cut 80/10/10 on its own, so every
# subset keeps the original class proportions.
# first, positive tweets
split_rng.shuffle(filtered_positive_tweets)
data_offset_train, data_offset_dev = _split_offsets(len(filtered_positive_tweets))

positive_tweets_train_set=filtered_positive_tweets[:data_offset_train]
positive_tweets_dev_set=filtered_positive_tweets[data_offset_train:data_offset_dev]
positive_tweets_test_set=filtered_positive_tweets[data_offset_dev:]

# second, negative tweets
split_rng.shuffle(filtered_negative_tweets)
data_offset_train, data_offset_dev = _split_offsets(len(filtered_negative_tweets))

negative_tweets_train_set=filtered_negative_tweets[:data_offset_train]
negative_tweets_dev_set=filtered_negative_tweets[data_offset_train:data_offset_dev]
negative_tweets_test_set=filtered_negative_tweets[data_offset_dev:]

# Merge the two classes into the final train / dev / test sets and mix them.
# The combined lists hold the very same token-list objects as the filtered_* lists.
tweets_train_set=positive_tweets_train_set+negative_tweets_train_set
split_rng.shuffle(tweets_train_set)
tweets_dev_set=positive_tweets_dev_set+negative_tweets_dev_set
split_rng.shuffle(tweets_dev_set)
tweets_test_set=positive_tweets_test_set+negative_tweets_test_set
split_rng.shuffle(tweets_test_set)  # was shuffling the dev set a second time



Now, let's build some classifiers. Here, we'll be comparing Naive Bayes and Logistic Regression. For each, I will first find a good value for their main regularisation (hyper)parameters using the development set.

<h3>Process</h3>
<b>
1. Prepare the data and get the Document-Term Matrix for training, dev and test datasets
2. Train models (Naive Bayes and Logistic Regression) and compare 
3. Test different parameters to understand how accuracy change

</b>

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix




# Single shared vectorizer: prepare_training_data fits it (fit_transform) and the
# dev/test helpers reuse the fitted instance through transform.
vectorizer=DictVectorizer()

# feature extraction with bag of words
def get_BOW_lowered(tweet):
    """Return a bag-of-words dict for ``tweet``.

    Keys are the lower-cased tokens, values are how many times each one
    occurs. An empty tweet gives an empty dict.
    """
    counts = {}
    for token in tweet:
        key = token.lower()
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1

    return counts

# prepare the data and get the Document-Term Matrix for training, dev and test datasets
def _extract_features_and_labels(tweets, feature_extractor):
    """Return (list of feature dicts, list of "negative"/"positive" labels) for ``tweets``."""
    # Label by object identity, not by value. Comparing token lists by value labels a
    # positive tweet "negative" whenever its filtered tokens equal those of any
    # negative tweet (e.g. both were filtered down to []), and scans the whole
    # negative list for every tweet.
    # Assumes the tweets passed in are the same list objects held by
    # filtered_negative_tweets (slicing, concatenating and shuffling keep them) --
    # confirm if the split is ever built from copies.
    negative_ids = {id(tweet) for tweet in filtered_negative_tweets}

    feature_matrix = []
    classifications = []
    for tweet in tweets:
        feature_matrix.append(feature_extractor(tweet))
        classifications.append("negative" if id(tweet) in negative_ids else "positive")

    return feature_matrix, classifications


def prepare_training_data(feature_extractor):
    """Build the training document-term matrix and labels from tweets_train_set.

    ``feature_extractor`` maps one tweet to a feature dict. The shared
    ``vectorizer`` is fitted here, so this must run before the dev/test helpers.
    Returns ``(training_dtm, classifications)``.
    """
    feature_matrix, classifications = _extract_features_and_labels(tweets_train_set, feature_extractor)

    # Create a document-term matrix learning the vocabulary
    training_dtm = vectorizer.fit_transform(feature_matrix)

    return training_dtm,classifications


def prepare_dev_data(feature_extractor):
    """Build the development document-term matrix and labels from tweets_dev_set.

    Uses the vocabulary already fitted on the training data (transform only).
    Returns ``(dev_dtm, classifications)``.
    """
    feature_matrix, classifications = _extract_features_and_labels(tweets_dev_set, feature_extractor)

    # Create a document-term matrix
    dev_dtm = vectorizer.transform(feature_matrix)

    return dev_dtm,classifications


def prepare_test_data(feature_extractor):
    """Build the test document-term matrix and labels from tweets_test_set.

    Uses the vocabulary already fitted on the training data (transform only).
    Returns ``(test_dtm, classifications)``.
    """
    feature_matrix, classifications = _extract_features_and_labels(tweets_test_set, feature_extractor)

    # create a document-term matrix
    test_dtm = vectorizer.transform(feature_matrix)

    return test_dtm,classifications


def check_results(predictions, classifications):
    """Print accuracy, the classification report and the confusion matrix.

    ``classifications`` are the true labels and ``predictions`` the predicted
    ones; each metric is called as ``metric(classifications, predictions)``.
    """
    print("Accuracy:")
    # each metric is computed and printed before the next one is evaluated
    for metric in (accuracy_score, classification_report, confusion_matrix):
        print(metric(classifications, predictions))
    



<h4>Train models (Naive Bayes and Logistic Regression) and compare</h4>

In [None]:
#def prepare_training_data():
    

In [None]:
from sklearn.linear_model import LogisticRegression
# fit a logistic regression model to the data 
# Baseline: logistic regression with no hyper-parameters set explicitly.
model = LogisticRegression()

# Build the training matrix and labels. prepare_training_data also fits the shared
# vectorizer, so it has to run before prepare_dev_data below.
X_train_dtm, y_train=prepare_training_data(get_BOW_lowered)
model.fit(X_train_dtm, y_train)
y_predicted_class = model.predict(X_train_dtm)

# Metrics on the data the model was fitted on (for comparison with dev below).
print("Logistic Regression not tuned - Training")
check_results(y_predicted_class,y_train)

# Score the development set with the same fitted model.
X_dev_dtm,y_dev=prepare_dev_data(get_BOW_lowered)
y_predicted_class = model.predict(X_dev_dtm)

print("-----------------------------------------")

# Metrics on the development set for the untuned model.
print("Logistic Regression not tuned - Dev")
check_results(y_predicted_class,y_dev)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# train fit a Naive Bayes model 
# Baseline: multinomial Naive Bayes with no hyper-parameters set explicitly.
# Note: `model` is rebound here, replacing the logistic regression from the previous cell.
model = MultinomialNB()
# Rebuilds the training matrix and labels (this re-fits the shared vectorizer).
X_train_dtm, y_train=prepare_training_data(get_BOW_lowered)

# Fit on the training matrix and predict the same data for the training metrics.
model.fit(X_train_dtm, y_train)
y_predicted_class = model.predict(X_train_dtm)
print("Naive Bayes not tuned  - Training")
check_results(y_predicted_class,y_train)

# Score the development set with the same fitted model.
X_dev_dtm,y_dev=prepare_dev_data(get_BOW_lowered)
y_predicted_class = model.predict(X_dev_dtm)

# Metrics on the development set for the untuned model.
print("Naive Bayes not tuned  - Dev")
check_results(y_predicted_class,y_dev)

<h5>Conclusions</h5>
<b>The model performs relatively well on the training data but noticeably worse on the development dataset: with the current parameters the model is overfitting. I will use regularization to reduce overfitting by tuning the complexity of the model. I'll focus first on trying different values of the parameter $\lambda$ in L2 regularization for LogisticRegression, and of $\alpha$ to smooth Naive Bayes and remove $0$ probabilities.</b>

<h4>Test different parameters to understand how accuracy change</h4>

<b>The hyper-parameter $C = \frac{1}{\lambda}$ is the inverse of the regularisation strength, so it controls the complexity of the model. Values of $C$ close to $0$ correspond to a large $\lambda$: the weights are strongly penalised and the model becomes simpler. Higher values of $C$ decrease $\lambda$, which lets the model assign large values to the weights and increase in complexity. I will sample $C$ across both ranges and compare the accuracy on the training vs. dev datasets.</b>


In [None]:
import numpy as np
import pandas as pd

param_C=list(np.power(10.0, np.arange(-15, 15))) # a vector with c values

# One [C, training accuracy, dev accuracy] row per candidate. The rows are collected
# in a list and turned into a DataFrame once, instead of enlarging the frame on
# every iteration.
sweep_rows=[]
for c in param_C:
    model=LogisticRegression(C=c) # intercept is left at its default here
    # train the model using X_train_dtm and make predictions with training data
    model.fit(X_train_dtm, y_train)
    y_train_predicted_class = model.predict(X_train_dtm)

    # make predictions with dev data
    y_dev_predicted_class = model.predict(X_dev_dtm)

    sweep_rows.append([c,accuracy_score(y_train,y_train_predicted_class),accuracy_score(y_dev,y_dev_predicted_class)])

# index starts at 1 to keep the same row labels as before
df_params=pd.DataFrame(sweep_rows,
                       columns=['C','Training Accuracy','Dev Accuracy'],
                       index=range(1,len(sweep_rows)+1))

# first C that reaches the highest dev accuracy
best_c=(df_params[df_params['Dev Accuracy'] == max(df_params['Dev Accuracy'])]).iat[0,0]
print(df_params)
print("Best parameter C:" +str(best_c))

<b>The maximum accuracy for our dev and training datasets is shown in the table (changing fit_intercept may improve it slightly).
I tried other parameters but the performance did not improve.<br/> I will use different approaches to tune the parameters after performing the same operation with Naive Bayes.</b>

In [None]:
# vector with alpha values
param_alpha=list(np.arange(0.1,5,0.2))

# One [alpha, training accuracy, dev accuracy] row per candidate. The rows are
# collected in a list and turned into a DataFrame once, instead of enlarging the
# frame on every iteration.
sweep_rows=[]
for alpha in param_alpha:
    model=MultinomialNB(alpha=alpha,fit_prior=True) # fit_prior = False decrease the accuracy for dev set
    # train the model using X_train_dtm
    model.fit(X_train_dtm, y_train)
    # make predictions with training data
    y_train_predicted_class = model.predict(X_train_dtm)
    # make predictions with dev data
    y_dev_predicted_class = model.predict(X_dev_dtm)
    sweep_rows.append([alpha,accuracy_score(y_train,y_train_predicted_class),accuracy_score(y_dev,y_dev_predicted_class)])

# index starts at 1 to keep the same row labels as before
df_params=pd.DataFrame(sweep_rows,
                       columns=['Alpha','Training Accuracy','Dev Accuracy'],
                       index=range(1,len(sweep_rows)+1))

# first alpha that reaches the highest dev accuracy
best_alpha=(df_params[df_params['Dev Accuracy'] == max(df_params['Dev Accuracy'])]).iat[0,0]
print(df_params)
print("Best parameter alpha:" +str(best_alpha))

<b>The maximum accuracy for our dev and training datasets is shown in the table.
(I tried other parameters but the performance did not improve.)</b><br/>

<h4>5. Test different approaches for tuning</h4>

In [None]:
# Grid Search to find the best parameters
import numpy as np
from sklearn.model_selection import GridSearchCV

# Candidate values: C = 10^-15 ... 10^14, each tried with and without an intercept.
param_C=list(np.power(10.0, np.arange(-15, 15))) # a list of Cs params to include in the parameter grid
fit_intercept_params=[True,False]

# The search is fitted on the training matrix only.
model = LogisticRegression()
grid = GridSearchCV(estimator=model, param_grid=dict(C=param_C,fit_intercept=fit_intercept_params) )
grid.fit(X_train_dtm, y_train)
# Dev predictions are computed here but not scored anywhere in this cell.
y_dev_predicted_class=grid.predict(X_dev_dtm)

# Summarize the search.
# NOTE(review): best_score_ is presumably the cross-validated score from grid.fit,
# not an accuracy on the dev set -- confirm against the scikit-learn docs.
print("Best accuracy: "+str(grid.best_score_))
print("Best estimator C: "+str(grid.best_estimator_.C))
print("Best parameters: "+str(grid.best_params_))

In [None]:
# Grid Search for Algorithm Tuning
import numpy as np
from sklearn.model_selection import GridSearchCV

# prepare a range of alpha values to test
# Candidate smoothing values: 0.1, 0.3, ... (step 0.2, stopping before 5).
alphas = list(np.arange(0.1,5,0.2))

# The search is fitted on the training matrix only.
model = MultinomialNB()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(X_train_dtm, y_train)
# Dev predictions are computed here but not scored anywhere in this cell.
y_predicted_class=grid.predict(X_dev_dtm)
print("Best parameters: "+str(grid.best_params_))
# NOTE(review): best_score_ is presumably the cross-validated score from grid.fit,
# not an accuracy on the dev set -- confirm against the scikit-learn docs.
print("Best accuracy: "+str(grid.best_score_))
print("Best estimator Alpha: "+str(grid.best_estimator_.alpha))

In [None]:
# randomized search to look for parameter combination for the Logistic Regression 
import scipy
from scipy.stats import uniform as sp_rand
from sklearn.model_selection import RandomizedSearchCV

# sample values of C from a distribution and including other parameters in the grid
# C is drawn from an exponential distribution; the other parameters are sampled
# from the listed options.
param_grid = {'C': scipy.stats.expon(scale=100),'class_weight':['balanced', None], 'max_iter':[50,100,200],'fit_intercept':[True,False]}

# Fit the model on 300 sampled parameter combinations.
# random_state fixes the sampled combinations so the reported "best" parameters
# can be reproduced on a re-run (same seed value as the LogisticRegressionCV cell).
model = LogisticRegression()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=300, random_state=777)
rsearch.fit(X_train_dtm, y_train)
# dev predictions from the refitted best estimator (not scored in this cell)
y_predicted_class=rsearch.predict(X_dev_dtm)

# view the results
print("Best accuracy: "+str(rsearch.best_score_))
print("Best estimator C: "+str(rsearch.best_estimator_.C))
print("Best parameters: "+str(rsearch.best_params_))


In [None]:
# randomized search to look for parameter combination for the Multinomial NB
from scipy.stats import uniform as sp_rand
from sklearn.model_selection import RandomizedSearchCV

# sample the parameter alpha
# alpha is drawn from scipy's uniform distribution with its default parameters
param_grid = {'alpha': sp_rand()}

# Fit the model on 300 sampled alpha values.
# random_state fixes the sampled values so the reported "best" alpha can be
# reproduced on a re-run (same seed value as the LogisticRegressionCV cell).
model = MultinomialNB()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=300, random_state=777)
rsearch.fit(X_train_dtm, y_train)
# dev predictions from the refitted best estimator (not scored in this cell)
y_predicted_class=rsearch.predict(X_dev_dtm)

# view the results
print("Best accuracy: "+str(rsearch.best_score_))
print("Best estimator Alpha: "+str(rsearch.best_estimator_.alpha))
print("Best parameters: "+str(rsearch.best_params_))

In [None]:
# using LogisticRegressionCV for customized grid search in model LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

# create the model with different parameters. Fit and predict.
# Logistic regression that picks C itself from the candidates 0.1, 0.6, ... (step 0.5,
# stopping before 10). L2 penalty, liblinear solver, fixed random_state.
model = LogisticRegressionCV(
    Cs=list(np.arange(0.1, 10, 0.5))
    ,penalty='l2'
    ,random_state=777
    ,max_iter=10000
    ,fit_intercept=True
    ,solver='liblinear'
    ,tol=0.0001
)
# Fit on the training matrix, then predict the development set.
model.fit(X_train_dtm, y_train)
y_predicted_class=model.predict(X_dev_dtm)

# Accuracy, classification report and confusion matrix on the dev set.
check_results(y_predicted_class,y_dev)

Now, let's compare with the test dataset:

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# Rebuild the training matrix (this re-fits the shared vectorizer) and build the
# test matrix with that same vocabulary.
X_train_dtm, y_train=prepare_training_data(get_BOW_lowered)
X_test_dtm,y_test=prepare_test_data(get_BOW_lowered)

# Logistic regression with the C selected on the dev set.
# NOTE(review): best_c was selected with the default intercept setting, while this
# model is fitted with fit_intercept=False -- confirm this combination is intended.
model_regression = LogisticRegression(C=best_c, fit_intercept= False)
model_regression.fit(X_train_dtm, y_train)
y_regression_predicted_class = model_regression.predict(X_test_dtm)

# Naive Bayes with the alpha selected on the dev set.
model_NB = MultinomialNB(alpha=best_alpha)
model_NB.fit(X_train_dtm, y_train)
y_nb_predicted_class = model_NB.predict(X_test_dtm)

# Summary table: one row per model. The first row used to be labelled
# 'LinearRegression', but the model is a LogisticRegression classifier.
df_model_results=pd.DataFrame(
    [
        ['Logistic Regression',accuracy_score(y_test,y_regression_predicted_class),f1_score(y_test, y_regression_predicted_class, average="macro")],
        ['Naive Bayes',accuracy_score(y_test,y_nb_predicted_class),f1_score(y_test, y_nb_predicted_class, average="macro")],
    ],
    columns=['Model','Accuracy','Macroaveraged F-Score'],
)

print("********Summary************")
print(df_model_results)
print("***************************")
# results for the tuned logistic regression
print("Logistic Regression - Test")
check_results(y_regression_predicted_class,y_test)

# results for the tuned NB classifier
print("Naive Bayes - Test")
check_results(y_nb_predicted_class,y_test)
