WORD2VEC + WORD N GRAMS + CHAR N GRAMS + LINGUISTIC CHARACTERISTICS

In [1]:
import cython
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [2]:
def tweet_to_words( raw_tweet ):
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_tweet).get_text() 
    # 2. Remove non-letters        
    #letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    # 3. Convert to lower case, split into individual words
    words = review_text.lower().split()   
    #words = letters_only.lower().split()            
    # 4. In Python, searching a set is much faster than searching
    stops = set(stopwords.words("english")) 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]  
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))  

import nltk.data

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def tweet_to_sentences( tweet, tokenizer, remove_stopwords=False ):
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(tweet.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call tweet_to_wordlist to get a list of words
            sentences.append( tweet_to_wordlist( raw_sentence, \
              remove_stopwords ))
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [3]:
# Load the model that we created 
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import numpy as np
model = KeyedVectors.load("model_youtube")



In [4]:
def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector  
   
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [5]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
data=pd.read_csv("labeled_data.csv",encoding="latin-1")

In [7]:
num_tweets = data["tweet"].size
clean_train_tweets = []
for i in range( 0, num_tweets ):
    if( (i+1)%10000 == 0 ):
        print ("Tweet %d of %d\n" % ( i+1, num_tweets ))                                                                    
    clean_train_tweets.append( tweet_to_words( data.iloc[i]["tweet"]))

Tweet 10000 of 121054

Tweet 20000 of 121054

Tweet 30000 of 121054

Tweet 40000 of 121054

Tweet 50000 of 121054

Tweet 60000 of 121054



  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


Tweet 70000 of 121054

Tweet 80000 of 121054

Tweet 90000 of 121054

Tweet 100000 of 121054

Tweet 110000 of 121054

Tweet 120000 of 121054



In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
#initialize the count vectorizer
vectorizer = TfidfVectorizer(analyzer = "word",   \
                             ngram_range=(1, 3),  \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 300,  \
                             max_df = 0.85) 
train_data_features = vectorizer.fit_transform(clean_train_tweets)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
#initialize the count vectorizer
vectorizer = TfidfVectorizer(analyzer = "char",   \
                             ngram_range=(3, 6),  \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 300,  \
                             max_df = 0.85) 
train_data_char_features = vectorizer.fit_transform(clean_train_tweets)

In [10]:
train_data_features = train_data_features.toarray()

In [11]:
train_data_char_features = train_data_char_features.toarray()

In [12]:
tweets=data.tweet

In [13]:
stopwords=stopwords = nltk.corpus.stopwords.words("english")

from nltk.stem.porter import *

other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

stemmer = PorterStemmer()


def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text

def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens

def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
    return tweet.split()

In [17]:
def count_twitter_objs(text_string):
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    emoticons_regex = '[\U0001f600-\U0001f650]'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    parsed_text = re.sub(emoticons_regex, 'EMOTICONHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'),parsed_text.count('EMOTICONHERE'))

In [18]:
def syllable_count(word):
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

In [19]:
def features(tweet):
    words = preprocess(tweet) #Get text only
    syllables_all=0
    for word in words:
        syllables = syllable_count(words)
        syllables_all = syllables_all+syllables
        
    num_chars = sum(len(w) for w in words)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables_all+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet)
    retweet = 0
    if "rt" in words:
        retweet = 1
    
    features = [FKRA, FRE,syllables_all, avg_syl, num_chars, num_chars_total, num_terms, num_words,num_unique_terms,
               twitter_objs[2], twitter_objs[1],twitter_objs[0],twitter_objs[3], retweet]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array(tweets):
    feats=[]
    for t in tweets:
        feats.append(features(t))
    return np.array(feats)

In [20]:
features_names = ["FKRA", "FRE","num_syllables", "avg_syl_per_word", "num_chars", "num_chars_total", \
                        "num_terms", "num_words", "num_unique_words","num_hashtags", "num_mentions", "num_urls", "num_emoticons","is_retweet"]

In [21]:
feats = get_feature_array(tweets)

In [22]:
import numpy as np
# get document level embeddings
num_features = 300
embeddings = averaged_word_vectorizer(corpus=clean_train_tweets, model=model, num_features=num_features)

  if __name__ == '__main__':


In [23]:
M=np.concatenate([embeddings,train_data_features,train_data_char_features,feats],axis=1)

In [24]:
X = pd.DataFrame(M)
y = data['class'].astype(int)

In [25]:
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=69, test_size=0.33)

In [27]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel

Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression
pipe = Pipeline(
        [('select', SelectFromModel(LogisticRegression(class_weight='balanced',
                                                  penalty="l1", C=0.01))),
        ('model', LogisticRegression(class_weight='balanced',penalty='l2'))])

In [29]:
param_grid = [{}] # Optionally add parameters here
grid_search = GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=5, random_state=69).split(X_train, y_train), verbose=2)
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] ................................................. , total= 3.9min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.9min remaining:    0.0s


[CV] ................................................. , total= 2.7min
[CV]  ................................................................
[CV] ................................................. , total= 4.4min
[CV]  ................................................................
[CV] ................................................. , total= 2.4min
[CV]  ................................................................
[CV] ................................................. , total= 4.0min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 17.4min finished


In [43]:
y_preds = model.predict(X_test)
report = classification_report( y_test, y_preds )
print(report)

             precision    recall  f1-score   support

          0       0.84      0.59      0.69      8186
          1       0.89      0.81      0.85     14805
          2       0.77      0.94      0.85     16957

avg / total       0.83      0.82      0.82     39948



Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier
pipe = Pipeline(
        [('select', SelectFromModel(LogisticRegression(class_weight='balanced',
                                                  penalty="l1", C=0.01))),
        ('model', RandomForestClassifier(n_estimators=300, random_state=0))])

In [32]:
param_grid = [{}] # Optionally add parameters here
grid_search = GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=5, random_state=69).split(X_train, y_train), verbose=2)
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] ................................................. , total= 6.4min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.6min remaining:    0.0s


[CV] ................................................. , total= 7.0min
[CV]  ................................................................
[CV] ................................................. , total= 3.0min
[CV]  ................................................................
[CV] ................................................. , total= 5.3min
[CV]  ................................................................
[CV] ................................................. , total= 7.7min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 30.3min finished


In [33]:
y_preds = model.predict(X_test)
report = classification_report( y_test, y_preds )
print(report)

             precision    recall  f1-score   support

          0       0.86      0.70      0.77      8186
          1       0.90      0.88      0.89     14805
          2       0.83      0.92      0.87     16957

avg / total       0.86      0.86      0.86     39948



Linear SVC

In [34]:
pipe = Pipeline(
        [('select', SelectFromModel(LogisticRegression(class_weight='balanced',
                                                  penalty="l1", C=0.01))),
        ('model', LinearSVC(C=0.05,random_state=0))])

In [35]:
param_grid = [{}] # Optionally add parameters here
grid_search = GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=5, random_state=69).split(X_train, y_train), verbose=2)
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] ................................................. , total= 7.9min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.9min remaining:    0.0s


[CV] ................................................. , total= 3.0min
[CV]  ................................................................
[CV] ................................................. , total= 7.2min
[CV]  ................................................................
[CV] ................................................. , total= 4.9min
[CV]  ................................................................
[CV] ................................................. , total= 4.8min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 27.8min finished


In [36]:
y_preds = model.predict(X_test)
report = classification_report( y_test, y_preds )
print(report)

             precision    recall  f1-score   support

          0       0.82      0.34      0.48      8186
          1       0.43      0.91      0.59     14805
          2       0.68      0.22      0.33     16957

avg / total       0.62      0.50      0.46     39948



Extra Trees

In [37]:
from sklearn.tree import ExtraTreeClassifier
pipe = Pipeline(
        [('select', SelectFromModel(LogisticRegression(class_weight='balanced',
                                                  penalty="l1", C=0.01))),
        ('model', ExtraTreeClassifier())])

In [38]:
param_grid = [{}] # Optionally add parameters here
grid_search = GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=5, random_state=69).split(X_train, y_train), verbose=2)
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] ................................................. , total= 5.1min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.1min remaining:    0.0s


[CV] ................................................. , total= 2.1min
[CV]  ................................................................
[CV] ................................................. , total= 3.9min
[CV]  ................................................................
[CV] ................................................. , total= 4.1min
[CV]  ................................................................
[CV] ................................................. , total= 2.6min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 17.9min finished


In [39]:
y_preds = model.predict(X_test)
report = classification_report( y_test, y_preds )
print(report)

             precision    recall  f1-score   support

          0       0.70      0.71      0.71      8186
          1       0.86      0.87      0.86     14805
          2       0.83      0.82      0.83     16957

avg / total       0.82      0.82      0.82     39948



Naive Bayes

In [40]:
from sklearn.naive_bayes import BernoulliNB
pipe = Pipeline(
        [('select', SelectFromModel(LogisticRegression(class_weight='balanced',
                                                  penalty="l1", C=0.01))),
        ('model', BernoulliNB())])

In [41]:
param_grid = [{}] # Optionally add parameters here
grid_search = GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=5, random_state=69).split(X_train, y_train), verbose=2)
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] ................................................. , total=10.1min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 10.1min remaining:    0.0s


[CV] ................................................. , total= 2.2min
[CV]  ................................................................
[CV] ................................................. , total= 2.8min
[CV]  ................................................................
[CV] ................................................. , total= 2.8min
[CV]  ................................................................
[CV] ................................................. , total= 3.9min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 21.9min finished


In [42]:
y_preds = model.predict(X_test)
report = classification_report( y_test, y_preds )
print(report)

             precision    recall  f1-score   support

          0       0.84      0.59      0.69      8186
          1       0.89      0.81      0.85     14805
          2       0.77      0.94      0.85     16957

avg / total       0.83      0.82      0.82     39948

