<h1>1. Preprocessing Methods</h1>

In [2]:
import re, string, time, pandas, numpy, enchant
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag

enchant_dict = enchant.Dict("en_US")
stopwords = list(stopwords.words('english'))+list(string.punctuation)
tokenizer = TweetTokenizer()
url_re = re.compile(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+|www.[^ ]+',re.VERBOSE | re.IGNORECASE)
mention_re = r'@[A-Za-z0-9]+'
hashtag_re = r'#[A-Za-z0-9]+'

def denoise(raw_tweet):
    if type(raw_tweet)==list:
        return [denoise(t) for t in raw_tweet]
    # Remove URLs and mentions
    raw_tweet = re.sub(url_re, ' ', raw_tweet)
    raw_tweet = re.sub(mention_re, ' ', raw_tweet)
    # TODO: anything with hashtags?
    # TODO: number normalization? (see normalize_number_todo.txt)
    # TODO: condense contractions?
    return raw_tweet

def tokenize(raw_tweet, stopwords=stopwords):
    if type(raw_tweet)==list:
        return [tokenize(t) for t in raw_tweet]
    return [token.lower() for token in tokenizer.tokenize(raw_tweet) if len(token)>0 and token.lower() not in stopwords]

def lemmatize(tokens, stopwords=stopwords):
    if len(tokens)!=0 and type(tokens[0])==list:
        return [lemmatize(t) for t in tokens]
    lemmatized_tokens = []
    for token, tag in pos_tag(tokens):
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        #stemmer = PorterStemmer()
        #token = stemmer.stem(token)
        if len(token)>0 and token.lower() not in stopwords:
            lemmatized_tokens.append(lemmatizer.lemmatize(token, pos))
    return lemmatized_tokens

def only_english(tokens):
    if len(tokens)!=0 and type(tokens[0])==list:
        return [only_english(t) for t in tokens]
    return [e for e in tokens if enchant_dict.check(e)]

<h1>2. Feature Extraction Methods</h1>

In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_fun(doc):
    return doc

def vectorize(tokenized_tweets):
    tfidf = TfidfVectorizer(
        analyzer='word',
        tokenizer=dummy_fun,
        preprocessor=dummy_fun,
        token_pattern=None,
        min_df=10,
        max_df=0.9,
        ngram_range=(1,2)
    )

    tfidf.fit(tokenized_tweets)
    return tfidf

def extract_polarity_features(dataset):
    #Split tweets into training and test sets with even portions of positive to negative tweets
    df = dataset.drop(dataset[dataset.airline_sentiment=="neutral"].index)
    
    positive_tweets = df[df.airline_sentiment=="positive"]
    negative_tweets = df[df.airline_sentiment=="negative"]

    positive_tweets_train = positive_tweets.sample(frac=0.5)
    positive_tweets_test = positive_tweets.drop(positive_tweets_train.index)

    negative_tweets_train = negative_tweets.sample(frac=0.5)
    negative_tweets_test = negative_tweets.drop(negative_tweets_train.index)

    tweets_train = pandas.concat([positive_tweets_train, negative_tweets_train])
    tweets_test = pandas.concat([positive_tweets_test, negative_tweets_test])
    tfidf=vectorize(dataset.tokens)
    X_train = tfidf.transform(tweets_train.lemmatized_tokens).toarray()
    y_train = [1 if label=="positive" else 0 for label in tweets_train.airline_sentiment]
    X_test = tfidf.transform(tweets_test.lemmatized_tokens).toarray()
    y_test = [1 if label=="positive" else 0 for label in tweets_test.airline_sentiment]
    
    return X_train, y_train, X_test, y_test

def extract_subjectivity_features(dataset):
    df = dataset.assign(subjective=dataset.airline_sentiment.apply(lambda s: 0 if s=="neutral" else 1))
    subjective_tweets = df[df.subjective==1]
    objective_tweets = df[df.subjective==0]

    subjective_tweets_train = subjective_tweets.sample(frac=0.8)
    subjective_tweets_test = subjective_tweets.drop(subjective_tweets_train.index)

    objective_tweets_train = objective_tweets.sample(frac=0.8)
    objective_tweets_test = objective_tweets.drop(objective_tweets_train.index)

    tweets_train = pandas.concat([subjective_tweets_train, objective_tweets_train])
    tweets_test = pandas.concat([subjective_tweets_test, objective_tweets_test])
    tfidf=vectorize(dataset.tokens)
    X_train = tfidf.transform(tweets_train.lemmatized_tweet)
    y_train = tweets_train.subjective
    X_test = tfidf.transform(tweets_test.lemmatized_tweet).toarray()
    y_test = tweets_test.subjective
    return X_train, y_train, X_test, y_test

<h1>3. Load dataset</h1>

In [89]:
start = time.time()
dataset = pandas.read_csv('https://raw.githubusercontent.com/kolaveridi/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv', encoding='latin-1')
dataset = dataset.assign(denoised_text = denoise(list(dataset.text.astype(str))))
dataset = dataset.assign(tokens = tokenize(list(dataset.denoised_text.astype(str))))
dataset = dataset.assign(lemmatized_tokens = lemmatize(list(dataset.tokens)))
dataset = dataset.assign(english_lemmatized_tokens = only_english(list(dataset.lemmatized_tokens)))
print("Took " + str(time.time()-start) + " seconds to load and preprocess tweets")
dataset

Took 89.28535342216492 seconds to load and preprocess tweets


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,denoised_text,tokens,lemmatized_tokens,english_lemmatized_tokens
0,570306133677760513,neutral,1.0000,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada),What said.,[said],[say],[say]
1,570301130888122368,positive,0.3486,,0.0000,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada),plus you've added commercials to the experie...,"[plus, added, commercials, experience, ..., ta...","[plus, added, commercial, experience, ..., tacky]","[plus, added, commercial, experience, ..., tacky]"
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada),I didn't today... Must mean I need to take a...,"[today, ..., must, mean, need, take, another, ...","[today, ..., must, mean, need, take, another, ...","[today, ..., must, mean, need, take, another, ..."
3,570301031407624196,negative,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada),"it's really aggressive to blast obnoxious ""e...","[really, aggressive, blast, obnoxious, enterta...","[really, aggressive, blast, obnoxious, enterta...","[really, aggressive, blast, obnoxious, enterta..."
4,570300817074462722,negative,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada),and it's a really big bad thing about it,"[really, big, bad, thing]","[really, big, bad, thing]","[really, big, bad, thing]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14635,569587686496825344,positive,0.3487,,0.0000,American,,KristenReenders,,0,@AmericanAir thank you we got on a different f...,,2015-02-22 12:01:01 -0800,,,thank you we got on a different flight to Ch...,"[thank, got, different, flight, chicago]","[thank, get, different, flight, chicago]","[thank, get, different, flight]"
14636,569587371693355008,negative,1.0000,Customer Service Issue,1.0000,American,,itsropes,,0,@AmericanAir leaving over 20 minutes Late Flig...,,2015-02-22 11:59:46 -0800,Texas,,leaving over 20 minutes Late Flight. No warn...,"[leaving, 20, minutes, late, flight, warnings,...","[leave, 20, minute, late, flight, warning, com...","[leave, 20, minute, late, flight, warning, com..."
14637,569587242672398336,neutral,1.0000,,,American,,sanyabun,,0,@AmericanAir Please bring American Airlines to...,,2015-02-22 11:59:15 -0800,"Nigeria,lagos",,Please bring American Airlines to #BlackBerry10,"[please, bring, american, airlines, #blackberr...","[please, bring, american, airline, #blackberry10]","[please, bring, airline]"
14638,569587188687634433,negative,1.0000,Customer Service Issue,0.6659,American,,SraJackson,,0,"@AmericanAir you have my money, you change my ...",,2015-02-22 11:59:02 -0800,New Jersey,Eastern Time (US & Canada),"you have my money, you change my flight, and...","[money, change, flight, answer, phones, sugges...","[money, change, flight, answer, phone, suggest...","[money, change, flight, answer, phone, suggest..."


<h1>4. Polarity classifier (positive vs. negative)</h1>

In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

X_train, y_train, X_test, y_test = extract_polarity_features(dataset)

polarity_classifier = LogisticRegression()
polarity_classifier.fit(X_train, y_train)
y_pred = polarity_classifier.predict(X_test)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
#TODO: error?
print(confusion_mat, classification_rep)

[[4531   58]
 [ 521  660]]               precision    recall  f1-score   support

           0       0.90      0.99      0.94      4589
           1       0.92      0.56      0.70      1181

    accuracy                           0.90      5770
   macro avg       0.91      0.77      0.82      5770
weighted avg       0.90      0.90      0.89      5770



<h1>5. Subjectivity classifier (objective vs. subjective)</h1>

In [91]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

X_train, y_train, X_test, y_test = extract_subjectivity_features(dataset)

subjectivity_classifier = LogisticRegression()
subjectivity_classifier.fit(X_train, y_train)
y_pred = subjectivity_classifier.predict(X_test)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
#TODO: error?
print(confusion_mat, classification_rep)

AttributeError: 'DataFrame' object has no attribute 'lemmatized_tweet'

<h1>6. Topic Mining</h1>

In [120]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

def extract_topics_lda(words, doc_term_mat, number_topics = 5, number_words = 10):
    # Create and fit the LDA model
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(doc_term_mat)
    # Print the topics found by the LDA model
    topics = []
    for topic in lda.components_:
        topics.append([words[i] for i in topic.argsort()[:-number_words - 1:-1]])
    return topics

def load_subjectivity_lexicon():
    posUrl, negUrl = "resources/sentiment_lexicon/positive-words.txt", "resources/sentiment_lexicon/negative-words.txt"
    with open(posUrl, "r") as posFile, open(negUrl, "r") as negFile:
        posText, negText = posFile.read(), negFile.read()
        posLines, negLines = posText.split("\n"), negText.split("\n")
        return posLines[31:], negLines[31:]
    
positive_words, negative_words = load_subjectivity_lexicon()
english_lemmatized_tokens = list(dataset.english_lemmatized_tokens)
neutral_english_lemmatized_tokens = []
for tweet in english_lemmatized_tokens:
    arr = []
    for token in tweet:
        if not(token in positive_words or token in negative_words):
            arr.append(token)
    neutral_english_lemmatized_tokens.append(arr)

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    min_df=50,
    max_df=0.9,
    ngram_range=(1,2)
)

tfidf.fit(neutral_english_lemmatized_tokens)

pos_lemmatized_tokens = list(dataset[dataset.airline_sentiment=="positive"].english_lemmatized_tokens)
pos_lemmatized_tokens
pos_mat = tfidf.transform(pos_lemmatized_tokens)

neg_lemmatized_tokens = dataset[dataset.airline_sentiment=="negative"].english_lemmatized_tokens
neg_mat = tfidf.transform(neg_lemmatized_tokens)

neutral_lemmatized_tokens = dataset[dataset.airline_sentiment=="neutral"].english_lemmatized_tokens
neutral_mat = tfidf.transform(neutral_lemmatized_tokens)

In [121]:
neg_topics = extract_topics_lda(tfidf.get_feature_names(), neg_mat)
neutral_topics = extract_topics_lda(tfidf.get_feature_names(), neutral_mat)
pos_topics = extract_topics_lda(tfidf.get_feature_names(), pos_mat)

In [122]:
neg_topics, pos_topics, neutral_topics

([['bag',
   'flight',
   'late',
   'luggage',
   'late flight',
   'still',
   'connection',
   'plane',
   'make',
   'happen'],
  ['flight',
   'airline',
   'fly',
   'never',
   'gate',
   'time',
   'use',
   'agent',
   'ever',
   'us'],
  ['flight',
   'cancel',
   'cancel flight',
   'get',
   'flight cancel',
   'help',
   'tomorrow',
   'need',
   'go',
   'weather'],
  ['customer',
   'service',
   'customer service',
   'call',
   'phone',
   'get',
   "can't",
   'help',
   'answer',
   'line'],
  ['hour', 'hold', 'wait', 'minute', '2', 'flight', '3', 'min', 'sit', 'day']],
 [['thanks',
   'much',
   'yes',
   'take',
   'see',
   'time',
   'flight',
   'make',
   "can't",
   'get'],
  ['guy',
   'response',
   'thanks',
   'would',
   'know',
   'quick',
   'reply',
   'find',
   'go',
   'never'],
  ['flight',
   'crew',
   'follow',
   'please',
   'get',
   'home',
   'keep',
   'thanks',
   'gate',
   'agent'],
  ['airline',
   'look',
   'always',
   'first',
   '

<h1>7. Load NFCU tweets</h1>

- Normalize tweets in the same way as we normalized airline tweets
    - Denoise, tokenize and lemmatize 
    - Vectorize using the same vocabulary
- Apply subjectivity classifier and label tweets as subjective vs objective
- Apply polarity classifier and label subjective tweets as positive or negative
- Run LDA on objective tweets, subjective tweets, positive tweets and negative tweets

In [3]:
df = pandas.read_csv("../../data/dataset.csv")

In [4]:
df.head(10)

Unnamed: 0,IDNew,SocialNetwork,SenderUserId,FollowersCount,Message,CreatedTime,MessageType,NormalizedMessage
0,4158647000.0,TWITTER,1.150201e+18,99,Hey @NavyFederalHelp @NavyFederal are you guys...,2020-01-30 19:09:07.795,Twitter Mention,hey are you guys compatible with the security ...
1,4158647000.0,TWITTER,1.150201e+18,99,Hey @NavyFederalHelp @NavyFederal are you guys...,2020-01-30 19:09:07.795,Twitter Mention,hey are you guys compatible with the security ...
2,4131474000.0,TWITTER,8.765216e+17,407,@NavyFederal You’re welcome. Tons of locals wi...,2020-01-30 19:07:55.126,Twitter Reply,you are welcome tons of locals with nf in the ...
3,4131474000.0,TWITTER,8.765216e+17,407,@NavyFederal please put a location in Daytona ...,2020-01-30 19:01:02.803,Twitter Mention,please put a location in daytona thanks
4,2066183000.0,TWITTER,940409400.0,1079,@NavyFederal i’m bout SICK of yall.,2020-01-30 17:53:31.435,Twitter Mention,i am bout sick of yall
5,2110529000.0,TWITTER,1465241000.0,109125,Davide Moretti drove his way through the lane ...,2020-01-30 17:20:00.257,Twitter Mention,davide moretti drove his way through the lane ...
6,1985451000.0,TWITTER,419473700.0,3755,@TheNCUA how is it @NavyFederal can disable a...,2020-01-30 17:09:14.505,Twitter Mention,how is it can disable all tabs and disable my ...
7,4162829000.0,TWITTER,1.199326e+18,1,@defenseupdate @IDFSpokesperson @chicagoGDC @H...,2020-01-30 17:00:29.424,Twitter Mention,defense apac pga
8,1985451000.0,TWITTER,419473700.0,3755,Gee Navy Federal Is A Bank Of Be Best Aint it ...,2020-01-30 17:00:11.694,Twitter Mention,gee navy federal is a bank of be best aint it ...
9,1958612000.0,TWITTER,320732800.0,993,@mac_10k @NavyFederal We good here. Thanks but...,2020-01-30 16:37:24.573,Twitter Mention,k we good here thanks but no thanks
