Import libaries

In [141]:
import pandas as pd
import numpy as np
import gensim, logging, nltk, re
from sklearn.linear_model import LogisticRegression

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Read data and drop NAs

In [190]:
tweets_train = pd.read_csv("data/train.csv", sep=",", header=0, index_col=None, na_values='Not Available').dropna()
tweets_test = pd.read_csv("data/test.csv", sep=",", header=0, index_col=None, na_values='Not Available').dropna()
tweets_test.rename(columns = {'Category':'Tweet'}, inplace = True)

Tokenisation function and helpers

In [215]:
stopwords = ["a", "about", "after", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been",
            "before", "being", "between", "both", "by", "could", "did", "do", "does", "doing", "during", "each",
            "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here",
            "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've",
            "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "of",
            "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "own", "shan't", "she", "she'd",
            "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs",
            "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're",
            "they've", "this", "those", "through", "to", "until", "up", "very", "was", "wasn't", "we", "we'd",
            "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's",
            "which", "while", "who", "who's", "whom", "with", "would", "you", "you'd", "you'll", "you're", "you've",
            "your", "yours", "yourself", "yourselves", "above", "again", "against", "aren't", "below", "but", "can't",
            "cannot", "couldn't", "didn't", "doesn't", "don't", "down", "few", "hadn't", "hasn't", "haven't", "if",
            "isn't", "mustn't", "not", "off", "out", "over", "shouldn't", "same", "too", "under", "why",
            "why's", "won't", "wouldn't"]
#stopwords += ["no", "nor"]

RE_HTTP = re.compile("http(s)?://[/\.a-z0-9]+")

def prepare_text(text):
    replacers = {'&nbsp;': ' ', '&lt;': '<', '&gt;': '>', '&amp;': '&', '&pound;': u'£', 
                 '&euro;': u'€', '&copy;': u'©', '&reg;': u'®'}
    for r in replacers.keys():
        text = text.replace(r, replacers[r])
    return text

def tokenize_tweet(text):
    # replace characters
    text = prepare_text(text.lower())
    #tokenize
    tknz = nltk.tokenize.TweetTokenizer()
    tokens = tknz.tokenize(text)
    # drop urls
    tokens = [token for token in tokens if not RE_HTTP.search(token)]
    # drop stopwords
    #tokens = [token for token in tokens if token not in stopwords]
    return tokens

Tokenize

In [216]:
train_tokenized = [tokenize_tweet(x) for x in tweets_train['Tweet']]
test_tokenized = [tokenize_tweet(x) for x in tweets_test['Tweet']]

In [217]:
[print(x) for x in train_tokenized]

['ios', '9', 'app', 'transport', 'security', '.', 'mm', 'need', 'to', 'check', 'if', 'my', '3rd', 'party', 'network', 'pod', 'supports', 'it']
['mar', 'if', 'you', 'have', 'an', 'ios', 'device', ',', 'you', 'should', 'download', 'our', 'app', 'too', ':']
['@jimmie_vanagon', 'my', 'phone', 'does', 'not', 'run', 'on', 'latest', 'ios', 'which', 'may', 'account', 'for', 'problem', 'the', 'other', 'day', '..', 'time', 'it', 'was', 'replaced']
['not', 'sure', 'how', 'to', 'start', 'your', 'publication', 'on', 'ios', '?', "we'll", 'be', 'live', 'helping', 'with', 'ask', 'me', 'anything', 'sessions', 'today', 'and', 'friday']
['two', 'dollar', 'tuesday', 'is', 'here', 'with', 'forklift', '2', ',', 'quickkey', 'for', 'ios', 'and', 'suite', 'for', 'pages', 'for', 'just', '$', '1.99', 'today', ':']
['if', "you're", 'not', 'already', 'signed', 'up', 'to', 'test', 'my', 'ios', 'game', ',', 'nows', 'your', 'chance', '!']
['youtube', 'gaming', 'officially', 'launches', 'on', 'web', ',', 'android', ',

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

Create model

In [218]:
model = gensim.models.Word2Vec(train_tokenized, iter=20, min_count=3)
model.init_sims(replace=True)

2017-01-15 22:53:31,829 : INFO : collecting all words and their counts
2017-01-15 22:53:31,831 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-15 22:53:31,900 : INFO : collected 14267 word types from a corpus of 118612 raw words and 5422 sentences
2017-01-15 22:53:31,906 : INFO : Loading a fresh vocabulary
2017-01-15 22:53:31,947 : INFO : min_count=3 retains 3432 unique words (24% of original 14267, drops 10835)
2017-01-15 22:53:31,951 : INFO : min_count=3 leaves 106027 word corpus (89% of original 118612, drops 12585)
2017-01-15 22:53:31,984 : INFO : deleting the raw counts dictionary of 14267 items
2017-01-15 22:53:31,987 : INFO : sample=0.001 downsamples 57 most-common words
2017-01-15 22:53:31,989 : INFO : downsampling leaves estimated 78780 word corpus (74.3% of prior 106027)
2017-01-15 22:53:31,991 : INFO : estimated required memory for 3432 words and 100 dimensions: 4461600 bytes
2017-01-15 22:53:32,020 : INFO : resetting layer weights
2017-01-

In [203]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        return np.zeros(wv.layer_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, text) for text in text_list])

In [204]:
X_train_word_average = word_averaging_list(model.wv,train_tokenized)
X_test_word_average = word_averaging_list(model.wv,test_tokenized)

In [205]:
logreg = LogisticRegression(n_jobs=1, C=1e5)

logreg = logreg.fit(X_train_word_average, tweets_train['Category'])
predicted = logreg.predict(X_test_word_average)

In [206]:
[print(x) for x in predicted]

positive
positive
positive
neutral
neutral
positive
positive
neutral
positive
neutral
positive
neutral
neutral
positive
positive
neutral
neutral
negative
neutral
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
neutral
negative
neutral
neutral
positive
positive
positive
positive
neutral
positive
positive
neutral
positive
neutral
positive
positive
positive
positive
positive
neutral
positive
positive
neutral
positive
neutral
positive
positive
neutral
neutral
positive
neutral
positive
positive
neutral
positive
neutral
positive
negative
positive
positive
positive
positive
positive
neutral
positive
positive
negative
positive
positive
positive
positive
positive
positive
positive
neutral
positive
positive
neutral
positive
neutral
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
neutral
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
po

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [219]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
tr, te = train_test_split(tweets_train, test_size=0.1, random_state=42)
trt = [tokenize_tweet(x) for x in tr['Tweet']]
tet = [tokenize_tweet(x) for x in te['Tweet']]
xtra = word_averaging_list(model.wv,trt)
xtea = word_averaging_list(model.wv,tet)
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(xtra, tr['Category'])
predicted = logreg.predict(xtea)
print('Trafność klasyfikacji %s' % accuracy_score(te['Category'], predicted))
cm = confusion_matrix(te['Category'], predicted)
print('Macierz pomyłek\n %s' % cm)

from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
classifier = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=23)
classifier.fit(xtra, tr['Category'])
predicted2 = classifier.predict(xtea)
list_of_labels = list(set(te['Category']))
print("=================== Results ===================")
print("            Positive    Neutral     Negative   ")
print("F1       ", f1_score(te['Category'], predicted, average=None, pos_label=None, labels=list_of_labels))
print("Precision", precision_score(te['Category'], predicted, average=None, pos_label=None, labels=list_of_labels))
print("Recall   ", recall_score(te['Category'], predicted, average=None, pos_label=None, labels=list_of_labels))

Trafność klasyfikacji 0.591160220994
Macierz pomyłek
 [[ 17  44  23]
 [ 10 109  80]
 [  6  59 195]]
            Positive    Neutral     Negative   
F1        [ 0.53041363  0.69892473  0.29059829]
Precision [ 0.51415094  0.65436242  0.51515152]
Recall    [ 0.54773869  0.75        0.20238095]
