In [30]:
import pandas as pd
import numpy as np
import pickle
import csv
from sklearn.linear_model import LogisticRegression

In [31]:
# Load the pickled vocabulary (looked up later with vocab.get(word, -1)).
# NOTE(review): pickle.load executes arbitrary code; only use a trusted vocab file.
with open('helpers/vocab.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)
vocab_size = len(vocab)

In [49]:
# Read the positive and the negative training tweets, one tweet per line.
with open('data/train_pos.txt', 'r') as pos_file:
    pos_tweets = pos_file.read().splitlines()
with open('data/train_neg.txt', 'r') as neg_file:
    neg_tweets = neg_file.read().splitlines()
# Word-embedding matrix, indexed by the integer ids stored in `vocab`.
embeddings = np.load('helpers/embeddings.npy')

In [87]:
# Positive tweets come first, negative tweets second; labels follow the same order
# (1 = positive, 0 = negative).
train_tweets = np.array(pos_tweets + neg_tweets)
y = np.array([1 for _ in pos_tweets] + [0 for _ in neg_tweets])

In [88]:
# create local validation set
def split_data(x, y, ratio, seed=1):
    """Randomly split paired arrays into a training part and a held-out part.

    Parameters
    ----------
    x, y : numpy arrays indexable by an integer index array; rows of ``x``
        and ``y`` at the same position belong together.
    ratio : float
        Fraction of the rows (rounded down) that goes into the training part.
    seed : int, default 1
        Seed of the private random generator used for shuffling. The same
        seed always yields the same split.

    Returns
    -------
    x_tr, x_te, y_tr, y_te
        Training and held-out slices of ``x`` and ``y``.
    """
    num_row = len(y)
    # A private, seeded generator makes the split reproducible without
    # touching NumPy's global random state.
    rng = np.random.RandomState(seed)
    indices = rng.permutation(num_row)
    index_split = int(np.floor(ratio * num_row))
    index_tr = indices[:index_split]
    index_te = indices[index_split:]
    # Index x and y with the same permutation so pairs stay aligned.
    return x[index_tr], x[index_te], y[index_tr], y[index_te]

In [89]:
# Keep 80% of the labelled tweets for training and hold out the rest for local validation.
x_tr, x_te, y_tr, y_te = split_data(train_tweets, y, 0.8)

In [90]:
# Put the training tweets next to their labels so they can be inspected and cleaned.
x_tr_pd = pd.DataFrame({'tweets': x_tr, 'class': y_tr})
# Show full tweet text: None disables column-width truncation
# (the old sentinel -1 is deprecated since pandas 1.0).
pd.set_option('display.max_colwidth', None)
x_tr_pd.head()

Unnamed: 0,tweets,class
0,<user> <user> <user> who asked miss mcilroy here ?,1
1,"11 "" latex "" just married ! "" balloons ( pack of 12 latex "" just married ! "" balloons . need an inexpensive , fun decora ... <url>",0
2,"oh , jaimie . . this is maya",1
3,<user> noomm ! xd aha not much ; so bored : l aha want to watch a film but don't know which one ahha <3 xxx,0
4,i can't explain how i feel about <user> losing points again at home #comeonandwinthefacup,0


In [91]:
# removing usertags, urls, numbers and special characters
# The last pattern replaces every character that is not an ASCII letter or '#'
# with a space, so apostrophes split contractions ("don't" -> "don t").
# NOTE(review): '<user>' / '<url>' must be handled before the catch-all pattern,
# otherwise the words "user" / "url" would survive; keep this order.
to_remove = ['<user>', '<url>', '[^a-zA-Z#]']
x_tr_pd.tweets = x_tr_pd.tweets.replace(to_remove, ' ', regex=True)
x_tr_pd.head()

Unnamed: 0,tweets,class
0,who asked miss mcilroy here,1
1,latex just married balloons pack of latex just married balloons need an inexpensive fun decora,0
2,oh jaimie this is maya,1
3,noomm xd aha not much so bored l aha want to watch a film but don t know which one ahha xxx,0
4,i can t explain how i feel about losing points again at home #comeonandwinthefacup,0


In [92]:
# remove all words of at most 3 characters
# (the quantifier {,3} matches 0 to 3 word characters, so 3-letter words are removed too)
x_tr_pd.tweets = x_tr_pd.tweets.replace([r'\b\w{,3}\b'], '', regex=True)

In [93]:
# remove stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')
# `stop` stays a list for later cells; a set built once here gives constant-time
# membership tests for the per-word lookups over all tweets.
stop_set = set(stop)
x_tr_pd.tweets = x_tr_pd.tweets.apply(
    lambda tweet: " ".join(word for word in tweet.split() if word not in stop_set)
)

In [94]:
# Inspect the tweets after short-word and stopword removal.
x_tr_pd.head(20)

Unnamed: 0,tweets,class
0,asked miss mcilroy,1
1,latex married balloons pack latex married balloons need inexpensive decora,0
2,jaimie maya,1
3,noomm much bored want watch film know ahha,0
4,explain feel losing points home #comeonandwinthefacup,0
5,hehehe like janoskians georgie,1
6,haha dont knock tried first time everything,1
7,psychoholic friday nightmare darkness satan #mymortuary,1
8,introduction persian revised edition hardcover comprehensive grammar modern classical collo,0
9,anxiety issues presentations,0


In [95]:
# Count every token of the cleaned training tweets and keep the ten most frequent.
freq = (
    pd.Series(' '.join(x_tr_pd.tweets).split())
    .value_counts()
    .head(10)
)
print(freq)
popular_words = freq.index.tolist()

love      8933
like      7413
frame     6369
good      6226
know      6185
follow    5826
please    4962
want      4673
back      4576
today     4297
dtype: int64


In [96]:
# Strip the most frequent tokens (collected in `popular_words`) from every tweet.
x_tr_pd.tweets = x_tr_pd.tweets.apply(
    lambda tweet: " ".join(token for token in tweet.split() if token not in popular_words)
)

In [97]:
# Inspect the tweets after the most frequent words have been removed.
x_tr_pd.head(20)

Unnamed: 0,tweets,class
0,asked miss mcilroy,1
1,latex married balloons pack latex married balloons need inexpensive decora,0
2,jaimie maya,1
3,noomm much bored watch film ahha,0
4,explain feel losing points home #comeonandwinthefacup,0
5,hehehe janoskians georgie,1
6,haha dont knock tried first time everything,1
7,psychoholic friday nightmare darkness satan #mymortuary,1
8,introduction persian revised edition hardcover comprehensive grammar modern classical collo,0
9,anxiety issues presentations,0


In [146]:
# Count how often each remaining token occurs across all training tweets;
# the resulting word -> count mapping is used to filter rare words.
freq = (
    pd.Series(' '.join(x_tr_pd.tweets).split())
    .value_counts()
)
dict_words = freq.to_dict()

In [153]:
# Keep only tokens that occur more than once in the training tweets.
x_tr_pd.tweets = x_tr_pd.tweets.apply(
    lambda tweet: " ".join(token for token in tweet.split() if dict_words[token] > 1)
)

In [154]:
# Inspect the tweets after words that occur only once have been dropped.
x_tr_pd.head(20)

Unnamed: 0,tweets,class
0,asked miss,1
1,latex married balloons pack latex married balloons need inexpensive decora,0
2,jaimie maya,1
3,much bored watch film ahha,0
4,explain feel losing points home,0
5,hehehe janoskians georgie,1
6,haha dont knock tried first time everything,1
7,friday nightmare darkness satan,1
8,introduction persian revised edition hardcover comprehensive grammar modern classical,0
9,anxiety issues presentations,0


In [156]:
from textblob import Word
# Replace every token by its lemma (e.g. "balloons" -> "balloon").
x_tr_pd.tweets = x_tr_pd.tweets.apply(
    lambda tweet: " ".join(Word(token).lemmatize() for token in tweet.split())
)
x_tr_pd.head(20)

Unnamed: 0,tweets,class
0,asked miss,1
1,latex married balloon pack latex married balloon need inexpensive decora,0
2,jaimie maya,1
3,much bored watch film ahha,0
4,explain feel losing point home,0
5,hehehe janoskians georgie,1
6,haha dont knock tried first time everything,1
7,friday nightmare darkness satan,1
8,introduction persian revised edition hardcover comprehensive grammar modern classical,0
9,anxiety issue presentation,0


In [157]:
# Start the feature matrix with one all-zero placeholder row as wide as an
# embedding vector; tweet vectors are stacked underneath it.
X = np.zeros((1, len(embeddings[0])))
Y = list()

In [158]:
# Represent every training tweet by the mean embedding of its in-vocabulary words.
# Rows are collected in a list and stacked once at the end: calling np.append
# inside the loop copies the whole matrix on every iteration (quadratic time).
emb_dim = len(embeddings[0])  # taken from the data instead of a hard-coded 20
tweet_vectors = []
for i, tweet in enumerate(x_tr_pd.tweets):
    if i % 10000 == 0:
        print("iteration %d" % i)
    # look up vocabulary ids; words missing from the vocabulary map to -1 and are skipped
    word_ids = [w_ind for w_ind in (vocab.get(word, -1) for word in tweet.split()) if w_ind > -1]
    if word_ids:
        tweet_vectors.append(np.sum(embeddings[word_ids], axis=0) / len(word_ids))
    else:
        # tweets without any known word get the zero vector
        tweet_vectors.append(np.zeros(emb_dim))
# X still begins with its placeholder row; the new rows are appended below it.
if tweet_vectors:
    X = np.vstack([X, np.array(tweet_vectors)])
    

iteration 0
iteration 10000
iteration 20000
iteration 30000
iteration 40000
iteration 50000
iteration 60000
iteration 70000
iteration 80000
iteration 90000
iteration 100000
iteration 110000
iteration 120000
iteration 130000
iteration 140000
iteration 150000


In [159]:
# Discard the all-zero placeholder row that X was initialised with.
X = X[1:]

In [160]:
# Sanity check: (number of training tweets, embedding dimension).
print(X.shape)

(160000, 20)


In [161]:
# Append a constant column of ones to the right of the averaged embeddings.
X_new = np.hstack([X, np.ones((X.shape[0], 1))])
# Training labels, aligned row by row with the tweets the features came from.
Y = x_tr_pd['class']

In [162]:
# Logistic regression on the averaged embeddings; C is the inverse
# regularisation strength, so C=0.1 regularises more strongly than the default.
lr = LogisticRegression(C=0.1)
lr.fit(X_new, Y)



LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [163]:
# Second model with weaker regularisation (C=1) for comparison with `lr`.
lr2 = LogisticRegression(C=1)
lr2.fit(X_new, Y)



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [166]:
# Run the held-out validation tweets through the same cleaning steps as the training tweets.
m = pd.DataFrame(x_te, columns=['tweets'])
# strip user tags, urls and every character that is not a letter or '#'
m.tweets = m.tweets.replace(to_remove, ' ', regex=True)
# drop tokens of at most three characters
m.tweets = m.tweets.replace([r'\b\w{,3}\b'], '', regex=True)
# drop stopwords, then the popular words collected from the training tweets
m.tweets = m.tweets.apply(
    lambda tweet: " ".join(token for token in tweet.split() if token not in stop)
)
m.tweets = m.tweets.apply(
    lambda tweet: " ".join(token for token in tweet.split() if token not in popular_words)
)
# NOTE(review): token counts are recomputed on this set, which replaces the
# `freq` / `dict_words` that were built from the training tweets.
freq = pd.Series(' '.join(m.tweets).split()).value_counts()
dict_words = freq.to_dict()
m.tweets = m.tweets.apply(
    lambda tweet: " ".join(token for token in tweet.split() if dict_words[token] > 1)
)
# lemmatize what is left
m.tweets = m.tweets.apply(
    lambda tweet: " ".join(Word(token).lemmatize() for token in tweet.split())
)
m.head()

Unnamed: 0,tweets
0,#biggestregret nothing never regret live thankful experience would past
1,high speed hdmi cable foot meter product quality electronics accessory offered great
2,mcfly last week came obsessed worry else like
3,robyn sleeping broken heart radio live lounge
4,quick become


In [167]:
# One all-zero placeholder row as wide as an embedding vector; validation rows are stacked below it.
X_test = np.zeros((1, len(embeddings[0])))

In [168]:
# remake validation tweets into averaged word embeddings
# Rows are collected in a list and stacked once at the end: calling np.append
# inside the loop copies the whole matrix on every iteration (quadratic time).
emb_dim = len(embeddings[0])  # taken from the data instead of a hard-coded 20
tweet_vectors = []
for i, tweet in enumerate(m.tweets):
    if i % 10000 == 0:
        print("iteration %d" % i)
    # look up vocabulary ids; words missing from the vocabulary map to -1 and are skipped
    word_ids = [w_ind for w_ind in (vocab.get(word, -1) for word in tweet.split()) if w_ind > -1]
    if word_ids:
        tweet_vectors.append(np.sum(embeddings[word_ids], axis=0) / len(word_ids))
    else:
        # tweets without any known word get the zero vector
        tweet_vectors.append(np.zeros(emb_dim))
# X_test still begins with its placeholder row; the new rows are appended below it.
if tweet_vectors:
    X_test = np.vstack([X_test, np.array(tweet_vectors)])

iteration 0
iteration 10000
iteration 20000
iteration 30000


In [169]:
# Discard the all-zero placeholder row that X_test was initialised with.
X_test = X_test[1:]

In [170]:
# Append the same constant column of ones that the training features received.
X_test_new = np.hstack([X_test, np.ones((X_test.shape[0], 1))])

In [171]:
# Predict the validation labels with both models (lr: C=0.1, lr2: C=1).
y_test = lr.predict(X_test_new)
y_test2 = lr2.predict(X_test_new)

In [172]:
def accuracy(y1, y2):
    """Return the fraction of positions at which ``y1`` and ``y2`` agree.

    Parameters
    ----------
    y1, y2 : array-like
        Label sequences of identical shape (lists or numpy arrays).

    Returns
    -------
    float
        Number of matching positions divided by ``len(y1)``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    ZeroDivisionError
        If the inputs are empty.
    """
    # Coerce to arrays first: comparing two plain lists with == yields a single
    # bool rather than an element-wise mask, which would give a wrong result.
    y1 = np.asarray(y1)
    y2 = np.asarray(y2)
    if y1.shape != y2.shape:
        raise ValueError("y1 and y2 must have the same shape")
    return int(np.count_nonzero(y1 == y2)) / len(y1)

In [173]:
# Validation accuracy of lr (C=0.1) and of lr2 (C=1) against the held-out labels.
print(accuracy(y_test, y_te))
print(accuracy(y_test2, y_te))

0.5592
0.5592


# Try on actual test data

In [174]:
# Read the unlabelled test tweets, one tweet per line.
with open('data/test_data.txt', 'r') as test_file:
    test_tweets = test_file.read().splitlines()

In [177]:
# Run the unlabelled test tweets through the same cleaning steps as the training tweets.
# NOTE(review): `m` is reused here and no longer refers to the validation frame.
m = pd.DataFrame(test_tweets, columns=['tweets'])
# strip user tags, urls and every character that is not a letter or '#'
m.tweets = m.tweets.replace(to_remove, ' ', regex=True)
# drop tokens of at most three characters
m.tweets = m.tweets.replace([r'\b\w{,3}\b'], '', regex=True)
# drop stopwords, then the popular words collected from the training tweets
m.tweets = m.tweets.apply(
    lambda tweet: " ".join(token for token in tweet.split() if token not in stop)
)
m.tweets = m.tweets.apply(
    lambda tweet: " ".join(token for token in tweet.split() if token not in popular_words)
)
# NOTE(review): token counts are recomputed on this set, which replaces the
# previously built `freq` / `dict_words`.
freq = pd.Series(' '.join(m.tweets).split()).value_counts()
dict_words = freq.to_dict()
m.tweets = m.tweets.apply(
    lambda tweet: " ".join(token for token in tweet.split() if dict_words[token] > 1)
)
# lemmatize what is left
m.tweets = m.tweets.apply(
    lambda tweet: " ".join(Word(token).lemmatize() for token in tweet.split())
)
m.head()

Unnamed: 0,tweets
0,scooter sport portable stay longer water
1,well work week come cheer battery
2,cant stay away thats baby
3,perfectly fine anymore lmao
4,whenever fall asleep watching always wake headache


In [178]:
# One all-zero placeholder row as wide as an embedding vector; test rows are stacked below it.
test_X = np.zeros((1, len(embeddings[0])))

In [179]:
# remake test tweets into averaged word embeddings
# Rows are collected in a list and stacked once at the end: calling np.append
# inside the loop copies the whole matrix on every iteration (quadratic time).
emb_dim = len(embeddings[0])  # taken from the data instead of a hard-coded 20
tweet_vectors = []
for tweet in m.tweets:
    # look up vocabulary ids; words missing from the vocabulary map to -1 and are skipped
    word_ids = [w_ind for w_ind in (vocab.get(word, -1) for word in tweet.split()) if w_ind > -1]
    if word_ids:
        tweet_vectors.append(np.sum(embeddings[word_ids], axis=0) / len(word_ids))
    else:
        # tweets without any known word get the zero vector
        tweet_vectors.append(np.zeros(emb_dim))
# test_X still begins with its placeholder row; the new rows are appended below it.
if tweet_vectors:
    test_X = np.vstack([test_X, np.array(tweet_vectors)])

In [180]:
# Discard the all-zero placeholder row that test_X was initialised with.
test_X = test_X[1:]

In [181]:
# Append the same constant column of ones that the training features received.
X_final = np.hstack([test_X, np.ones((test_X.shape[0], 1))])

In [182]:
# Predict labels for the unlabelled test tweets with the C=1 model.
ja = lr2.predict(X_final)

In [183]:
# Write the predictions as "Id,Prediction" rows, with ids starting at 1.
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate "\n" end up with an extra carriage return per row.
with open('test_result.csv', 'w', newline='') as csvfile:
    tempwriter = csv.writer(csvfile)
    tempwriter.writerow(["Id", "Prediction"])
    for count, row in enumerate(ja, start=1):
        # the model predicts 0 for the negative class; the output file uses -1 instead
        if row == 0:
            row = -1
        tempwriter.writerow([count, str(row)])