In [20]:
import numpy as np
import csv
import pandas as pd
import pickle
import string
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import re
import spacy

In [6]:
# Load spaCy's large English pipeline; the `en_core_web_lg` model package must be
# installed in the kernel's environment for this call to succeed.
# NOTE(review): `nlp` is not referenced in the cells visible here — confirm it is
# used further down before paying the model-loading cost.
nlp = spacy.load('en_core_web_lg')

In [13]:
def load_tweets(path):
    """Read a one-tweet-per-line UTF-8 text file into a de-duplicated DataFrame.

    The file is read line by line rather than through ``pd.read_csv`` with a
    newline separator: recent pandas releases reject that separator and have
    removed the ``error_bad_lines`` / ``warn_bad_lines`` keywords. Reading raw
    lines also keeps tweets such as "na" or "null" as text instead of letting
    the CSV parser convert them to NaN.

    Parameters
    ----------
    path : str
        Location of the text file.

    Returns
    -------
    pandas.DataFrame
        A single ``text`` column with every distinct non-blank line, in file
        order (first occurrence kept, original row index preserved).
    """
    with open(path, encoding='utf-8') as handle:
        # Blank / whitespace-only lines carry no tweet and are skipped.
        tweets = [line.rstrip('\n') for line in handle if line.strip()]
    return pd.DataFrame({'text': tweets}).drop_duplicates()


# Negative tweets are labelled 0, positive tweets 1.
raw_data_neg = load_tweets('./twitter-datasets/train_neg.txt')
raw_data_neg['label'] = 0

raw_data_pos = load_tweets('./twitter-datasets/train_pos.txt')
raw_data_pos['label'] = 1

raw_data_train = pd.concat([raw_data_neg, raw_data_pos], ignore_index=True)

raw_data_test = load_tweets('./twitter-datasets/test_data.txt')

# Each test line is "<id>,<tweet>". Split on the first comma only, so commas inside
# the tweet are kept; a line without any comma yields an empty tweet text.
test_parts = raw_data_test['text'].str.partition(',')
raw_data_test['id'] = test_parts[0]
raw_data_test['text'] = test_parts[2]

raw_data_train.head(20)

Unnamed: 0,text,label
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,0
1,glad i dot have taks tomorrow ! ! #thankful #s...,0
2,1-3 vs celtics in the regular season = were fu...,0
3,<user> i could actually kill that girl i'm so ...,0
4,<user> <user> <user> i find that very hard to ...,0


# Preprocessing

In [14]:
# Use the dictionaries from http://luululu.com/tweet/typo-corpus-r1.txt
# and http://people.eng.unimelb.edu.au/tbaldwin/etc/emnlp2012-lexnorm.tgz
# to normalize abbreviations, misspellings, etc.

# LULU-CORPUS
# (1) INSERT (IN): a character is added to the original word.
# (2) REMOVE (RM): a character is removed from the original word.
# (3) REPLACE1 (R1): the order of the characters differs from the original word (the number of differences is one).
# (4) REPLACE2 (R2): a character is different from the original word

# Maps a misspelled / abbreviated word to its replacement; filled by corpusReplace.
final_corpus = {}

def corpusReplace(corpus):
    """Add the misspelling -> correction pairs found in ``corpus`` to ``final_corpus``.

    Parameters
    ----------
    corpus : iterable of bytes or str
        Lines whose first two whitespace-separated fields are the misspelled
        word and its replacement. Any further fields on the line are ignored.
        ``bytes`` lines are decoded as UTF-8.

    Notes
    -----
    Lines with fewer than two fields (blank lines, stray single tokens) are
    skipped instead of raising ``IndexError``. A later entry for the same
    misspelling overwrites an earlier one.
    """
    for line in corpus:
        if isinstance(line, bytes):
            line = line.decode('utf8')
        fields = line.split()
        if len(fields) < 2:
            # Nothing to map: blank or malformed line.
            continue
        final_corpus[fields[0]] = fields[1]

# Load both normalization dictionaries into `final_corpus`. The files are opened in
# binary mode and handed line by line to corpusReplace; the `with` blocks guarantee
# each handle is closed even if parsing a line raises.
with open('corpus/lulu-corpus.txt', 'rb') as corpus_lulu:
    corpusReplace(corpus_lulu)

with open('corpus/emnlp-corpus.txt', 'rb') as corpus_emnlp:
    corpusReplace(corpus_emnlp)

def applyCorpus(tweet, corpus=None):
    """Replace every word of ``tweet`` that has an entry in the normalization dictionary.

    Parameters
    ----------
    tweet : str
        Text whose words are separated by single spaces.
    corpus : dict, optional
        Mapping misspelling -> replacement. Defaults to the module-level
        ``final_corpus``.

    Returns
    -------
    str
        The tweet with known words replaced. Words are split on a single
        space and re-joined with a single space, so the original spacing is
        kept and no leading space is prepended to the result.
    """
    lookup = final_corpus if corpus is None else corpus
    # dict.get falls back to the word itself when there is no replacement.
    return ' '.join(lookup.get(word, word) for word in tweet.split(' '))

# Run the dictionary-based word replacement over every training tweet.
# NOTE(review): this overwrites the `text` column in place, so re-running the cell
# applies the dictionary again to already-replaced text. Only raw_data_train is
# processed in the cells visible here — confirm raw_data_test gets the same treatment.
raw_data_train['text'] = raw_data_train.text.apply(applyCorpus)   
         
# One or more ASCII punctuation characters, with '@' deliberately left out so it
# survives cleaning. re.escape keeps every character literal inside the class
# (unescaped, the backslash in string.punctuation was swallowed as an escape).
_PUNCTUATION_RUN = re.compile('[' + re.escape(punctuation.replace('@', '')) + ']+')

def cleanTweet(tweet):
    """Strip placeholders, links, hashtags and punctuation from a tweet.

    Steps, in order:
      1. remove the literal ``<url>`` and ``<user>`` placeholders;
      2. remove ``$``-prefixed words;
      3. lowercase the text;
      4. remove http/https links (each link ends at the next whitespace);
      5. remove ``#``-prefixed words;
      6. replace each run of punctuation (except ``@``) with one space;
      7. collapse runs of two or more whitespace characters into one space
         and drop leading spaces;
      8. drop characters above U+FFFF.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lowercased tweet.
    """
    tweet = re.sub('<url>', '', tweet)
    tweet = re.sub('<user>', '', tweet)
    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = tweet.lower()
    # Match only up to the next whitespace: the previous greedy pattern removed
    # everything up to the last '/' in the tweet and missed links without a path.
    tweet = re.sub(r'https?://\S+', '', tweet)
    tweet = re.sub(r'#\w*', '', tweet)  # hashtags
    tweet = _PUNCTUATION_RUN.sub(' ', tweet)  # punctuation
    tweet = re.sub(r'\s\s+', ' ', tweet)
    tweet = tweet.lstrip(' ')
    tweet = ''.join(c for c in tweet if c <= '\uFFFF')
    return tweet

# Clean every training tweet; the `text` column is overwritten with the cleaned text.
raw_data_train['text'] = raw_data_train.text.apply(cleanTweet)

In [16]:
# Preview the first 20 training rows after preprocessing.
raw_data_train.head(20)

Unnamed: 0,text,label
0,vinco tresorpack 6 difficulty 10 of 10 object ...,0
1,glad i dot have takes tomorrow,0
2,1 3 as celtics in the regular season were fuck...,0
3,i could actually kill that girl i m so sorry,0
4,i find that very hard to believe is afraid,0
5,wish i could be out all night tonight,0
6,i got kicked out the wgm,0
7,it yes she is you tell it my lips are closed okay,0
8,why is she so perfect,0
9,hi harry did you have good time in us i didnt ...,0


# Feature extraction

In [21]:
def ngrams(tokens):
    """Return all bigrams followed by all trigrams of ``tokens``.

    Each n-gram is the space-joined run of consecutive tokens. Unigrams are
    not included. Sequences shorter than the window size contribute nothing.
    """
    def _windows(size):
        # Every contiguous slice of `size` tokens, joined into one string.
        last_start = len(tokens) - size
        return [' '.join(tokens[start:start + size]) for start in range(last_start + 1)]

    return _windows(2) + _windows(3)

# example of use
# data = data.copy()
# data['grams'] = data.tokens.apply(ngrams)


# tokenize helper function

# Translation table that deletes every character of string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Stopword sets already loaded from NLTK, keyed by language name.
_STOPWORDS_BY_LANGUAGE = {}

def _stopword_set(language='english'):
    """Return NLTK's stopword list for ``language`` as a frozenset, loading it only once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]

def tokenize(tweets, stop_words=None):
    """Split one tweet into lowercase tokens without punctuation or stopwords.

    Parameters
    ----------
    tweets : str
        The text of a single tweet.
    stop_words : collection of str, optional
        Lowercase words to drop. Defaults to NLTK's English stopword list,
        which is loaded once and reused; the previous implementation
        re-evaluated ``stopwords.words('english')`` for every word.

    Returns
    -------
    list of str
        The whitespace-separated, lowercased words of the tweet, with all
        ``string.punctuation`` characters removed and stopwords filtered out.
    """
    if stop_words is None:
        stop_words = _stopword_set()
    # Delete punctuation characters, then lowercase and split on whitespace.
    words = tweets.translate(_PUNCTUATION_TABLE).lower().split()
    return [word for word in words if word not in stop_words]

# Rebind the name to a copy of the frame, then store each tweet's token list
# (the result of tokenize on `text`) in a new `token` column.
raw_data_train = raw_data_train.copy()
raw_data_train['token'] = raw_data_train.text.apply(tokenize)

In [22]:
# Show only the first rows instead of dumping the whole frame into the notebook.
raw_data_train.head(10)

Unnamed: 0,text,label,token
0,vinco tresorpack 6 difficulty 10 of 10 object ...,0,"[vinco, tresorpack, 6, difficulty, 10, 10, obj..."
1,glad i dot have takes tomorrow,0,"[glad, dot, takes, tomorrow]"
2,1 3 as celtics in the regular season were fuck...,0,"[1, 3, celtics, regular, season, fucked, play,..."
3,i could actually kill that girl i m so sorry,0,"[could, actually, kill, girl, sorry]"
4,i find that very hard to believe is afraid,0,"[find, hard, believe, afraid]"
5,wish i could be out all night tonight,0,"[wish, could, night, tonight]"
6,i got kicked out the wgm,0,"[got, kicked, wgm]"
7,it yes she is you tell it my lips are closed okay,0,"[yes, tell, lips, closed, okay]"
8,why is she so perfect,0,[perfect]
9,hi harry did you have good time in us i didnt ...,0,"[hi, harry, good, time, us, didnt, get, 2, see..."


In [None]:
# https://github.com/Wronskia/Sentiment-Analysis-on-Twitter-data/blob/master/Final/features.py
# def ngramFeatureAugmentation(tweets_tfidf):
#     new_tweets = []
#     for t in tweets:
        


# def add_ngram(sequences, token_indice, ngram_range=2):
#     """
#     Augment the input list of list (sequences) by appending n-grams values.
#     Example: adding bi-gram
#     >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
#     >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
#     >>> add_ngram(sequences, token_indice, ngram_range=2)
#     [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
#     Example: adding tri-gram
#     >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
#     >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
#     >>> add_ngram(sequences, token_indice, ngram_range=3)
#     [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]]
#     """
#     new_sequences = []
#     for input_list in sequences:
#         new_list = input_list[:]
#         for i in range(len(new_list)-ngram_range+1):
#             for ngram_value in range(2, ngram_range+1):
#                 ngram = tuple(new_list[i:i+ngram_value])
#                 if ngram in token_indice:
#                     new_list.append(token_indice[ngram])
#         new_sequences.append(new_list)

#     return new_sequences

# Train