# Pre-processing of the Data

## Imports

In [73]:
from data_handler import *
from helper import *
from tweetToVec import *
import numpy as np
import wordninja

## Load Data

In [6]:
# Folder holding the tweet text files, relative to the notebook's working directory.
path_to_tweet = 'data/twitter-datasets/'

# Full paths of the three input files, built from the shared folder prefix.
# NOTE(review): the positive file is 'train_pos.txt' while the negative one is
# 'train_neg_full.txt' — confirm that mixing the two variants is intended.
pos_path, neg_path, test_path = (
    path_to_tweet + file_name
    for file_name in ('train_pos.txt', 'train_neg_full.txt', 'test_data.txt')
)

## Transformation of contractions

In [7]:
# Lookup table: English contraction -> expanded form.
# transform_contractions matches whitespace-separated tokens against these keys
# exactly (case-sensitively), which is why the first-person forms are listed both
# as "i'..." and "I'...". Every expansion is written in lowercase.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    # First-person forms, lowercase spelling.
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    # Same first-person forms with a capital "I"; expansions stay lowercase.
    "I'd": "i would",
    "I'd've": "i would have",
    "I'll": "i will",
    "I'll've": "i will have",
    "I'm": "i am",
    "I've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    # Not an apostrophe contraction: informal spelling with the final "g" dropped.
    "workin": "working",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

In [66]:
def transform_contractions(path, mapping=None):
    """Expand contractions in every tweet of a text file.

    Each line of ``path`` is treated as one tweet. The tweet is split on single
    spaces and every token that is exactly equal to a key of the contraction
    table is replaced by its expansion; all other tokens and the original
    spacing are left untouched. Matching is case-sensitive.

    Args:
        path: Path of the text file to read, one tweet per line.
        mapping: Optional dict of contraction -> expansion. Defaults to the
            module-level ``contractions`` table.

    Returns:
        list[str]: The tweets in file order, each keeping its trailing
        newline if it had one.
    """
    if mapping is None:
        mapping = contractions

    with open(path) as f:
        all_lines = f.readlines()
    # Kept from the original cell: report how many tweets were read.
    print(len(all_lines))

    transformed = []
    for tweet in all_lines:
        # Set the line terminator aside so the last token (e.g. "don't\n")
        # can still match a key.
        if tweet.endswith('\n'):
            body, newline = tweet[:-1], '\n'
        else:
            body, newline = tweet, ''
        # Substitute token by token: a str.replace over the whole tweet would
        # also rewrite substrings of longer words ("workin" inside "working").
        tokens = [mapping.get(token, token) for token in body.split(' ')]
        transformed.append(' '.join(tokens) + newline)
    return transformed

## Tag Deletion

In [70]:
def tag_del(path):
    """Remove the ``<user>`` and ``<url>`` placeholder tags from every tweet.

    Only the tag text itself is deleted; the spaces around a tag and the
    trailing newline of each line are kept.

    Args:
        path: Path of the text file to read, one tweet per line.

    Returns:
        list[str]: The tweets in file order with every occurrence of both
        tags removed.
    """
    with open(path) as f:
        all_lines = f.readlines()
    # str has no .contains(); str.replace is already a no-op when the tag is
    # absent, so no membership test (and no occurrence cap) is needed.
    return [tweet.replace("<user>", '').replace("<url>", '') for tweet in all_lines]

## Hashtag transformation

In [96]:
def transform_hashtag(path, splitter=None):
    """Rewrite every hashtag of every tweet as ``<hashtag>`` plus its words.

    Each line of ``path`` is treated as one tweet and split on single spaces.
    A token starting with ``#`` is replaced by ``"<hashtag> "`` followed by the
    space-joined pieces returned by ``splitter`` for that token; all other
    tokens are kept as they are.

    Args:
        path: Path of the text file to read, one tweet per line.
        splitter: Optional callable taking the hashtag token and returning a
            list of word strings. Defaults to ``wordninja.split``.

    Returns:
        list[str]: The tweets in file order, each keeping its trailing
        newline if it had one.
    """
    if splitter is None:
        splitter = wordninja.split

    with open(path) as f:
        all_lines = f.readlines()

    transformed = []
    for tweet in all_lines:
        # Keep the line terminator out of the tokens: a hashtag at the end of
        # the line would otherwise hand "...\n" to the splitter and the newline
        # would be lost from the rebuilt tweet.
        if tweet.endswith('\n'):
            body, newline = tweet[:-1], '\n'
        else:
            body, newline = tweet, ''
        # Rewrite token by token instead of str.replace on the whole tweet, so a
        # short hashtag cannot corrupt a longer one sharing its prefix.
        tokens = [
            "<hashtag> " + ' '.join(splitter(token)) if token.startswith('#') else token
            for token in body.split(' ')
        ]
        transformed.append(' '.join(tokens) + newline)
    return transformed

In [97]:
# Run the hashtag transformation on the file at pos_path; the returned list is the cell output.
# NOTE(review): this displays every transformed tweet — consider slicing the result for a preview.
transform_hashtag(pos_path)

['<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me <hashtag> believe 15\n',
 "because your logic is so dumb , i won't even crop out your name or your photo . tsk . <url>\n",
 '" <user> just put casper in a box ! " looved the battle ! <hashtag> cr a kk bitch',
 "<user> <user> thanks sir > > don't trip lil mama ... just keep doin ya thang !\n",
 'visiting my brother tmr is the bestest birthday gift eveerrr ! ! !\n',
 '<user> yay ! ! <hashtag> life completed . tweet / facebook me to let me know please\n',
 '<user> <hashtag> 1 d next album title : feel for you / rollercoaster of life . song cocept : life , <hashtag> yo lo , becoming famous ? <3 14 <hashtag> follow me plz ! <3 x15\n',
 "workin hard or hardly workin rt <user> at hardee's with my future coworker <user>\n",
 "<user> i saw . i'll be replying in a bit .\n",
 'this is were i belong\n',
 '<user> anddd to cheer <hashtag> nationals 2013 ?\n',
 'we send an invitation 

In [85]:
# Scratch check: segment a sample hashtag with wordninja and keep the pieces for inspection.
rah = wordninja.split('#onetwothree')

In [86]:
# Sample prefix string used by the concatenation check in the following cell.
st = "hello"

In [91]:
# Plain concatenation adds no separator, so the prefix runs straight into the first segment.
st + ' '.join(rah)

'helloone two three'

In [90]:
# Segments joined with single spaces; the leading '#' is not part of the recorded result.
' '.join(rah)

'one two three'