## Data Processing:

This code cleans each tweet by:
1. lowercasing all text,
2. removing punctuation characters,
3. transforming emoticons to EMOTIC_NEG or EMOTIC_POS,
4. correcting elongated words (for example, "goooolll" becomes "gol"),
5. removing URLs,
6. removing mentions (words preceded by @).


The data contains tweets labeled 0 = negative, 2 = neutral, and 4 = positive.
We focus only on the binary case: negative vs. positive.

Citation for data: Go, A., Bhayani, R. and Huang, L., 2009. Twitter sentiment classification using distant supervision. CS224N Project Report, Stanford, 1(2009), p.12.



In [6]:
### Import packages
import itertools
import re
from nltk.tokenize import WordPunctTokenizer
from string import punctuation
import pandas as pd
import numpy as np 
import os
import nltk
from nltk.corpus import stopwords
import enchant


######METHODS#######


def Process(tweet):
    '''Clean one raw tweet and return it as lowercase, space-separated words.

    The steps run in this order:
      1. strip URLs,
      2. lowercase and split on whitespace,
      3. squeeze elongated words ("goooolll" -> "gol"), using the en_US
         dictionary to decide whether doubled letters should be kept,
      4. drop possessive "'s" and the apostrophe in common contractions,
      5. strip @mentions and #hashtags,
      6. strip all remaining punctuation,
      7. drop tokens that consist only of digits.

    Stop words are NOT removed.

    tweet: the raw tweet text (str).
    Returns the cleaned text; an empty string if nothing is left.
    '''
    d = enchant.Dict("en_US")

    # Remove URL links. \S+ consumes the whole link, including characters
    # such as '-', '_', '?', '=' and '%' that a narrower character class
    # would leave behind as stray fragments.
    tweet = re.sub(r'https?://\S+', '', tweet)

    # Repair elongated words. Every run of a repeated character is first cut
    # to at most two characters; if that is not a dictionary word, every run
    # is cut to a single character instead.
    # NOTE(review): tokens still carry attached punctuation here (e.g.
    # "good!!"), which may make the dictionary reject otherwise valid
    # words - confirm whether punctuation should be stripped first.
    words = []
    for word in tweet.lower().split():
        runs = [''.join(group) for _, group in itertools.groupby(word)]
        doubled = ''.join(run[:2] for run in runs)
        if d.check(doubled):
            words.append(doubled)
        else:
            words.append(''.join(run[:1] for run in runs))
    tweet = ' '.join(words)

    # Clean using regular expressions: drop possessives, then close up
    # common contractions by removing their apostrophe.
    contraction_rules = (
        (r"\'s", ""),
        (r"\'ve", "ve"),
        (r"n\'t", "nt"),
        (r"\'re", "re"),
        (r"\'d", "d"),
        (r"\'ll", "ll"),
        (r"\'nt", "nt"),
    )
    for pattern, replacement in contraction_rules:
        tweet = re.sub(pattern, replacement, tweet)
    tweet = re.sub(r"[,!()?]", "", tweet)
    tweet = re.sub(r"\s{2,}", " ", tweet)

    # Remove @mentions and #hashtags. Both must run BEFORE the generic
    # punctuation strip below, because that strip deletes the '@' and '#'
    # markers these patterns rely on. '_' is included because it is a valid
    # character in handles and tags.
    tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet)
    tweet = re.sub(r'#[A-Za-z0-9_]+', '', tweet)

    # Remove other punctuation in a single pass.
    tweet = tweet.translate(str.maketrans('', '', punctuation))

    # Remove numbers; split()/join also normalises the whitespace left behind
    # by the removals above, so no extra strip() is needed.
    return ' '.join(word for word in tweet.split() if not word.isnumeric())



def Process_data(data,label):
    '''Apply Process() to every tweet of a data set.

    data:  DataFrame with a "Tweet" column and, when label is true, a
           "label" column as well.
    label: boolean flag; when true the labels are carried over to the result.

    Returns a new DataFrame with a fresh 0..n-1 index. Its columns are
    ["label", "tweets"] when label is true and ["tweets"] otherwise. Rows
    follow the row order of data, whatever its index looks like, and an
    empty data set yields an empty DataFrame with the same columns.
    '''
    # Iterate over the column values rather than looking rows up by integer
    # label, so the function does not depend on data having a 0..n-1 index.
    sentences = [Process(text) for text in data.Tweet]

    # Build the result once, after all tweets have been processed.
    if label:
        return pd.DataFrame({"label": list(data.label), "tweets": sentences})
    return pd.DataFrame({"tweets": sentences})

In [7]:
# NOTE: all file paths below are relative to the current working directory.

# Load the Sentiment140 data and keep only the label and the tweet text.
# The file has no header row (header=None), so row 0 is a real observation.
X = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding="ISO-8859-1", header=None)
X.columns = ['label', 'A', 'B', 'C', 'D', 'Tweet']
X = X.drop(["A", "B", "C", "D"], axis=1)
n, m = X.shape

# Subsample 50,000 observations: the first S/2 rows and the last S/2 rows.
# The first slice starts at row 0; starting at row 1 would drop one
# observation and leave 49,999 rows.
# NOTE(review): this presumably relies on the file being ordered by label
# so that both classes are represented - confirm.
S = 50000
d1 = X.iloc[:S // 2]
d2 = X.iloc[-(S // 2):]

X_sub = pd.concat([d1, d2])
X_sub = X_sub[X_sub.label != 2]  # keep only the binary negative/positive labels
# Renumber the rows 0..len-1; the original row numbers are kept in a new
# "index" column.
X_sub = X_sub.reset_index()

# Load Trump Data

Trump = pd.read_excel("2017_01_28TrumpTweets.xlsx")  # 30,385 (presumably the row count)
X_sub.to_csv("Tweets_50k.csv", header=True)

## Process and Save the Dataset

In [None]:
# Clean both data sets; labels are carried over for X_sub only.
X_proc = Process_data(data=X_sub, label=True)
T_proc = Process_data(data=Trump, label=False)

# Save the cleaned tweets (the row index is written as the first column).
# The cells that follow spot-check that URLs and mentions were removed.
T_proc.to_csv("Trump_Processed.csv", header=True)
X_proc.to_csv("Tweets_50kProc.csv", header=True)

In [None]:
# Spot check: raw text of the tweet whose row label is 49880.
X_sub.loc[49880, "Tweet"]

In [None]:
# Spot check: cleaned text of the tweet whose row label is 49880.
X_proc.loc[49880, "tweets"]