##  tweets to text

#### Eryk Wdowiak and Eric Adsetts

Clean the tweets, split them into training and test sets, and save them to text files, which we'll use for subword splitting.

In [1]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords

In [2]:
##  read the judged tweets and give the three columns short names
data = pd.read_csv(
    'dataset/judge-1377884607_tweet_product_company_v2-clean.csv'
)
data.columns = ['tweet', 'direction', 'emotion']
# data.shape  # (9093, 3)

##  keep only the rows that have both a tweet and an emotion label
data = data.dropna(axis='index', subset=['tweet', 'emotion'])
# data.shape  # (9092, 3)

##  clean emotions
##  collapse the four raw labels into three classes
##  ("I can't tell" is folded into 'neutral')
emo_dict = {'Negative emotion':'negative', 
            'Positive emotion':'positive',
            'No emotion toward brand or product':'neutral', 
            "I can't tell":'neutral'}
data['emotion'] = data['emotion'].replace(emo_dict)
del emo_dict

##  integer code for each class
##  map() is used instead of replace():  swapping strings for ints with
##  replace() relies on pandas silently downcasting the object column,
##  which is deprecated (FutureWarning since pandas 2.2).  map() builds
##  the integer column directly.  A label outside the three classes
##  becomes NaN here instead of being passed through as text.
emo_int_dict = {'negative':0, 'neutral':1, 'positive':2 } 
data['emot_int'] = data['emotion'].map(emo_int_dict)
del emo_int_dict

##  derive company and product from the 'direction' column
##  missing directions (NaN) are first turned into the string 'nan'
##  so that they can be looked up like any other label
data['direction'] = ['{}'.format(value) for value in data['direction']]

##  one table:  direction -> (company, product)
direction_lookup = {
    'iPhone':                          ('Apple',   'device'),
    'iPad or iPhone App':              ('Apple',   'software'),
    'iPad':                            ('Apple',   'device'),
    'Google':                          ('Google',  'company'),
    'nan':                             ('unknown', 'unknown'),
    'Android':                         ('Google',  'device'),
    'Apple':                           ('Apple',   'company'),
    'Android App':                     ('Google',  'software'),
    'Other Google product or service': ('Google',  'other'),
    'Other Apple product or service':  ('Apple',   'other'),
}

##  company is the first entry of each pair, product the second;
##  directions not listed in the table are left as they are
data['company'] = data['direction'].replace(
    {direction: pair[0] for direction, pair in direction_lookup.items()})
data['product'] = data['direction'].replace(
    {direction: pair[1] for direction, pair in direction_lookup.items()})
del direction_lookup

##  let's take a look
# data.head(10)

In [3]:
##  tokens to drop from every tweet:  each single punctuation character
##  (kept as a list so the optional extensions below can be switched on)
stopwords_list = list(string.punctuation)
# stopwords_list += stopwords.words('english')
# stopwords_list += ['0','1','2','3','4','5','6','7','8','9']

##  how to process tweets
def process_tweets(tweet):
    """Return one tweet as a single line of lower-case, space-separated tokens.

    Newlines are replaced by spaces and the text is lower-cased, then it is
    split with ``word_tokenize``.  Tokens found in the module-level
    ``stopwords_list`` are dropped and the rest are joined with single spaces.
    Finally a space directly before an apostrophe is removed and every
    slash is replaced by a space.

    Parameters
    ----------
    tweet : str
        Text of one tweet.

    Returns
    -------
    str
        The processed tweet.
    """
    one_line = tweet.replace('\n', ' ').lower()
    kept = (word for word in word_tokenize(one_line)
            if word not in stopwords_list)
    joined = ' '.join(kept)
    # re-attach pieces that start with an apostrophe to the preceding token
    joined = joined.replace(" '", "'")
    return joined.replace('/', ' ')

##  replace every raw tweet with its processed form
data['tweet'] = [process_tweets(raw_tweet) for raw_tweet in data['tweet']]

In [4]:
##  hold out 20% of the rows for testing (fixed seed, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, ['tweet', 'direction', 'company', 'product']],
    data.loc[:, 'emotion'],
    random_state=42,
    test_size=0.20,
)

In [5]:
##  put the features and the emotion label back into one table per split
##  (join lines the rows up on their index)
data_train = X_train.join(y_train)
data_test = X_test.join(y_test)

##  full tables, without the index column
data_train.to_csv('dataset/judge-1377884607_zz_data-train.csv', index=False)
data_test.to_csv('dataset/judge-1377884607_zz_data-test.csv', index=False)

##  tweets only, one per line, no header and no index
##  NOTE(review): with sep=' ' the CSV writer wraps every tweet that
##  contains a space in double quotes -- confirm that the subword-splitting
##  step expects (or strips) those quotes
data_train['tweet'].to_csv(
    'dataset/judge-1377884607_zz_tweets-train_v0.txt',
    sep=' ', index=False, header=False)
data_test['tweet'].to_csv(
    'dataset/judge-1377884607_zz_tweets-test_v0.txt',
    sep=' ', index=False, header=False)