In [464]:
import pandas as pd
import numpy as np
import emoji
import string
from nltk.tokenize import TweetTokenizer
import csv
from collections import defaultdict
from scipy.sparse import coo_matrix

<pre> 

The following methods are only used to clean and save the original data and should not be run
    
    <b>The cleaned data can be found in emoji_datasets/all_data.csv</b>

</pre>

In [465]:
# Clean data

def load_test_data(file_path):
    with open(file_path) as fp:
        result = []
        translator = str.maketrans('', '', string.punctuation)
        line = fp.readline()
        while line:
            line = line.strip()
            line = line.split(',',1)
            if len(line) == 2:
                clean_row1 = line[1].translate(translator)
                clean_row2 = clean_row1.replace(chr(8220),'')
                clean_row3 = clean_row2.replace(chr(8221),'')
                line = [line[0], clean_row3]
                value = np.array([line[0], line[1]])
                result.append(value)
            line = fp.readline()
        return pd.DataFrame(np.array(result, dtype='object'))
    
def extract_emojis(example):
    return (' '.join(c for c in example if c in emoji.UNICODE_EMOJI)).split()

def prune_dataset_emojis(data):
    result = []
    translator = str.maketrans('', '', string.punctuation)
    for i,row in enumerate(data):
        try:
            if extract_emojis(row[1]) != []:
                clean_row1 = row[1].translate(translator)
                clean_row2 = clean_row1.replace(chr(8220),'')
                clean_row3 = clean_row2.replace(chr(8221),'')
                new_row = np.array([row[0], clean_row3])
                result.append(new_row)
        except TypeError:
            pass
    return pd.DataFrame(np.array(result))

In [466]:
# Clean data and write to CSV

train_data_raw = pd.read_csv('emoji_datasets/data_train.csv', header=None, encoding='utf-8')

test_data_raw = load_test_data('emoji_datasets/data_test.txt')

train_data_clean = prune_dataset_emojis(train_data_raw.values)
test_data_clean = prune_dataset_emojis(test_data_raw.values)

all_data_clean_np = np.vstack((train_data_clean.values, test_data_clean.values))
np.random.shuffle(all_data_clean_np)
all_data_clean = pd.DataFrame(all_data_clean_np)

all_data_clean.to_csv('emoji_datasets/all_data.csv', header=None, index=False, encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)

<pre> END : clean data </pre>

In [467]:
tweets_and_labels_RAW = pd.read_csv('emoji_datasets/all_data.csv', header=None, encoding='utf-8')

In [504]:
def separate_emojis(example):
    result = []
    ptr = 0
    for i,c in enumerate(example):
        if c in emoji.UNICODE_EMOJI:
            split = example[ptr:i]
            if split != '':
                result.append(split)
                result.append(c)
            else:
                result.append(c)
            ptr = i+1
    return result

def preprocess(data):
    labels = list(data[:,0])
    tweets = list(data[:,1])
    result = []
    tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    for i,twt in enumerate(tweets):
        clean_tokens = []
        tokens = tweet_tokenizer.tokenize(twt)
        for j,tk in enumerate(tokens):
            tk = tk.lower()
            sep = separate_emojis(tk)
            if sep != []:
                clean_tokens = clean_tokens + sep
            else:
                clean_tokens.append(tk)
        result.append((labels[i], clean_tokens))
    return result

def find_all_emojis(data):
    emoji_dict = defaultdict(int)
    for twt in data:
        for word in twt[1]:
            if word in emoji.UNICODE_EMOJI:
                emoji_dict[word] += 1
    return emoji_dict

def term_context_matrix(targets, data):
    tc_matrix = defaultdict(lambda: defaultdict(lambda: 0))
    for twt in data:
        for w1 in targets:
            if w1 in twt[1]:
                for w2 in twt[1]:
                    tc_matrix[w1][w2]+=1
    return tc_matrix

def vocab_map(dd):
    vocab = {}
    vocab_id = 0
    for k,v in dd.items():
        for k2,v2 in v.items():
            if k2 not in vocab.keys():
                vocab[k2] = vocab_id
                vocab_id += 1
    return vocab

def term_to_int_dd(dd):
    num_rows = len(dd.keys())
    data = defaultdict(int)
    vocab_dict = vocab_map(dd)
    for i,r in enumerate(dd.keys()):
        for j,c in enumerate(dd[r].keys()):
            data[i,vocab_dict[c]] = dd[r][c]
            
    return data

def term_to_sparse(dd):
    dd_int = term_to_int_dd(dd)
    vs = [v for (i,j), v in dd_int.items()]
    ii = [i for (i,j), v in dd_int.items()]
    jj = [j for (i,j), v in dd_int.items()]
    matrix = coo_matrix((vs, (ii, jj)))
    return matrix



In [469]:
tweets_and_labels = preprocess(tweets_and_labels_RAW.values)
emoji_counts = find_all_emojis(tweets_and_labels)
emoji_targets = list(emoji_counts.keys())

In [470]:
term_matrix = term_context_matrix(emoji_targets, tweets_and_labels)

In [505]:
emoji_coo_matrix = term_to_sparse(term_matrix)

In [506]:
emoji_coo_matrix.toarray().shape

(606, 24726)

In [508]:
emoji_coo_matrix.toarray()

array([[  1,   5,  11, ...,   0,   0,   0],
       [  1,   7,  11, ...,   0,   0,   0],
       [  0, 174, 239, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]])

In [509]:
# Debugging methods

def count_distinct_vocab(dd):
    count = 0
    seen = defaultdict(int)
    for k,v in dd.items():
        for k2,v2 in v.items():
            if k2 not in seen.keys():
                count +=1
                seen[k2] = 1
    return count

def get_max_val(dd):
    mx = 0
    rest = 0
    for k,v in dd.items():
        for k2,v2 in v.items():
            if v2 > mx:
                mx = v2
            else:
                rest += v2
    return mx, rest

def get_max_keys(dd):
    mx = 0
    for k,v in dd.items():
        for k2,v2 in v.items():
            if len(k2) > mx:
                mx = len(k2)
    return mx

count_distinct_vocab(term_matrix)
#get_max_val(term_matrix)
#get_max_keys(term_matrix)

24726