In [6]:
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF
import math
from collections import Counter
import re

def find_emoji(word,df_emoji):
    options = []
    for i, name in enumerate(df_emoji['short_name']):
        if word in name:
            options.append((name,df_emoji['unichar'][i]))
            print(name,df_emoji['unichar'][i])
    return options

def get_emojis(tweet_lst,df_emoji):
    emoji_idx = []
    emoji_char =[]
    for tweet in tweet_lst:
        for i,uni in enumerate(df_emoji['unichar']):
            if uni in tweet:
                 emoji_idx.append(i)
                 emoji_char.append(uni)
    return emoji_idx, emoji_char

def get_emojis_by_tweet(tweet_lst,df_emoji):
    by_tweet = []

    for tweet in tweet_lst:
        emoji_char =[]
        for i,uni in enumerate(df_emoji['unichar']):
            if uni in tweet:
                 emoji_char.append(uni)
        by_tweet.append(emoji_char)
    return by_tweet

#this seems to get some emojis that i dont but also missed some that i do. It also get duplicates per tweet that i dont
# def get_emojis_2(tweet_lst):
#     emojis = []
#     for tweet in tweet_lst:
#         emoji = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
#         emojis.append(emoji.findall(tweet))
#     return emojis

def print_emoji(tweet,emoji_char):
    for uni in emoji_char:
        if uni in tweet:
            print(uni),


if __name__ == '__main__':
    tweets = np.array(list(pickle.load(open('./data/yay_moji.pkl','rb'))))
    # type is list

    emojis = pd.read_pickle('./data/df_emojis.pkl')
    # type is DataFrame



    # ------------- tfidf
    stopwords = set(list(ENGLISH_STOP_WORDS) + ['rt', 'follow', 'dm', 'https', 'ur', 'll' ,'amp', 'subscribe', 'don', 've', 'retweet', 'im', 'http'])

    tfidf = TfidfVectorizer(max_features=10000, max_df=0.05, min_df=0.001, stop_words = stopwords, ngram_range = (1,2))

    #lemmetizing need to consider cleaning the tweets myself

    tfidf_tweets = tfidf.fit_transform(tweets)
    bag = np.array(tfidf.get_feature_names())

    # -------------- NMF
    k = 10
     #number of groups
    nmf2 = NMF(n_components = k)
    nmf2.fit(tfidf_tweets)
    W = nmf2.transform(tfidf_tweets) #len(yay_moji,k)
    H = nmf2.components_ #k,len(yay_moji)


    # --------------- Printing Top 10
    tweet_lst = []
    top = 10
    tweet_in_group_thresh = .001 #score thresh if we consider that tweet as part of that group
    for group in range(k):
        #idx of the top ten words for each group
        i_words = np.argsort(H[group])[::-1][:top]
        words = bag[i_words]

        # idx of the top ten tweets for each group
        i_emojis = np.argsort(W[:,group])[::-1][:top]
        # most common 10 emojis for each group

        print('-'*10)
        print('Group:',group)
        counted_tweets = np.argwhere(W[:,group] > tweet_in_group_thresh)
        print(counted_tweets.shape[0], 'tweets')
        for word in words:
            print('-->',word)
        for i_tweet in i_emojis:
            print_emoji(tweets[i_tweet], emojis['unichar'])
            tweet_lst.append(tweets[i_tweet])
        ind, emo_lst = get_emojis(tweets[i_emojis],emojis)
        # find percentage of emoji per group
        most_emoji, how_many = Counter(emo_lst).most_common(1)[0]
        score = float(how_many)/top
        # print score #score is not perfect - similar emojis and repeat in the same tweet
        print('\n')

    # --------------- printing most common emojis
    most_common = 50
    b,all_emojis = get_emojis(tweets,emojis)
    count = Counter(all_emojis).most_common(most_common)
    unicode_top = []
    for emo, i in count:
        print(emo,i)
#        for j, char in enumerate(emojis['unichar']):
#            if char == emo:
#                unicode_top.append(emojis['unified'][j])


# test stuff
    jan = get_emojis_by_tweet(tweets[0:100],emojis)
    for tweet in jan:
        for emo in tweet:
            print(emo),
        print('')
    # name_of = find_emoji('heart',emojis)


    '''
    to do's
    --> how big are the groups? do a most common
    --> get a better score system
    --> allow for tweets with multiple emojis
    --> sub set for tweets with a specific emoji
    --> commonly combined emojis
    --> naive bayes
        prediction accuracy between emojis for how similar they are
    whats the purpose:
    --> to help use emojis as labels for tweets
    '''

----------
Group: 0
56 tweets
--> btsongma
--> gma
--> bts_twt
--> army
--> ready
--> bts
--> bts army
--> gma ready
--> ready btsongma
--> btsongma bts
💜
💜
💜
💜
💜
💜
💜
💜
🌋
🔥
😂


----------
Group: 1
2 tweets
--> adnloveconnection
--> adnloveconnection lynieg88
--> axle1809
--> carl05290 diosarmendoza
--> carl05290
--> prettymai_0105
--> diosarmendoza axle1809
--> lynieg88 darwaine88
--> lynieg88
--> chie_chie26
😘
😘
😍
🔥
😂
😅
💕
😂
😒
😭
💜


----------
Group: 2
38 tweets
--> derby
--> leeds
--> lampard
--> derby players
--> gesture
--> players
--> spying gesture
--> spying
--> sportbible
--> stop
👀
😂
😍
😂
🐑
😍
💪
😂
😂
🙌
🚨


----------
Group: 3
40 tweets
--> love
--> yes love
--> breakingdawnon405
--> breakingdawnon405 love
--> yes
--> ok
--> screenshotted try
--> try thank
--> screenshotted
--> ok screenshotted
🍓
👌
💜
😭
😍
😍
👇
❤
✨
❤
❤
😍
😘
🎼


----------
Group: 4
67 tweets
--> say
--> just say
--> shit
--> child
--> bjznweh05k
--> jzizzzle_
--> say shit
--> jzizzzle_ just
--> shit bjznweh05k
--> bad
😊

In [2]:
import pickle
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF
import math
from collections import Counter
import re
from model_emoji import get_emojis_by_tweet, print_emoji, get_emojis
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split



def sub_set(tweets,emojis,emo_char):
    sub = []
    by_tweets = get_emojis_by_tweet(tweets,emojis)
    for tweet,emos in zip(tweets,by_tweets):
        if emo_char in tweet:
            sub.append((tweet,emos))
    return sub

def wordize(tweet):
    stops = string.digits + '_@'
    words = tweet.split()
    new_words = []
    for word in words:
        yes = True
        for char in stops:
            if char in word:
                yes = False
        if yes:
            new_words.append(word)

    return ' '.join(new_words)


def NB_acc(emo1, emo2, tweets, emoji):
    a = sub_set(tweets,emojis, emo1)
    b = sub_set(tweets,emojis, emo2)
    labels = np.hstack((np.zeros(len(a)),np.ones(len(b))))
    # me and

    tweets_a, emos_a = zip(*a)
    tweets_b, emos_b = zip(*b)

    tweets = list(tweets_a) + list(tweets_b)

# ------------- tfidf
    stopwords = set(list(ENGLISH_STOP_WORDS) + ['rt', 'follow', 'dm', 'https', 'ur', 'll' ,'amp', 'subscribe', 'don', 've', 'retweet', 'im', 'http'])

    tfidf = TfidfVectorizer(max_features=10000, max_df=0.3, min_df = .001, stop_words = stopwords, ngram_range = (1, 2))

    #lemmetizing need to consider cleaning the tweets myself

    tfidf_tweets = tfidf.fit_transform(tweets)
    bag = np.array(tfidf.get_feature_names())


# ------------------ predict this emoji or another

    # ----- train test split
    np.random.seed(2)

    X_train, X_test, y_train, y_test = train_test_split(tfidf_tweets.todense(),labels)

    nb = GaussianNB()
    mod = nb.fit(X_train,y_train)
    y_pred = mod.predict(X_test)

    acc = np.mean(y_test == y_pred)
    return acc


if __name__ == '__main__':
    tweets1 = np.array(list(pickle.load(open('./data/yay_moji.pkl','rb'))))
    # type is list
    emojis = pd.read_pickle('./data/df_emojis.pkl')
    # type is DataFrame

    #removing words with digits and ['_@']
    tweets = [wordize(tweet) for tweet in tweets1]

    sparkle = u'\u2728'
    laugh_cry = u'\U0001f602'
    heart_face = u'\U0001f60d'
    praise = u'\U0001f64c'
    earth = u'\U0001f30d'

    a,b=get_emojis(tweets, emojis)

    n = 100 #must be even

    # do a similarity matrix instead
    emoji_choice = Counter(b).most_common(100)
    rnd = np.random.randint(0,100,n)
    half_rnd = zip(rnd[:n/2],rnd[n/2:])
    connections = []
    b = emoji_choice
    # for a, b in half_rnd:
    for b in range(10):
        a = 0
        emo1 = emoji_choice[a][0]
        num1 = emoji_choice[a][1]
        emo2 = emoji_choice[b][0]
        num2 = emoji_choice[b][1]
        acc = NB_acc(emo1, emo2, tweets, emojis)
        connections.append([emo1, num1, emo2, num2, acc])
        print(emo1, num1, emo2, num2, acc)
    emo1, num1, emo2, num2, acc = zip(*connections)

    sorted_best = np.argsort(acc)[::-1]
    for i in sorted_best:
        print(emo1[i],num1[i],emo2[i],num2[i],acc[i])

ModuleNotFoundError: No module named 'model_emoji'