In [21]:
import pandas as pd
import emoji 
import codecs

# Load only the columns used by this notebook; nrows limits the read to the
# first 100,000 data rows of the CSV.
df = pd.read_csv(
    "proc_tweets.csv",
    low_memory=False,
    nrows=100000,
    usecols=["TIME POSTED", "TWITTER'S TIME POSTED", "RETWEET"],
)

# Resolve the emoji lookup table in a way that survives emoji-package upgrades:
#   * 2.x exposes the table as EMOJI_DATA (UNICODE_EMOJI was removed),
#   * 1.x keeps a flat table in UNICODE_EMOJI_ENGLISH (UNICODE_EMOJI is keyed
#     by language code there, so a membership test on it never matches an emoji),
#   * older releases only have the flat UNICODE_EMOJI.
# NOTE(review): confirm against the emoji version pinned for this project.
emoji_table = (
    getattr(emoji, "EMOJI_DATA", None)
    or getattr(emoji, "UNICODE_EMOJI_ENGLISH", None)
    or emoji.UNICODE_EMOJI
)

# Keep the whitespace-delimited tokens that are exactly one emoji. Emoji glued
# to other characters (no surrounding whitespace) are not picked up. Missing
# tweets (NaN) become the text "nan" and therefore yield an empty list.
df["emoji"] = df["RETWEET"].apply(
    lambda text: [token for token in str(text).split() if token in emoji_table]
)



In [None]:
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def senti_data(filename_pos, filename_neg):
    '''
    Load the positive and negative word lists from two text files.

    Every line of a file is stripped of surrounding whitespace and followed by
    a single space; the pieces are concatenated into one string per file.

    Input:  filename_pos: path of the file holding the positive words
            filename_neg: path of the file holding the negative words
    Output: pos: one-element list holding the joined positive-word string
            neg: one-element list holding the joined negative-word string
    '''
    def _joined_lines(path):
        # Each stripped line keeps a trailing space, so any non-empty file
        # produces a string that ends in ' ' (blank lines contribute ' ').
        with open(path, 'r') as handle:
            return ''.join(entry.strip() + ' ' for entry in handle)

    # Positive file is read first, then the negative one.
    vocab_blobs = [[_joined_lines(path)] for path in (filename_pos, filename_neg)]
    return vocab_blobs[0], vocab_blobs[1]


#Creating Vectorizer and transforming data
cv = TfidfVectorizer(ngram_range=(1,3), stop_words='english', strip_accents='unicode')
# str() turns missing tweets (NaN) into the text "nan", so the vectorizer only
# ever receives strings.
tf = cv.fit_transform(df['RETWEET'].map(str))

#Getting positive and negative words list
pos, neg = senti_data('data/pos.txt','data/neg.txt')

#Converting the words list to vectors
pos_vec = cv.transform(pos)
neg_vec = cv.transform(neg)

#Calculating similarity between each tweet and the positive / negative vector.
# The TF-IDF matrix is passed as-is (sparse): cosine_similarity accepts sparse
# input, whereas tf.toarray() would materialise a dense (n_tweets x n_ngrams)
# array -- twice -- and can exhaust memory for 1- to 3-gram vocabularies.
# Each result has one row per tweet and a single column.
pos_score = cosine_similarity(tf, pos_vec)
neg_score = cosine_similarity(tf, neg_vec)

#Calculating net score, assuming that each tweet would have a positive and negative score associated with it. And the net difference would give me the net orientation of the tweet.
score = pos_score - neg_score

# #Getting the top 20 tweets with a positive sentiment
# top_tweets = np.argsort(score.ravel())[-20:][::-1]
# for index, top_tweet in enumerate(top_tweets):
#     print(df.RETWEET[top_tweet],score.ravel()[top_tweet] )

# Flatten the single-column result so a plain 1-D column is stored.
df["score"] = score.ravel()




In [None]:
# Preview the first rows (head() defaults to 5), including the derived
# "emoji" and "score" columns added in the cells above.
df.head()

In [None]:
# Track the first 39 distinct emoji in order of first appearance. Tweets with
# an empty emoji list explode to NaN; drop those explicitly instead of assuming
# NaN happens to be the first unique value (which only holds when the first
# tweet has no emoji).
tracked_emoticons = df["emoji"].explode().dropna().unique()[:39]
# Set for O(1) membership tests inside the loop below.
tracked_lookup = set(tracked_emoticons)

# One float column per tracked emoji; 0.0 (not int 0) so that writing float
# scores into it does not change the column dtype.
for emoticon in tracked_emoticons:
    df[emoticon] = 0.0

# For every tweet, copy its sentiment score into the column of each tracked
# emoji it contains. The loop variable is deliberately NOT called `emoji`:
# that name is the imported emoji module and shadowing it breaks any re-run of
# the emoji-extraction cell.
for index, found_emoji, tweet_score in zip(df.index, df["emoji"], df["score"]):
    for emoticon in found_emoji:
        if emoticon in tracked_lookup:
            df.at[index, emoticon] = tweet_score

        
        

In [None]:
# Inspect the column labels; the per-emoji columns created in the previous
# cell show up here alongside the original CSV columns.
df.columns

In [None]:
# df[["TWITTER'S TIME POSTED",
#        '😭', '❤️', '😅', '💪', '😁', '🇺🇦', '👏', '🙄', '🥵', '🔥', '🌶', '😂', '👍', '🙏',
#        '😘', '😍', '🤣', '😉', '🔴', '🤔', '💕', '☺️', '©', '🎉', '😊', '🤩', '💫', '😻',
#        '💙', '🍑', '🇵🇹', '💖', '♥️', '🙏🏽', '💯', '😆', '👀', '😜', '😏']].sort_values(by="TWITTER'S TIME POSTED").cumsum().plot(figsize=(50,20), fontproperties="prop")


In [None]:
import matplotlib.pyplot as plt

# Parse the timestamp column and move it into the index. The guard makes the
# cell safe to re-run: after the first run the column lives in the index, so
# looking it up as a column again would raise KeyError.
if "TWITTER'S TIME POSTED" in df.columns:
    df["TWITTER'S TIME POSTED"] = pd.to_datetime(df["TWITTER'S TIME POSTED"], format='%Y-%m-%d %H:%M:%S')
    # Rebind instead of inplace=True; the resulting frame is the same.
    df = df.set_index(["TWITTER'S TIME POSTED"])



In [None]:
# Display the frame, which the previous cell indexed by the parsed
# "TWITTER'S TIME POSTED" timestamps.
df

In [None]:


# Emoji columns to chart. Each must be one of the tracked emoji columns created
# earlier in the notebook, otherwise the column selection raises KeyError.
plotted_emoji = ['😭', '❤️', '😅', '😁', '😂']

# cumsum() accumulates in row order, so sort by the index first: without it the
# running total follows CSV order rather than time order. mergesort is stable,
# so rows sharing a timestamp keep their original relative order.
ax = (
    df[plotted_emoji]
    .sort_index(kind="mergesort")
    .cumsum()
    .plot(
        figsize=(13,5),
        ylabel="Sentiment",
        title="Cumulative sentiment score per emoji",
    )
)

plt.show()
