In [1]:
import requests 
import pathlib
import ftfy


def process_raw_tweets(tweets):
    
    min_tweet=0
    max_tweet=1553
    max_tweet=len(tweets)

    tweet_objects = []

    for tweet in tweets[min_tweet:max_tweet].split("------------------------------------------------"):
        try:
            tweet_object = {}
            for tweet_info in tweet.split("\n"):
                if len(tweet_info)>0:
                    tweet_obj_column = tweet_info[tweet_info.index("[")+len("["):tweet_info.index("]")]
                    tweet_obj_data = tweet_info.split("]", 1)[1]
                    tweet_object[tweet_obj_column] = tweet_obj_data
            tweet_objects.append(tweet_object)
        except:
            continue
            
    return tweet_objects


def load_tweets(max_tweets=None):
    
    target_url = 'https://media.githubusercontent.com/media/fdac20/EmojiContext/master/rflint-scraper/raw-tweets.txt'
    file_name = 'data/raw-tweets.txt'
    
    p = pathlib.Path(file_name)
    if p.is_file():  # or p.is_dir() to see if it is a directory
        try:
            with p.open() as file:
                print('loading locally.')

                # do awesome stuff
                data = file.read()
                
                if max_tweets is not None:
                    data = ftfy.fix_text(data)
                else:
                    data = ftfy.fix_text(data[:max_tweets])
                return data
        except OSError:
            print('Well darn.')
    else:
        print('loading online.')
        
        data = requests.get(target_url) # it's a file like object and works just like a file
        with open(file_name, "w") as file:
            file.write(data.text)
            
        if max_tweets is not None:
            return ftfy.fix_text(data.text)
        else:
            return ftfy.fix_text(data.text[:max_tweets])


In [2]:
tweets = load_tweets(max_tweets=1000) 
## TO load all tweets
# tweets = load_tweets() 

tweet_objects = process_raw_tweets(tweets)


loading locally.


In [3]:
len(tweet_objects)

903342

In [4]:
import pandas as pd

df = pd.DataFrame(tweet_objects[:100])
## TO load all tweets
#df = pd.DataFrame(tweet_objects[])


In [5]:

import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def senti_data(filename_pos, filename_neg):
    '''
    Converts a list of positive and negative words into a lists.
    Output: pos: List of all positive words
            neg: List of all negative words
    '''
    positive_vocab = ''
    negative_vocab = ''

    with open(filename_pos, 'r') as f:
        for line in f:
            positive_vocab += line.strip() + ' '

    with open(filename_neg, 'r') as f:
        for line in f:
            negative_vocab += line.strip() + ' '
    pos = []
    neg = []
    pos.append(positive_vocab)
    neg.append(negative_vocab)
    return pos, neg


#Creating Vectorizer and transforming data
cv = TfidfVectorizer(ngram_range=(1,3), stop_words='english', strip_accents='unicode')
tf = cv.fit_transform(df.RETWEET)

#Getting positive and negative words list
pos, neg = senti_data('data/pos.txt','data/neg.txt')

#Converting the words list to vectors
pos_vec = cv.transform(pos)
neg_vec = cv.transform(neg)

#Calculating similarity between each tweet and the positive vector
pos_score = np.asanyarray(cosine_similarity(tf.toarray(),pos_vec))
neg_score = np.asanyarray(cosine_similarity(tf.toarray(),neg_vec))

#Calculating net score, assuming that each tweet would have a positive and negative score associated with it. And the net difference would give me the net orientation of the tweet.
score = pos_score - neg_score

#Getting the top 20 tweets with a positive sentiment
top_tweets = np.argsort(score.ravel())[-20:][::-1]
for index, top_tweet in enumerate(top_tweets):
    print(df.RETWEET[top_tweet],score.ravel()[top_tweet] )


 One best BEST ad season 💪 https//tco/pnfvjFeqoa 0.1588608416183041
 cheap subs holding raffle losers $3 enter Winner get choice 2 FREE feet pics FREE custom voice message MUST FOLLOWING WIN Dm enter 😘 https//tco/FsQxpujEjq 0.15327365148805913
 Michael get 7 bucks buy Chloe tall nonfat Almond milk latte sugar free caramel drizzle 🤔 Reply ur idea imma draw favorite one 0.05817057062000425
 gamsung camping fun easy watch cant imagine happy yong must felt loves type active schedules outdoorsy stuff much im glad casted show 😭 0.055881610904468706
 Whew kid NEVER know peace 🤣 0.04861285246608169
 Whew kid NEVER know peace 🤣 0.04861285246608169
 Whew kid NEVER know peace 🤣 0.04861285246608169
 shoutout talented friend talented go give follow support ❤️ https//tco/zPOdRaH72c 0.0480176765756244
 [TRANS] 201014 Kibum Insta Live Someone asked room light Kibum said one best recent purchases looks like book moment flip open it's light open 360 degrees wireless charging 🤣 #bumkeyk https//tco/yjSkdE