In [1]:
import collections
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

In [2]:
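# One-time setup: uncomment and run these once to fetch the NLTK resources
# used by this notebook (POS tagger, WordNet lemmatizer, stop-word list).
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
#nltk.download('stopwords')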

In [None]:
# Location of the tweet CSV. The default is the original machine-specific
# path; set the TWEET_DF_PATH environment variable to point at the file on
# another machine instead of editing this cell.
tweet_df_path = os.environ.get(
    'TWEET_DF_PATH',
    '/Users/evanmcneal/Desktop/Projects/athena/data_igsn/data/tweet_df.csv',
)

In [3]:
# Load the tweet data from the CSV path configured in `tweet_df_path`.
tweets = pd.read_csv(tweet_df_path)

FileNotFoundError: [Errno 2] File tweet_df.csv does not exist: 'tweet_df.csv'

In [None]:
# Engagement per follower, expressed as a percentage.
# Rows with zero (or negative) follower counts are masked to NaN first so the
# division cannot produce inf, which would otherwise dominate every average
# and break histograms computed from these columns.
followers = tweets['Num_Followers'].where(tweets['Num_Followers'] > 0)
tweets['fav_rate'] = tweets['Num_Favorites'] / followers * 100
tweets['retweet_rate'] = tweets['Num_Retweets'] / followers * 100

In [None]:
# Preview the first rows rather than rendering the whole frame.
tweets.head()

In [None]:
def get_wordnet_pos(word):
    """
    Return the WordNet part-of-speech constant to pass to lemmatize().

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects adjective (J), noun (N), verb (V) or adverb (R).
    Any other tag falls back to noun.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as nouns.
    return wordnet.NOUN


def preprocess(tweet: str, additional_stopwords=()) -> list:
    """
    Turn one raw tweet into a list of cleaned, lemmatized tokens.

    Steps: lower-case, replace URLs (``scheme://...``) and every non-letter
    character with a space, split on whitespace, drop words shorter than
    three characters, lemmatize each word using the POS from
    ``get_wordnet_pos``, then remove stop words.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Non-string values (e.g. NaN for a missing cell)
        yield an empty list instead of raising.
    additional_stopwords : iterable of str, optional
        Extra words to remove on top of NLTK's English stop words. Any
        iterable (list, set, tuple) is accepted.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if not isinstance(tweet, str):
        # Missing text has no tokens; avoids AttributeError on .lower().
        return []

    # Remove URLs and non-alphabet characters, then tokenise on whitespace.
    tokens = re.sub(r'[^a-zA-Z]|(\w+:\/\/\S+)', ' ', tweet.lower()).split()

    # Remove short words (length < 3)
    tokens = [w for w in tokens if len(w) > 2]

    # Lemmatize text with the appropriate POS tag
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in tokens]

    # Filter out stop words. A set gives constant-time membership tests, and
    # union() accepts any iterable, so both lists and sets of extra words work
    # (the previous list concatenation raised TypeError for a set).
    stops = set(stopwords.words('english')).union(additional_stopwords)
    return [w for w in tokens if w not in stops]

In [None]:
# Extra words to filter out in addition to NLTK's English stop-word list;
# passed to preprocess() below.
# NOTE(review): 'amp' is presumably residue of the HTML entity "&amp;" — confirm.
additional_stopwords = ['amp', 'get', 'one', 'go', 'day', 'say', 'make', 'new']

In [None]:
# Work on a copy so `tweets` keeps the raw text, then replace every tweet
# with the token list produced by preprocess().
clean_tweets = tweets.copy()
clean_tweets['Tweet'] = clean_tweets['Tweet'].apply(
    lambda text: preprocess(text, additional_stopwords)
)

In [None]:
# Preview the first rows of the tokenised frame rather than rendering all of it.
clean_tweets.head()

In [None]:
# Document frequency of each token: set() ensures a word is counted at most
# once per tweet, however many times it occurs in that tweet.
# Uses the `collections` module already imported at the top of the notebook
# instead of a second, mid-notebook import.
tweet_list = list(clean_tweets['Tweet'])
counts = collections.Counter(
    token for tokens in tweet_list for token in set(tokens)
)

In [None]:
# The ten tokens with the highest counts, as (token, count) pairs.
counts.most_common(10)

In [None]:
# For each of the ten most common keywords, aggregate engagement over the
# tweets whose token list contains that keyword.
keyword_tuples = []

for keyword, _ in counts.most_common(10):
    # Boolean mask: True where the tweet's token list contains the keyword.
    has_keyword = clean_tweets['Tweet'].apply(lambda tokens: keyword in tokens)
    matching = clean_tweets[has_keyword]
    n_matching = len(matching)

    if n_matching == 0:
        # Nothing to aggregate (and nothing to divide by) for this keyword.
        continue

    keyword_tuples.append(
        (
            keyword,
            matching['Num_Favorites'].sum(),
            matching['Num_Retweets'].sum(),
            # mean() ignores NaN rates instead of letting a single NaN turn
            # the whole average into NaN.
            matching['retweet_rate'].mean(),
            matching['fav_rate'].mean(),
            n_matching,
        )
    )

keyword_df = pd.DataFrame(
    keyword_tuples,
    columns=[
        "keyword",
        "num_favorites",
        "num_retweets",
        "avg_retweet_rate",
        "avg_fav_rate",
        "count",
    ],
)

keyword_df

In [None]:
# Grouped bar chart: total favorites and retweets for each keyword.
plt.style.use('fivethirtyeight')

# data to plot
favs = keyword_df['num_favorites']
retweets = keyword_df['num_retweets']
# One bar group per row of keyword_df; deriving it from the frame keeps the
# bar positions in step with the data even when there are fewer than ten rows.
n_groups = len(keyword_df)

# create plot
fig, ax = plt.subplots(
    figsize=(8, 6),
    dpi=80,
    facecolor='w',
    edgecolor='k'
)

index = np.arange(n_groups)
bar_width = 0.35
opacity = 0.8

rects1 = ax.bar(
    index,
    favs,
    bar_width,
    alpha=opacity,
    label='Favorites'
)

rects2 = ax.bar(
    index + bar_width,
    retweets,
    bar_width,
    alpha=opacity,
    label='Retweets'
)

ax.set_ylabel('Total', size=17)
ax.set_xlabel('Keyword', size=17)
ax.set_title('Favorites and Retweets by Keyword', size=20)
# Bars are centred at index and index + bar_width, so the middle of each
# group is half a bar width to the right of index.
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(keyword_df['keyword'], rotation=-45)
ax.legend(prop={'size': 12})

fig.tight_layout()
plt.show()

In [None]:
# Grouped bar chart: average favorite and retweet rate for each keyword.
plt.style.use('fivethirtyeight')

# data to plot
fav_rate = keyword_df['avg_fav_rate']
retweet_rate = keyword_df['avg_retweet_rate']
# One bar group per row of keyword_df; deriving it from the frame keeps the
# bar positions in step with the data even when there are fewer than ten rows.
n_groups = len(keyword_df)

# create plot
fig, ax = plt.subplots(
    figsize=(8, 6),
    dpi=80,
    facecolor='w',
    edgecolor='k'
)

index = np.arange(n_groups)
bar_width = 0.35
opacity = 0.8

rects1 = ax.bar(
    index,
    fav_rate,
    bar_width,
    alpha=opacity,
    label='Favorites'
)

rects2 = ax.bar(
    index + bar_width,
    retweet_rate,
    bar_width,
    alpha=opacity,
    label='Retweets'
)

# The plotted values are averages of per-tweet percentages, not totals.
ax.set_ylabel('Average rate (% of followers)', size=17)
ax.set_xlabel('Keyword', size=17)
ax.set_title('Avg Favorite and Retweet Rate Per Follower', size=20)
# Bars are centred at index and index + bar_width, so the middle of each
# group is half a bar width to the right of index.
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(keyword_df['keyword'], rotation=-45)
ax.legend(prop={'size': 12})

fig.tight_layout()
plt.show()

In [None]:
# Show the tweets whose retweet_rate (retweets / followers * 100) exceeds 20.
tweets[tweets['retweet_rate'] > 20]

In [None]:
# Histogram of the raw favorite counts, using 100 bins.
tweets['Num_Favorites'].hist(bins=100)

In [None]:
# Add natural-log versions of the engagement columns.
# Values that are not strictly positive and finite have no finite logarithm
# (log(0) is -inf, log(inf) is inf), and a single non-finite value makes a
# histogram of the column fail, so those entries are masked to NaN before
# taking the log.
for column in ("Num_Favorites", "Num_Retweets", "fav_rate", "retweet_rate"):
    values = tweets[column]
    loggable = values.where((values > 0) & np.isfinite(values))
    tweets[f'log({column})'] = np.log(loggable)

In [None]:
# Histogram of the log-transformed favorite counts, using 100 bins.
tweets['log(Num_Favorites)'].hist(bins=100)

In [None]:
# Histogram of the log-transformed retweet counts, using 100 bins.
tweets['log(Num_Retweets)'].hist(bins=100)

In [None]:
# Histogram of the log-transformed favorite rate, using 100 bins.
tweets['log(fav_rate)'].hist(bins=100)

In [None]:
# Histogram of the log-transformed retweet rate, using 100 bins.
tweets['log(retweet_rate)'].hist(bins=100)

In [None]:
# Display the tweets ordered by the Created_At column. The sorted result is
# not assigned, so `tweets` itself keeps its current row order.
tweets.sort_values("Created_At")