# Code from the DataEspresso.com article on fitness habits during lockdown

<a href="https://dataespresso.com/">https://dataespresso.com/</a>

In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
# Use fully qualified option names: on newer pandas the short pattern
# 'max_columns' also matches 'styler.render.max_columns', which makes
# set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)  # None = never truncate columns
pd.set_option('display.max_rows', 500)


In [None]:
def get_tweets_from_hashtag(df,hashtag="london"):
    """Return a copy of the rows of ``df`` whose 'text' contains ``hashtag``.

    Matching is delegated to ``Series.str.contains`` with its default
    settings; rows whose 'text' is missing count as non-matching
    (``na=False``). The result is an independent copy, so modifying it does
    not touch ``df``.
    """
    has_hashtag = df['text'].str.contains(hashtag, na=False)
    matching_rows = df[has_hashtag]
    return matching_rows.copy()

def get_num_tweets_per_day(df,hashtag='london'):
    """Count, per day, the tweets containing ``hashtag`` and their share of all tweets.

    Args:
        df: DataFrame with at least a 'text' and a 'day' column.
        hashtag: keyword passed on to ``get_tweets_from_hashtag``.

    Returns:
        A DataFrame indexed by 'day' with the columns
        ``count`` (matching tweets that day), ``total`` (all tweets that day
        with a non-null 'text'), ``hashtag`` (the keyword itself) and
        ``percentage`` (``count / total * 100``). The two per-day counts are
        inner-joined, so only days with at least one matching tweet appear.
    """
    matching = get_tweets_from_hashtag(df=df,hashtag=hashtag)

    # Count only the 'text' column on each side. Building the result from two
    # freshly named single-column frames avoids assigning into a column subset
    # (SettingWithCopyWarning) and cannot collide with other columns of df.
    per_day_matches = matching.groupby(by=matching['day'])[["text"]].count()
    per_day_matches.columns = ["count"]
    per_day_total = df.groupby(by=df['day'])[["text"]].count()
    per_day_total.columns = ["total"]

    result = pd.concat([per_day_matches, per_day_total], axis=1, join='inner')
    result["hashtag"] = hashtag
    result["percentage"] = (result["count"]/result["total"])*100
    return result

def make_multi_property_plots(df,columns=None):
    """Plot one line per column of ``df`` against its date index and show the figure.

    Args:
        df: DataFrame whose index values can be parsed by ``pd.to_datetime``.
        columns: names of the columns to plot; ``None`` or an empty sequence
            plots every column of ``df``.

    Side effects:
        Sets the global matplotlib ``font.size`` rcParam to 20 and displays
        the figure with ``plt.show()``. ``df`` itself is not modified (a copy
        is plotted).
    """
    if columns is None or len(columns) == 0:
        to_plot_df = df.copy()
    else:
        to_plot_df = df[columns].copy()

    # Keep the original index values for use as tick labels, then convert the
    # index to datetime so the lines are drawn on a real date axis.
    x = to_plot_df.index
    to_plot_df.index = pd.to_datetime(to_plot_df.index)

    # defining a list of easy to read colours, source: https://gist.github.com/tsherwen/268a3f2a4b638de299dabe0375970041
    CB_color_cycle = ['#377eb8', '#ff7f00', '#4daf4a',
                      '#f781bf', '#a65628', '#984ea3',
                      '#999999', '#e41a1c', '#dede00']
    cmap = LinearSegmentedColormap.from_list('mycmap', CB_color_cycle)

    # The font size must be set before the figure's text artists are created;
    # updating rcParams after plotting only affects later figures.
    plt.rcParams.update({'font.size': 20})

    fig = plt.figure(figsize=(20,10))
    ax1 = fig.add_subplot(111)

    to_plot_df.plot(ax=ax1,cmap=cmap,x_compat=True)
    # Tick positions use the parsed datetime index (same values the lines were
    # plotted against); the labels keep the original index text.
    ax1.set_xticks(to_plot_df.index)
    ax1.set_xticklabels(x)
    plt.xticks(rotation=90)
    plt.xlabel('DATE')
    plt.ylabel('Number of Tweets')
    plt.title('Number of Tweets per day containing key word(s) x')

    n = 7 # Showing every n tick, source: https://stackoverflow.com/questions/20337664/cleanest-way-to-hide-every-nth-tick-label-in-matplotlib-colorbar
    for i, label in enumerate(ax1.xaxis.get_ticklabels()):
        if i % n != 0:
            label.set_visible(False)

    plt.show()
    
    
def get_multiple_hashtag_count(df,key_word_list = None):
    """Build one per-day table with a count and percentage column per keyword.

    Args:
        df: DataFrame of tweets, passed unchanged to ``get_num_tweets_per_day``.
        key_word_list: iterable of keywords; ``None`` or empty yields an
            empty DataFrame instead of raising inside ``pd.concat``.

    Returns:
        The per-keyword frames concatenated column-wise. For each keyword
        ``w`` there is a column ``w`` (count) and ``w_pct`` (percentage). The
        first column is ``total``, taken from the first keyword's frame only.
    """
    words = [] if key_word_list is None else list(key_word_list)
    if not words:
        return pd.DataFrame()

    frames = []
    for position, word in enumerate(words):
        pct_column = word+"_pct"
        per_day = get_num_tweets_per_day(df=df,hashtag=word).rename(
            columns={'count': word, 'percentage': pct_column}
        )

        wanted = [word, pct_column]
        if position == 0:
            # we only need this once, and we want total to be first
            wanted = ['total'] + wanted

        frames.append(per_day[wanted])

    return pd.concat(frames, axis=1)

def make_time_series_plot(df,hashtag="lockdown"):
    """Plot the number of tweets per day that contain ``hashtag`` and show the figure.

    Args:
        df: DataFrame of tweets, passed to ``get_num_tweets_per_day``.
        hashtag: keyword to count; also used in the legend and the title.

    Side effects:
        Sets the global matplotlib ``font.size`` rcParam to 20 and displays
        the figure with ``plt.show()``.
    """
    # Use the caller's keyword; previously the data was always fetched for
    # "lockdown" while legend and title showed whatever was passed in.
    tweets_per_day_df = get_num_tweets_per_day(df=df,hashtag=hashtag)

    x = tweets_per_day_df.index.values
    y = tweets_per_day_df["count"].values

    # The font size must be set before the figure's text artists are created;
    # updating rcParams after plotting only affects later figures.
    plt.rcParams.update({'font.size': 20})

    fig = plt.figure(figsize=(20,10))
    ax1 = fig.add_subplot(111)
    ax1.plot(x,y)
    ax1.set_xticklabels(x)
    plt.xticks(rotation=90)
    ax1.legend([hashtag])
    plt.xlabel('DATE')
    plt.ylabel('Number of Tweets')
    plt.title('Number of Tweets containing the word '+hashtag)

    n = 7  # Keeps every 7th label
    for i, label in enumerate(ax1.xaxis.get_ticklabels()):
        if i % n != 0:
            label.set_visible(False)

    plt.show()


def line_to_json(text_line):
    """Parse one raw tweet line into a dict.

    The line is split on single spaces and read positionally as
    ``<id> <YYYY-MM-DD> <time> <timezone> <username> <text ...>``.

    Args:
        text_line: one line of the tweet dump, with or without its trailing
            line terminator.

    Returns:
        A dict with the keys 'id', 'day', 'timestamp' ("<day> <time>"),
        'timezone', 'username' and 'text' (remaining tokens re-joined and
        lower-cased). Parsing is best-effort: if the second token is not a
        ``%Y-%m-%d`` date the dict is empty, and if the line has too few
        tokens only the fields read so far are present. The problem is
        printed rather than raised.
    """
    tweet_dict = {}
    tokens = []  # bound up front so the error report below can always print it
    try:
        # Drop the line terminator so it does not end up in the last field.
        tokens = text_line.rstrip("\r\n").split(" ")
        # Validate the date before storing anything; the parsed value itself
        # is not needed.
        datetime.strptime(tokens[1], "%Y-%m-%d")
        tweet_dict['id'] = tokens[0]
        tweet_dict['day'] = tokens[1]
        tweet_dict['timestamp'] = tokens[1]+" " + tokens[2]
        tweet_dict["timezone"] = tokens[3]
        tweet_dict["username"] = tokens[4]
        tweet_dict['text'] = " ".join(tokens[5:]).lower()
    except (AttributeError, IndexError, ValueError) as e:
        # AttributeError: not a string; IndexError: too few tokens;
        # ValueError: second token is not a valid date.
        print(e)
        print(text_line," --> ",tokens)
    return tweet_dict


In [None]:
tweet_path = "Documents/tweets_london/london_tweets.txt"
# Decode explicitly instead of relying on the platform default encoding.
# NOTE(review): assumes the dump is UTF-8 -- confirm against the file.
with open(tweet_path, encoding="utf-8") as f:
    tweet_list = list(f)

# One dict per raw line; lines that fail to parse yield an empty or partial
# dict and therefore a row with missing values.
new_tweet_list = [line_to_json(text) for text in tweet_list]

df = pd.DataFrame(new_tweet_list)
# Unparseable timestamps become NaT instead of raising.
df['timestamp'] = pd.to_datetime(df['timestamp'],errors="coerce")
display(df.head())

In [None]:
# Report how many raw lines were read from the tweet file.
tweet_count = len(tweet_list)
print("Number of tweets", tweet_count)

In [None]:
# Per-day count and percentage of tweets mentioning "lockdown", shown as a table.
tweets_per_day_df = get_num_tweets_per_day(df, hashtag="lockdown")
display(tweets_per_day_df)

In [None]:
# Line chart of the daily number of tweets mentioning "lockdown".
make_time_series_plot(df, hashtag="lockdown")

In [None]:
# Keywords to count per day across all tweets.
key_word_list = [
    "yoga",
    "lockdown",
    "run",
    "gym",
    "homegym",
    "walk",
    "swim",
    "onlinefitness",
    "onlineclasses",
    "workyoga",
]

merged_df = get_multiple_hashtag_count(df, key_word_list=key_word_list)

In [None]:
# Plot the per-day counts of the selected keywords. The table built by
# get_multiple_hashtag_count is named merged_df; new_merged_df was never
# defined and raised NameError.
make_multi_property_plots(
    df=merged_df,
    columns=[
        'lockdown',
        'yoga',
        'run',
        'gym',
        'walk',
        'swim',
        'homegym',
    ],
)