# ig_analysis

Read in the datasets created by the scraper.

In [None]:
import os

import pandas as pd
from dotenv import load_dotenv
load_dotenv()

folder_name = os.getenv('FOLDER_NAME')
if folder_name is None:
    # Fail early with a clear message instead of silently looking in '../data/None'.
    raise EnvironmentError('FOLDER_NAME is not set; define it in the environment or in the .env file')

dir_name = r'../data/{}'.format(folder_name)

# The scraper seems to have some issues, and as a result some rows have more/fewer items than they should.
# Such malformed rows are dropped (with a warning) via on_bad_lines='warn'. This replaces the former
# 'error_bad_lines=False', which was deprecated in pandas 1.3 and removed in pandas 2.0.
followers_df = pd.read_csv('{}/followers.csv'.format(dir_name), on_bad_lines='warn')
following_df = pd.read_csv('{}/following.csv'.format(dir_name), on_bad_lines='warn')
feed_df = pd.read_csv('{}/feed.csv'.format(dir_name), on_bad_lines='warn')

## Analyzing the 'followers.csv' and 'following.csv' files

Let's take a look at the ``followers.csv`` and ``following.csv`` files structure:

In [None]:
# Preview the first three rows of the followers table.
followers_df.head(3)

In [None]:
# Preview the first three rows of the following table.
following_df.head(3)

### How many followers does the user have?

In [None]:
# One row per follower, so the row count is the follower count.
follower_count = followers_df.shape[0]
print('The user has {} followers'.format(follower_count))

### How many accounts don't follow the user back?

In [None]:
# Accounts the user follows that are not among the user's followers.
# dropna() discards missing usernames (NaN floats), which would otherwise be counted
# as an account and make ', '.join() fail.
following_usernames = set(following_df['username'].dropna())
follower_usernames = set(followers_df['username'].dropna())
no_follow_back = following_usernames - follower_usernames

print('{} account(s) do not follow the user back\n'.format(len(no_follow_back)))
# Sets iterate in arbitrary order; sort so the listing is stable between runs.
print('The no follow-back accounts are: \n\n{}'.format(
    ', '.join(sorted(str(username) for username in no_follow_back))))

## Analyzing the 'feed.csv' file

Let's take a look at the ``feed.csv`` file structure:

In [None]:
# Preview the first three rows of the feed table.
feed_df.head(3)

# Analysis of the user's top-10 posts based on likes

In [None]:
# Order the posts from most to least liked.
# NOTE: sort_values() keeps every row's original index label, so the most-liked posts
# have to be taken by position (e.g. head()/iloc), not by label.
feed_df = feed_df.sort_values('likes_count', ascending=False)

In [None]:
import datetime as dt
from collections import Counter
import matplotlib.pyplot as plt

def get_hashtags(text):
    '''
    Return a list of hashtags found in text

    The text is split on whitespace and every token that starts with '#' and has
    at least one more character is returned, in order of appearance and with
    duplicates kept. Non-string input (e.g. None, or the NaN that pandas uses for
    an empty field) yields an empty list instead of raising.
    '''
    if not isinstance(text, str):
        return []
    # A lone '#' is not a hashtag, hence the length check.
    return [token for token in text.split() if token.startswith('#') and len(token) > 1]

# Collect, for the 10 most-liked posts, the weekday and hour they were posted
# plus the hashtags and words used in their captions.
days = []
times = []
hashtags = []
words = []
# feed_df is sorted by likes, but sorting keeps the original index labels, so the
# top posts must be selected by position: head(10) does that (label-based access
# such as feed_df.T[idx] would return the first rows of the CSV instead).
# head() also copes with feeds that hold fewer than 10 posts.
for _, row in feed_df.head(10).iterrows():
    date = dt.datetime.fromtimestamp(row['taken_at'])
    days.append(date.strftime('%A'))
    times.append(date.strftime('%H'))

    # An empty caption is read by pandas as NaN (a float); treat it as no text.
    text = row['text'] if isinstance(row['text'], str) else ''
    hashtags.extend(get_hashtags(text))

    # TODO: Remove stop words
    # split() without an argument splits on any whitespace run, so repeated
    # spaces or newlines do not produce empty "words".
    words.extend(text.split())

In [None]:
def render_pie_chart(tuple_list, title):
    '''
    Show a pie chart with the given title, drawing one wedge per entry of tuple_list.

    Each entry is read as (label, size): element 0 names the wedge and element 1
    determines its share of the pie.
    '''
    wedge_labels = list(map(lambda entry: entry[0], tuple_list))
    wedge_sizes = list(map(lambda entry: entry[1], tuple_list))

    _, axes = plt.subplots()
    axes.pie(
        wedge_sizes,
        labels=wedge_labels,
        autopct='%1.1f%%',  # print each wedge's percentage with one decimal
        shadow=True,
        startangle=90,
    )
    axes.axis('equal')  # equal aspect ratio keeps the pie circular
    axes.set_title(title)
    plt.show()

In [None]:
# Keep the three most frequent values of each collected feature.
top_days, top_times, top_hashtags, top_words = [
    Counter(values).most_common(3) for values in (days, times, hashtags, words)
]

# Draw one pie chart per feature, in the same order as the rankings above.
charts = [
    (top_days, 'What days was the most liked content posted?'),
    (top_times, 'What time were the posts most frequently made? (0:00-24:00)'),
    (top_hashtags, 'What are the most popular hashtags?'),
    (top_words, 'What words were most frequently used?'),
]
for ranking, chart_title in charts:
    render_pie_chart(ranking, chart_title)

In [None]:
# Turn the rankings into a one-sentence posting recommendation:
# all top days, the single most common hour, and all top hashtags and words.
recommended_days = ', '.join([entry[0] for entry in top_days])
recommended_hour = top_times[0][0]
recommended_hashtags = ', '.join([entry[0] for entry in top_hashtags])
recommended_words = ', '.join([entry[0] for entry in top_words])

template = 'It is thus recommended to post on {} around {} o\'clock using the hash-tags \'{}\' and words \'{}\'.'
msg = template.format(recommended_days, recommended_hour, recommended_hashtags, recommended_words)
print(msg)