# Project : Medium Data Analytics Leaderboard System

### _Notebook: Data Visualization and Dashboard_
![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)

In [1]:
# import the libraries
import pandas as pd
import json
import neo4j
from pandas import DataFrame
from py2neo import Graph, Node,Relationship

from curses import COLOR_BLUE
from turtle import color
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from matplotlib.ticker import FormatStrFormatter

ModuleNotFoundError: No module named 'neo4j'

In [2]:
# Establish connection from Jupyter notebook to Neo4j database
# host: bolt://localhost:7687
# username: admin
# pass: ###

with open('Group-08_Neo4j_Credentials.json') as f:
    data = json.load(f)
    username = data['username']
    password = data['password']

try:
    graphDB = Graph("bolt://localhost:7687", auth=(username, password))
    print("Connection with Neo4j Established")
except Exception as e:
    print("Failed to connect the driver:", e)

FileNotFoundError: [Errno 2] No such file or directory: 'Group-08_Neo4j_Credentials.json'

In [3]:
# consolidated cypher query..
# -----------------------------------------------------------------

g1_CQL = '''MATCH (article:Article)-[:HAS]->(Image) 
            MATCH (article)-[:AWARDED]->(claps:Claps)
            RETURN Image.Boolean, sum(toInteger(claps.Count))
        '''
g1_df = pd.DataFrame(graphDB.run(g1_CQL))
g1_df.columns = ['Image', 'Total Claps']

# -----------------------------------------------------------------

g2_CQL = '''MATCH (article:Article)-[:CATEGORIZED_AS]->(category:Category)
            MATCH (article)-[:TAKES]->(readingTime:ReadingTime)
            RETURN category.Type, AVG(toInteger(readingTime.Minutes))
            ORDER BY AVG(toInteger(readingTime.Minutes)) DESC LIMIT 5
            UNION
            MATCH (article:Article)-[:CATEGORIZED_AS]->(category:Category)
            MATCH (article)-[:TAKES]->(readingTime:ReadingTime)
            RETURN category.Type, AVG(toInteger(readingTime.Minutes))
            ORDER BY AVG(toInteger(readingTime.Minutes)) ASC LIMIT 5
        '''
g2_df = pd.DataFrame(graphDB.run(g2_CQL))
g2_df.columns = ['Category', 'Average number of claps']

# -----------------------------------------------------------------

g3_CQL = '''MATCH (article:Article)-[:PUBLISHED_IN_YEAR]->(year:Year)
             MATCH (article)-[:AWARDED]->(claps:Claps)
             RETURN  year.Year , AVG(toInteger(claps.Count))
        '''
g3_df = pd.DataFrame(graphDB.run(g3_CQL))
g3_df.columns = ['Year', 'Average number of claps']

# -----------------------------------------------------------------

g4_CQL = '''MATCH (article:Article)-[:PUBLISHED_BY]->(publication:Publication) 
             MATCH (article)-[:AWARDED]->(claps:Claps)
             RETURN  publication.Name, AVG(toInteger(claps.Count))
             order by AVG(toInteger(claps.Count)) DESC LIMIT 10
        '''
g4_df = pd.DataFrame(graphDB.run(g4_CQL))
g4_df.columns = ['Publication Name', 'Average number of claps']

# -----------------------------------------------------------------

g5_CQL = '''MATCH (author:Author)-[:COMPOSED]->(article:Article)
             MATCH (article)-[:AWARDED]->(claps:Claps)
             RETURN author.Name, SUM(toInteger(claps.Count))
             ORDER BY SUM(toInteger(claps.Count)) DESC LIMIT 10
        '''
g5_df = pd.DataFrame(graphDB.run(g5_CQL))
g5_df.columns = ['Author Name', 'Total number of claps']

# -----------------------------------------------------------------

g6_CQL = '''MATCH (author:Author)-[:COMPOSED]->(article:Article)
             MATCH (article)-[:CATEGORIZED_AS]->(category:Category)
             RETURN category.Type, COUNT(author.Name)
             ORDER BY COUNT(author.Name) DESC LIMIT 10
        '''
g6_df = pd.DataFrame(graphDB.run(g6_CQL))
g6_df.columns = ['Category Name', 'Total number of authors associated']

# -----------------------------------------------------------------

g7_CQL = '''MATCH (article:Article)-[:PUBLISHED_BY]->(publication:Publication)
            WHERE publication.Name <> 'No Publication'
            RETURN publication.Name, COUNT(article.Title)
            ORDER BY COUNT(article.Title) DESC LIMIT 10
        '''
g7_df = pd.DataFrame(graphDB.run(g7_CQL))
g7_df.columns = ['Publications', 'Count of articles']

# -----------------------------------------------------------------

g8_CQL = '''MATCH (article:Article)-[:PUBLISHED_IN_YEAR]->(year:Year)
            MATCH (article)-[:PUBLISHED_IN_MONTH]->(month:Month)
            MATCH (article)-[:PUBLISHED_ON_DAY]->(day:Day)
            RETURN toInteger(year.Year) as Year,toInteger(month.Month) as Month, toInteger(day.Day) as Day
        '''
g8_df = pd.DataFrame(graphDB.run(g8_CQL))
g8_df.columns = ['Year', 'Month', 'Day']

# -----------------------------------------------------------------

g9_CQL = '''MATCH (article:Article)-[:AWARDED]->(claps:Claps)
            MATCH (article)-[:TAKES]->(readingTime:ReadingTime)
            RETURN article.Title, toInteger(readingTime.Minutes), toInteger(claps.Count)
        '''
g9_df = pd.DataFrame(graphDB.run(g9_CQL))
g9_df.columns = ['Title', 'reading_time_in_minutes', 'claps_count']

# -----------------------------------------------------------------

g10_CQL = '''MATCH (article:Article)-[:PUBLISHED_IN_YEAR]->(year:Year)
            MATCH (article)-[:PUBLISHED_IN_MONTH]->(month:Month)
            MATCH (article)-[:PUBLISHED_ON_DAY]->(day:Day)
            RETURN toInteger(year.Year) as Year,toInteger(month.Month) as Month, toInteger(day.Day) as Day
        '''
g10_df = pd.DataFrame(graphDB.run(g10_CQL))
g10_df.columns = ['Year', 'Month', 'Day']

g10_df['Date'] = pd.to_datetime(g10_df,format='%Y-%m-%d', errors='coerce')
g10_df['month'] = g10_df.Date.dt.month_name()
g10_df['quarter'] = g10_df.Date.dt.quarter
g10_df.quarter = g10_df.quarter.astype(str)
g10_df['quarter'] = 'Q' + g10_df.quarter.astype(str)
g10_df['quarter'] = g10_df['quarter'].replace('Q1.0','Q1')
g10_df['quarter'] = g10_df['quarter'].replace('Q2.0','Q2')
g10_df['quarter'] = g10_df['quarter'].replace('Q3.0','Q3')
g10_df['quarter'] = g10_df['quarter'].replace('Q4.0','Q4')
g10_df = g10_df[['Year','quarter']]

# get count based on Year and quarter groups
g10_df_new = g10_df.groupby(['Year','quarter']).agg(count=("quarter", 'count'))
g10_df_new = g10_df_new.reset_index()
g10_df_new = g10_df_new[g10_df_new['quarter'] != 'Qnan']

NameError: name 'graphDB' is not defined

In [4]:
# graph 1
def graph1(g1_df):
    plt.figure(figsize=(10,7))
    g = sns.barplot(x='Image', y='Total Claps', data=g1_df, palette="rocket")
    g.set(xlabel='Image Added to Article', ylabel='Total Number of Claps', title='Total Claps vs Image')
    plt.ticklabel_format(style='plain', axis='y')
    plt.show()

graph1(g1_df)

NameError: name 'g1_df' is not defined

- <font color='Maroon'> Images make the articles engaging and attractive and hence they are necessary part of the blog,
well atleast the cover image because it can decide wether user will click the link and look at the blog or not. 
And due to this using the right image extension is also necessary because it will affect the performance of the blog. Because if the webpage takes a long time to load because of it's resources i.e image in this case, people are just going to move on.
</font>

In [5]:
#graph 2:
def graph2(g2_df):
    plt.figure(figsize=(14,7))
    g = sns.barplot(x='Category', y='Average number of claps', data=g2_df, palette="rocket")
    g.set(xlabel='Category of Article', ylabel='Average number of claps', title='Average number of claps per category')
    plt.ticklabel_format(style='plain', axis='y')
    plt.xticks(rotation=45)

graph2(g2_df)

NameError: name 'g2_df' is not defined

In [6]:
# graph 3
def graph3(g3_df):
    plt.figure(figsize=(11,7))
    g = sns.barplot(x='Year', y='Average number of claps', data=g3_df, palette="rocket")
    g.set(xlabel='Year', ylabel='Average number of claps', title='Average number of claps per Year')
    plt.ticklabel_format(style='plain', axis='y')

graph3(g3_df)

NameError: name 'g3_df' is not defined

In [7]:
# graph 4
def graph4(g4_df):
    plt.figure(figsize=(14,7))
    g = sns.barplot(x='Publication Name', y='Average number of claps', data=g4_df, palette="rocket")
    g.set(xlabel='Publication Name', ylabel='Average number of claps', title='Average number of claps per Publication')
    plt.ticklabel_format(style='plain', axis='y')
    plt.xticks(rotation=45)

graph4(g4_df)

NameError: name 'g4_df' is not defined

In [8]:
# graph 5
def graph5(g5_df):
    plt.figure(figsize=(14,7))
    g = sns.barplot(x='Author Name', y='Total number of claps', data=g5_df, palette="rocket")
    g.set(xlabel='Author Name', ylabel='Total number of claps', title='Total number of claps vs Author')
    plt.ticklabel_format(style='plain', axis='y')
    plt.xticks(rotation=45)

graph5(g5_df)

NameError: name 'g5_df' is not defined

In [9]:
# graph 6
def graph6(g6_df):
    plt.rcParams["figure.figsize"] = (12,6)
    g6_df.plot.bar(x='Category Name', y='Total number of authors associated', 
                    title='Total number of authors associated vs Category',
                    color='#009999')
    plt.xticks(rotation=45)

graph6(g6_df)

NameError: name 'g6_df' is not defined

In [10]:
# graph 7
plt.figure(figsize=(8, 8))
colors= ['#FF0000','#0000FF','#FFFF00','#ADFF2F','#FFA500','#FFEFD5','#556B2F','#990066','#660099','#6699CC']
explode= (0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05)
plt.pie(g7_df['Count of articles'], colors=colors, labels=g7_df['Publications'], autopct='%1.1f%%', pctdistance=0.85, explode=explode)
centre_circle= plt.Circle((0,0),0.40, fc='white')
fig= plt.gcf()
fig.gca().add_artist(centre_circle)
plt.title('Total number of articles by publication')
plt.show()

NameError: name 'plt' is not defined

In [11]:
# graph 8
g8_df['Date'] = pd.to_datetime(g8_df,format='%Y-%m-%d', errors='coerce')
articles_df = g8_df.Date.value_counts().rename_axis('dates').reset_index(name='counts')
articles_df['month'] = articles_df.dates.dt.month_name()
articles_df['day_of_week'] = articles_df.dates.dt.day_name()
rest_data = articles_df.pivot_table(index='month', columns='day_of_week', values='counts',  aggfunc='sum', fill_value=0)
rest_data = rest_data[['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']]
plt.figure(figsize=(12,6))
plt.title("When were the blogs posted?")
sns.heatmap(rest_data, cmap="Greens", linewidths=.5)
plt.show()

NameError: name 'g8_df' is not defined

Now that we understood what is the best day to post an article, let's try to get understand how long should the article be.

We are going to plot the graph with number of claps vs reading time of the article. Assumtion is that a *person will give a clap to article only when he reads it completly and finds it useful/entertaining*.

In [12]:
# graph 9
def graph9(g9_df):
    plt.figure(figsize=(13,7))
    plt.xlabel("Reading time in Minutes")
    plt.ylabel("Number of Claps")
    plt.tight_layout()
    plt.title("Number of Claps vs Reading time of the article")
    sns.scatterplot(x=g9_df.reading_time_in_minutes,y=g9_df.claps_count)

graph9(g9_df)

NameError: name 'g9_df' is not defined

- <font color='Maroon'> Yay, we got our graph!! From the above scatterplots we can say that the articles with reading time of 5 to 10 minutes have huge number of claps. So we can say that ideal reading time for an article should be 5 - 10 minutes
    Along with the claps the comments/responses are also the important factors in the article writing because they help us to understand quality of blog i.e is it well-written / it's plagarised / it is helpful / conversation starter, etc.
    We can also do further sentimental analysis on it to understand if it's good or bad....
</font>

In [13]:
# graph 10
def graph10(g10_df):    
    plt.gcf().set_size_inches(14, 8)
#     p = sns.catplot(x="quarter", y="count", hue="Year",kind="bar", data=g10_df_new, height=5, aspect=2).set(title='QOQ Analysis of Published Articles')
    p = sns.barplot(x="quarter",y="count",hue="Year",data=g10_df_new)
    p.set(xlabel = 'Quarter', ylabel = 'Count of articles published', title='QOQ Analysis of Published Articles')
    
graph10(g10_df)

NameError: name 'g10_df' is not defined

# DASHBOARD

In [14]:
import patchworklib as pw

ModuleNotFoundError: No module named 'patchworklib'

In [15]:
ax100 = pw.Brick("ax100",figsize=(5,7))
sns.barplot(x='Image', y='Total Claps', data=g1_df, palette="rocket", ax=ax100)
ax100.set_title("'Total Claps vs Image'")

NameError: name 'pw' is not defined

In [16]:
ax200 = pw.Brick("ax200",figsize=(10,5))
sns.barplot(x='Category', y='Average number of claps', data=g2_df, palette="rocket", ax=ax200)
ax200.set_title("'Average number of claps per category'")

NameError: name 'pw' is not defined

In [17]:
ax300 = pw.Brick("ax300",figsize=(5,7))
sns.barplot(x='Year', y='Average number of claps', data=g3_df, palette="rocket", ax=ax300)
ax300.set_title("'Average number of claps per Year'")

NameError: name 'pw' is not defined

In [18]:
ax400 = pw.Brick("ax400",figsize=(10,5))
sns.barplot(x='Publication Name', y='Average number of claps', data=g4_df, palette="rocket", ax=ax400)
ax400.set_title("'Average number of claps per Publication'")

NameError: name 'pw' is not defined

In [19]:
ax500 = pw.Brick("ax500",figsize=(10,5))
sns.barplot(x='Author Name', y='Total number of claps', data=g5_df, palette="rocket", ax=ax500)
ax500.set_title("'Total number of claps vs Author'")

NameError: name 'pw' is not defined

In [20]:
ax600 = pw.Brick("ax600",figsize=(10,5))
g6_df.plot.bar(x='Category Name', y='Total number of authors associated', 
                    title='Total number of authors associated vs Category',
                    color='#009999', ax=ax600)
ax600.set_title("'Total number of authors associated vs Category'")

NameError: name 'pw' is not defined

In [21]:
ax700 = pw.Brick("ax700",figsize=(10,5))
ax700.pie(g7_df['Count of articles'], colors=colors, labels=g7_df['Publications'], autopct='%1.1f%%', pctdistance=0.85, 
        explode=explode)
ax700.set_title("'Total number of articles by publication'")

NameError: name 'pw' is not defined

In [22]:
ax800 = pw.Brick("ax800",figsize=(10,5))
sns.heatmap(rest_data, cmap="Greens", linewidths=.5, ax=ax800)
ax800.set_title("'When were the blogs posted?'")

NameError: name 'pw' is not defined

In [23]:
ax900 = pw.Brick("ax900",figsize=(10,5))
sns.scatterplot(x=g9_df.reading_time_in_minutes,y=g9_df.claps_count, ax=ax900)
ax900.set_title("'Number of Claps vs Reading time of the article'")

NameError: name 'pw' is not defined

In [24]:
ax1000 = pw.Brick("ax1000",figsize=(10,5))
sns.barplot(x="quarter",y="count",hue="Year",data=g10_df_new, ax=ax1000)
ax1000.set_title("'QOQ Analysis of Published Articles'")

NameError: name 'pw' is not defined

In [25]:
ax_f = (ax100 | ax200) / (ax300 | ax400) / (ax500 | ax600) / (ax700 | ax800) / (ax900 | ax1000)
ax_f.savefig()

NameError: name 'ax100' is not defined